mesa/src/compiler/nir/nir_schedule.c

/*
* Copyright © 2019 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"
/** @file
*
* Implements basic-block-level prepass instruction scheduling in NIR to
* manage register pressure.
*
* This is based on the Goodman/Hsu paper (1988, cached copy at
* https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
* make up the DDG for NIR (which can be mostly done using the NIR def/use
* chains for SSA instructions, plus some edges for ordering register writes
* vs reads, and some more for ordering intrinsics). Then we pick heads off
* of the DDG using their heuristic to emit the NIR instructions back into the
* block in their new order.
*
* The hard case for prepass scheduling on GPUs seems to always be consuming
* texture/ubo results. The register pressure heuristic doesn't want to pick
* an instr that starts consuming texture results because it usually won't be
* the only usage, so that instruction will increase pressure.
*
* If you try to force consumption of tex results always, then in a case where
* a single sample is used for many outputs, you'll end up picking every other
* user and expanding register pressure. The partially_evaluated_path flag
* helps tremendously, in that if you happen for whatever reason to pick a
* texture sample's output, then you'll try to finish off that sample. Future
* work may include doing some local search before locking in a choice, to try
* to more reliably find the case where just a few choices going against the
* heuristic can manage to free the whole vector.
*/
static bool debug;
/**
* Represents a node in the DDG for a NIR instruction.
*/
typedef struct {
struct dag_node dag; /* must be first for our u_dynarray_foreach */
nir_instr *instr;
bool partially_evaluated_path;
/* Approximate estimate of the delay between starting this instruction and
* its results being available.
*
* Accuracy is not too important, given that we're prepass scheduling here
* and just trying to reduce excess dependencies introduced by a register
* allocator by stretching out the live intervals of expensive
* instructions.
*/
uint32_t delay;
/* Cost of the maximum-delay path from this node to the leaves. */
uint32_t max_delay;
/* scoreboard->time value when this instruction can be scheduled without
* any stalls expected.
*/
uint32_t ready_time;
} nir_schedule_node;
typedef struct {
struct dag *dag;
nir_shader *shader;
/* Mapping from nir_register * or nir_ssa_def * to a struct set of
* instructions remaining to be scheduled using the register.
*/
struct hash_table *remaining_uses;
/* Map from nir_instr to nir_schedule_node * */
struct hash_table *instr_map;
/* Set of nir_register * or nir_ssa_def * that have had any instruction
* scheduled on them.
*/
struct set *live_values;
/* An abstract approximation of the number of nir_schedule_node->delay
* units since the start of the shader.
*/
uint32_t time;
/* Number of channels currently used by the NIR instructions that have been
* scheduled.
*/
int pressure;
/* Options specified by the backend */
const nir_schedule_options *options;
} nir_schedule_scoreboard;
/* When walking the instructions in reverse, we use this flag to swap
* before/after in add_dep().
*/
enum direction { F, R };
struct nir_schedule_class_dep {
int klass;
nir_schedule_node *node;
struct nir_schedule_class_dep *next;
};
typedef struct {
nir_schedule_scoreboard *scoreboard;
/* Map from nir_register to nir_schedule_node * */
struct hash_table *reg_map;
/* Scheduler nodes for the last instruction involved in each class of dependency.
*/
nir_schedule_node *load_input;
nir_schedule_node *store_shared;
nir_schedule_node *unknown_intrinsic;
nir_schedule_node *discard;
nir_schedule_node *jump;
struct nir_schedule_class_dep *class_deps;
enum direction dir;
} nir_deps_state;
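/* Convenience helper: returns the data stored for @key in @ht, or NULL if
* the key is not present.
*/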
static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
struct hash_entry *entry = _mesa_hash_table_search(ht, key);
if (!entry)
return NULL;
return entry->data;
}
static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
return _mesa_hash_table_search_data(instr_map, instr);
}
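/* Returns the set of not-yet-scheduled instructions that read the value
* consumed by @src (keyed by its SSA def or register).
*/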
static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
if (src->is_ssa) {
return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
} else {
return _mesa_hash_table_search_data(scoreboard->remaining_uses,
src->reg.reg);
}
}
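/* Pressure cost of a value, counted in channels, matching the scalar
* register model described in the nir_schedule() comment at the bottom of
* this file.
*/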
static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
return def->num_components;
}
static int
nir_schedule_src_pressure(nir_src *src)
{
if (src->is_ssa)
return nir_schedule_def_pressure(src->ssa);
else
return src->reg.reg->num_components;
}
static int
nir_schedule_dest_pressure(nir_dest *dest)
{
if (dest->is_ssa)
return nir_schedule_def_pressure(&dest->ssa);
else
return dest->reg.reg->num_components;
}
/**
* Adds a dependency such that @after must appear in the final program after
* @before.
*
* We add @before as a child of @after, so that DAG heads are the outputs of
* the program and we make our scheduling decisions bottom to top.
*/
static void
add_dep(nir_deps_state *state,
nir_schedule_node *before,
nir_schedule_node *after)
{
if (!before || !after)
return;
assert(before != after);
if (state->dir == F)
dag_add_edge(&before->dag, &after->dag, 0);
else
dag_add_edge(&after->dag, &before->dag, 0);
}
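/* A read dependency only orders @after against the tracked node; a write
* dependency additionally makes @after the new tracked node, so that later
* reads and writes get ordered against it.
*/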
static void
add_read_dep(nir_deps_state *state,
nir_schedule_node *before,
nir_schedule_node *after)
{
add_dep(state, before, after);
}
static void
add_write_dep(nir_deps_state *state,
nir_schedule_node **before,
nir_schedule_node *after)
{
add_dep(state, *before, after);
*before = after;
}
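/* nir_foreach_src callback: adds a dependency from the most recently
* recorded writer of a register (if any) to this read. SSA sources are
* handled in nir_schedule_ssa_deps() instead.
*/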
static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
nir_deps_state *state = in_state;
if (src->is_ssa)
return true;
struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
src->reg.reg);
if (!entry)
return true;
nir_schedule_node *dst_n = entry->data;
nir_schedule_node *src_n =
nir_schedule_get_node(state->scoreboard->instr_map,
src->parent_instr);
add_dep(state, dst_n, src_n);
return true;
}
static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
nir_deps_state *state = in_state;
if (dest->is_ssa)
return true;
nir_schedule_node *dest_n =
nir_schedule_get_node(state->scoreboard->instr_map,
dest->reg.parent_instr);
struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
dest->reg.reg);
if (!entry) {
_mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
return true;
}
nir_schedule_node **before = (nir_schedule_node **)&entry->data;
add_write_dep(state, before, dest_n);
return true;
}
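/* nir_foreach_ssa_def callback: makes every use of an SSA def depend on the
* def. Only needed in the forward pass (see nir_schedule_calculate_deps()).
*/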
static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
nir_deps_state *state = in_state;
struct hash_table *instr_map = state->scoreboard->instr_map;
nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
nir_foreach_use(src, def) {
nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
src->parent_instr);
add_read_dep(state, def_n, use_n);
}
return true;
}
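/* Returns the tracking entry for a backend-defined dependency class
* reported by options->intrinsic_cb, allocating one the first time the
* class is seen.
*/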
static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
int klass)
{
for (struct nir_schedule_class_dep *class_dep = state->class_deps;
class_dep != NULL;
class_dep = class_dep->next) {
if (class_dep->klass == klass)
return class_dep;
}
struct nir_schedule_class_dep *class_dep =
ralloc(state->reg_map, struct nir_schedule_class_dep);
class_dep->klass = klass;
class_dep->node = NULL;
class_dep->next = state->class_deps;
state->class_deps = class_dep;
return class_dep;
}
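/* Adds ordering dependencies for an intrinsic: any backend-defined class
* dependency from options->intrinsic_cb first, then the built-in categories
* (discards, input loads/output stores, shared memory, barriers, and a
* catch-all for unknown intrinsics).
*/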
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
nir_intrinsic_instr *instr)
{
nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
&instr->instr);
const nir_schedule_options *options = state->scoreboard->options;
nir_schedule_dependency dep;
if (options->intrinsic_cb &&
options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
struct nir_schedule_class_dep *class_dep =
nir_schedule_get_class_dep(state, dep.klass);
switch (dep.type) {
case NIR_SCHEDULE_READ_DEPENDENCY:
add_read_dep(state, class_dep->node, n);
break;
case NIR_SCHEDULE_WRITE_DEPENDENCY:
add_write_dep(state, &class_dep->node, n);
break;
}
}
switch (instr->intrinsic) {
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_front_face:
break;
case nir_intrinsic_discard:
case nir_intrinsic_discard_if:
case nir_intrinsic_demote:
case nir_intrinsic_demote_if:
case nir_intrinsic_terminate:
case nir_intrinsic_terminate_if:
/* We are adding two dependencies:
*
* * An individual one that we could use to add a read_dep while handling
* nir_instr_type_tex
*
* * Include it on the unknown intrinsic set, as we want discard to be
* serialized in the same order relative to intervening stores or
* atomic accesses to SSBOs and images
*/
add_write_dep(state, &state->discard, n);
add_write_dep(state, &state->unknown_intrinsic, n);
break;
case nir_intrinsic_store_output:
/* For some hardware and stages, output stores affect the same shared
* memory as input loads.
*/
if ((state->scoreboard->options->stages_with_shared_io_memory &
(1 << state->scoreboard->shader->info.stage)))
add_write_dep(state, &state->load_input, n);
/* Make sure that preceding discards stay before the store_output */
add_read_dep(state, state->discard, n);
break;
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input:
add_read_dep(state, state->load_input, n);
break;
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared2_amd:
/* Don't move a load_shared beyond a following store_shared, as that could
* change the loaded value
*/
add_read_dep(state, state->store_shared, n);
break;
case nir_intrinsic_store_shared:
case nir_intrinsic_store_shared2_amd:
add_write_dep(state, &state->store_shared, n);
break;
case nir_intrinsic_scoped_barrier: {
const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
if (modes & nir_var_mem_shared)
add_write_dep(state, &state->store_shared, n);
/* Serialize against other categories. */
add_write_dep(state, &state->unknown_intrinsic, n);
break;
}
default:
/* Attempt to handle other intrinsics that we haven't individually
* categorized by serializing them in the same order relative to each
* other.
*/
add_write_dep(state, &state->unknown_intrinsic, n);
break;
}
}
/**
* Common code for dependencies that need to be tracked both forward and
* backward.
*
* This is for things like "all reads of r4 have to happen between the r4
* writes that surround them".
*/
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
nir_instr *instr = n->instr;
/* For NIR SSA defs, we only need to do a single pass of making the uses
* depend on the def.
*/
if (state->dir == F)
nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
/* For NIR regs, track the last writer in the scheduler state so that we
* can keep the writes in order and let reads get reordered only between
* each write.
*/
nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
/* Make sure any other instructions keep their positions relative to
* jumps.
*/
if (instr->type != nir_instr_type_jump)
add_read_dep(state, state->jump, n);
switch (instr->type) {
case nir_instr_type_ssa_undef:
case nir_instr_type_load_const:
case nir_instr_type_alu:
case nir_instr_type_deref:
break;
case nir_instr_type_tex:
/* Don't move texture ops before a discard, as that could increase
* memory bandwidth for reading the discarded samples.
*/
add_read_dep(state, state->discard, n);
break;
case nir_instr_type_jump:
add_write_dep(state, &state->jump, n);
break;
case nir_instr_type_call:
unreachable("Calls should have been lowered");
break;
case nir_instr_type_parallel_copy:
unreachable("Parallel copies should have been lowered");
break;
case nir_instr_type_phi:
unreachable("nir_schedule() should be called after lowering from SSA");
break;
case nir_instr_type_intrinsic:
nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
break;
}
}
static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
nir_deps_state state = {
.scoreboard = scoreboard,
.dir = F,
.reg_map = _mesa_pointer_hash_table_create(NULL),
};
nir_foreach_instr(instr, block) {
nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
instr);
nir_schedule_calculate_deps(&state, node);
}
ralloc_free(state.reg_map);
}
static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
nir_deps_state state = {
.scoreboard = scoreboard,
.dir = R,
.reg_map = _mesa_pointer_hash_table_create(NULL),
};
nir_foreach_instr_reverse(instr, block) {
nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
instr);
nir_schedule_calculate_deps(&state, node);
}
ralloc_free(state.reg_map);
}
typedef struct {
nir_schedule_scoreboard *scoreboard;
int regs_freed;
} nir_schedule_regs_freed_state;
static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
nir_schedule_regs_freed_state *state = in_state;
nir_schedule_scoreboard *scoreboard = state->scoreboard;
struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
if (remaining_uses->entries == 1 &&
_mesa_set_search(remaining_uses, src->parent_instr)) {
state->regs_freed += nir_schedule_src_pressure(src);
}
return true;
}
static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
nir_schedule_regs_freed_state *state = in_state;
state->regs_freed -= nir_schedule_def_pressure(def);
return true;
}
static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
nir_schedule_regs_freed_state *state = in_state;
nir_schedule_scoreboard *scoreboard = state->scoreboard;
if (dest->is_ssa)
return true;
nir_register *reg = dest->reg.reg;
/* Only the first def of a reg counts against register pressure. */
if (!_mesa_set_search(scoreboard->live_values, reg))
state->regs_freed -= nir_schedule_dest_pressure(dest);
return true;
}
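/**
* Returns the net number of channels that would stop being live if @n were
* scheduled now: sources whose only remaining use is @n are freed, while
* @n's SSA defs (and the first write of a register) add pressure. A
* positive value means scheduling @n reduces pressure.
*/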
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
nir_schedule_regs_freed_state state = {
.scoreboard = scoreboard,
};
nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
return state.regs_freed;
}
/**
* Chooses an instruction that will minimise the register pressure as much as
* possible. This should only be used as a fallback when the regular scheduling
* generates a shader whose register allocation fails.
*/
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
nir_schedule_node *chosen = NULL;
/* Find the leader in the ready (shouldn't-stall) set with the minimum
* cost.
*/
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (scoreboard->time < n->ready_time)
continue;
if (!chosen || chosen->max_delay > n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (ready fallback): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Otherwise, choose the leader with the minimum cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (!chosen || chosen->max_delay > n->max_delay)
chosen = n;
}
if (debug) {
fprintf(stderr, "chose (leader fallback): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/**
* Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
* Scheduling for Parallelism) heuristic.
*
* Picks an instruction on the critical path that's ready to execute without
* stalls, if possible, otherwise picks the instruction on the critical path.
*/
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
nir_schedule_node *chosen = NULL;
/* Find the leader in the ready (shouldn't-stall) set with the maximum
* cost.
*/
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (scoreboard->time < n->ready_time)
continue;
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (ready): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Otherwise, choose the leader with the maximum cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (debug) {
fprintf(stderr, "chose (leader): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/**
* Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
* Scheduling for Register pressure) heuristic.
*/
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
nir_schedule_node *chosen = NULL;
/* Find a ready inst with regs freed and pick the one with max cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (n->ready_time > scoreboard->time)
continue;
int regs_freed = nir_schedule_regs_freed(scoreboard, n);
if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
chosen = n;
}
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (freed+ready): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Find a leader with regs freed and pick the one with max cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
int regs_freed = nir_schedule_regs_freed(scoreboard, n);
if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
chosen = n;
}
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (regs freed): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Find a partially evaluated path and try to finish it off */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (n->partially_evaluated_path &&
(!chosen || chosen->max_delay < n->max_delay)) {
chosen = n;
}
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (partial path): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Contra the paper, pick a leader with no effect on used regs. This may
* open up new opportunities, as otherwise a single-operand instr consuming
* a value will tend to block ever freeing that value. This had a
* massive effect on reducing spilling on V3D.
*
* XXX: Should this prioritize ready?
*/
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (nir_schedule_regs_freed(scoreboard, n) != 0)
continue;
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (regs no-op): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Pick the max delay of the remaining ready set. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (n->ready_time > scoreboard->time)
continue;
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (ready max delay): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Pick the max delay of the remaining leaders. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (debug) {
fprintf(stderr, "chose (max delay): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
fprintf(stderr, "maxdel %5d ", n->max_delay);
nir_print_instr(n->instr, stderr);
fprintf(stderr, "\n");
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
nir_schedule_node *child = (nir_schedule_node *)edge->child;
fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
nir_print_instr(child->instr, stderr);
fprintf(stderr, "\n");
}
}
}
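/**
* Updates liveness tracking for one value (SSA def or register) of a
* just-scheduled instruction: the value becomes live on its first scheduled
* use, and pressure drops again once its last remaining use is scheduled.
*/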
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
void *reg_or_def,
nir_instr *reg_or_def_parent,
int pressure)
{
/* Make the value live if it's the first time it's been used. */
if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
_mesa_set_add(scoreboard->live_values, reg_or_def);
scoreboard->pressure += pressure;
}
/* Make the value dead if it's the last remaining use. Be careful when one
* instruction uses a value twice to not decrement pressure twice.
*/
struct set *remaining_uses =
_mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
if (entry) {
_mesa_set_remove(remaining_uses, entry);
if (remaining_uses->entries == 0)
scoreboard->pressure -= pressure;
}
}
static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
struct set_entry *entry = _mesa_set_search(remaining_uses,
src->parent_instr);
if (entry) {
/* Once we've used an SSA value in one instruction, bump the priority of
* the other uses so the SSA value can get fully consumed.
*
* We don't do this for registers, as it would be a hassle and it's
* unclear if that would help or not. Also, skip it for constants, as
* they're often folded as immediates into backend instructions and have
* many unrelated instructions all referencing the same value (0).
*/
if (src->is_ssa &&
src->ssa->parent_instr->type != nir_instr_type_load_const) {
nir_foreach_use(other_src, src->ssa) {
if (other_src->parent_instr == src->parent_instr)
continue;
nir_schedule_node *n =
nir_schedule_get_node(scoreboard->instr_map,
other_src->parent_instr);
if (n && !n->partially_evaluated_path) {
if (debug) {
fprintf(stderr, " New partially evaluated path: ");
nir_print_instr(n->instr, stderr);
fprintf(stderr, "\n");
}
n->partially_evaluated_path = true;
}
}
}
}
nir_schedule_mark_use(scoreboard,
src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
src->parent_instr,
nir_schedule_src_pressure(src));
return true;
}
static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
nir_schedule_mark_use(scoreboard, def, def->parent_instr,
nir_schedule_def_pressure(def));
return true;
}
static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
/* SSA defs were handled in nir_schedule_mark_def_scheduled()
*/
if (dest->is_ssa)
return true;
/* XXX: This is not actually accurate for regs -- the last use of a reg may
* have a live interval that extends across control flow. We should
* calculate the live ranges of regs, and have scheduler nodes for the CF
* nodes that also "use" the reg.
*/
nir_schedule_mark_use(scoreboard, dest->reg.reg,
dest->reg.parent_instr,
nir_schedule_dest_pressure(dest));
return true;
}
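/**
* Commits @n as scheduled: updates pressure for its sources and defs,
* propagates ready_time to its DAG children based on @n's delay, prunes it
* from the DAG (possibly promoting children to heads), and advances the
* scoreboard clock.
*/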
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
nir_schedule_node *n)
{
nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
nir_schedule_node *child = (nir_schedule_node *)edge->child;
child->ready_time = MAX2(child->ready_time,
scoreboard->time + n->delay);
if (child->dag.parent_count == 1) {
if (debug) {
fprintf(stderr, " New DAG head: ");
nir_print_instr(child->instr, stderr);
fprintf(stderr, "\n");
}
}
}
dag_prune_head(scoreboard->dag, &n->dag);
scoreboard->time = MAX2(n->ready_time, scoreboard->time);
scoreboard->time++;
}
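/**
* Main scheduling loop for a block: repeatedly picks a DAG head using the
* heuristic selected by the options and the current pressure, then re-emits
* the chosen instruction at the tail of the block's instruction list.
*/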
static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
while (!list_is_empty(&scoreboard->dag->heads)) {
if (debug) {
fprintf(stderr, "current list:\n");
dump_state(scoreboard);
}
nir_schedule_node *chosen;
if (scoreboard->options->fallback)
chosen = nir_schedule_choose_instruction_fallback(scoreboard);
else if (scoreboard->pressure < scoreboard->options->threshold)
chosen = nir_schedule_choose_instruction_csp(scoreboard);
else
chosen = nir_schedule_choose_instruction_csr(scoreboard);
/* Now that we've scheduled a new instruction, some of its children may
* be promoted to the list of instructions ready to be scheduled.
*/
nir_schedule_mark_node_scheduled(scoreboard, chosen);
/* Move the instruction to the end (so our first chosen instructions are
* the start of the program).
*/
exec_node_remove(&chosen->instr->node);
exec_list_push_tail(&block->instr_list, &chosen->instr->node);
if (debug)
fprintf(stderr, "\n");
}
}
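/**
* Returns the estimated latency of an instruction in abstract delay units.
* A backend-provided instr_delay_cb takes precedence; otherwise memory loads
* (UBO/SSBO/scratch/shared/image) and texture ops get large values so their
* results are consumed as late as possible.
*/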
static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
if (scoreboard->options->instr_delay_cb) {
void *cb_data = scoreboard->options->instr_delay_cb_data;
return scoreboard->options->instr_delay_cb(instr, cb_data);
}
switch (instr->type) {
case nir_instr_type_ssa_undef:
case nir_instr_type_load_const:
case nir_instr_type_alu:
case nir_instr_type_deref:
case nir_instr_type_jump:
case nir_instr_type_parallel_copy:
case nir_instr_type_call:
case nir_instr_type_phi:
return 1;
case nir_instr_type_intrinsic:
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_shared:
case nir_intrinsic_image_load:
return 50;
default:
return 1;
}
break;
case nir_instr_type_tex:
/* Pick some large number to try to fetch textures early and sample them
* late.
*/
return 100;
}
return 0;
}
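/* dag_traverse_bottom_up() callback: computes the cost of the maximum-delay
* path from each node to the DAG leaves as the node's own delay plus the
* largest max_delay among its children.
*/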
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
nir_schedule_node *n = (nir_schedule_node *)node;
uint32_t max_delay = 0;
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
nir_schedule_node *child = (nir_schedule_node *)edge->child;
max_delay = MAX2(child->max_delay, max_delay);
}
n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
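/**
* Schedules a single basic block: creates a DAG node per instruction,
* computes forward and reverse dependencies plus per-node critical-path
* delays, and then runs the scheduling loop.
*/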
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
void *mem_ctx = ralloc_context(NULL);
scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
scoreboard->dag = dag_create(mem_ctx);
nir_foreach_instr(instr, block) {
nir_schedule_node *n =
rzalloc(mem_ctx, nir_schedule_node);
n->instr = instr;
n->delay = nir_schedule_get_delay(scoreboard, instr);
dag_init_node(scoreboard->dag, &n->dag);
_mesa_hash_table_insert(scoreboard->instr_map, instr, n);
}
calculate_forward_deps(scoreboard, block);
calculate_reverse_deps(scoreboard, block);
dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
nir_schedule_instructions(scoreboard, block);
ralloc_free(mem_ctx);
scoreboard->instr_map = NULL;
}
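/* Seeds the remaining-uses set for an SSA def with its defining instruction
* and every instruction that reads it.
*/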
static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
struct set *def_uses = _mesa_pointer_set_create(scoreboard);
_mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
_mesa_set_add(def_uses, def->parent_instr);
nir_foreach_use(src, def) {
_mesa_set_add(def_uses, src->parent_instr);
}
/* XXX: Handle if uses */
return true;
}
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
const nir_schedule_options *options)
{
nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
scoreboard->shader = shader;
scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
scoreboard->options = options;
scoreboard->pressure = 0;
nir_foreach_function(function, shader) {
nir_foreach_register(reg, &function->impl->registers) {
struct set *register_uses =
_mesa_pointer_set_create(scoreboard);
_mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
nir_foreach_use(src, reg) {
_mesa_set_add(register_uses, src->parent_instr);
}
/* XXX: Handle if uses */
nir_foreach_def(dest, reg) {
_mesa_set_add(register_uses, dest->reg.parent_instr);
}
}
nir_foreach_block(block, function->impl) {
nir_foreach_instr(instr, block) {
nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
scoreboard);
}
/* XXX: We're ignoring if uses, which may prioritize scheduling other
* uses of the if src even when it doesn't help. That's not many
* values, though, so meh.
*/
}
}
return scoreboard;
}
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
return;
#endif
bool any_uses = false;
hash_table_foreach(scoreboard->remaining_uses, entry) {
struct set *remaining_uses = entry->data;
set_foreach(remaining_uses, instr_entry) {
if (!any_uses) {
fprintf(stderr, "Tracked uses remain after scheduling. "
"Affected instructions: \n");
any_uses = true;
}
nir_print_instr(instr_entry->key, stderr);
fprintf(stderr, "\n");
}
}
assert(!any_uses);
}
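/* Minimal usage sketch (illustrative only: the "available_regs" and
* "payload_regs" names below are made up and the values would be chosen per
* backend):
*
*    nir_schedule_options options = {
*       .threshold = available_regs - payload_regs,
*    };
*    nir_schedule(shader, &options);
*
* Backends can additionally set intrinsic_cb/intrinsic_cb_data for extra
* ordering classes, instr_delay_cb/instr_delay_cb_data for latency
* estimates, and fallback to request the minimum-pressure heuristic after a
* failed register allocation.
*/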
/**
* Schedules the NIR instructions to try to decrease stalls (for example,
* delaying texture reads) while managing register pressure.
*
* The threshold represents "number of NIR register/SSA def channels live
* before switching the scheduling heuristic to reduce register pressure",
* since most of our GPU architectures are scalar (extending to vector with a
* flag wouldn't be hard). This number should be a bit below the number of
* registers available (counting any that may be occupied by system value
* payload values, for example), since the heuristic may not always be able to
* free a register immediately. The amount below the limit is up to you to
* tune.
*/
void
nir_schedule(nir_shader *shader,
const nir_schedule_options *options)
{
nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
options);
if (debug) {
fprintf(stderr, "NIR shader before scheduling:\n");
nir_print_shader(shader, stderr);
}
nir_foreach_function(function, shader) {
if (!function->impl)
continue;
nir_foreach_block(block, function->impl) {
nir_schedule_block(scoreboard, block);
}
}
nir_schedule_validate_uses(scoreboard);
ralloc_free(scoreboard);
}