/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics). Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results. The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure. The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample. Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register or SSA def.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;
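
/* Note that scoreboard->pressure is compared against options->threshold in
 * nir_schedule_instructions() below to pick between the parallelism-oriented
 * (CSP) and register-pressure-oriented (CSR) heuristics.
 */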

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for last instruction involved in some class of dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}


static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while handling
       *   nir_instr_type_tex
       *
       * * Include it on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move load_shared beyond a following store_shared, as it could
       * change their value
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_scoped_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}
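
/* A hypothetical example (not taken from a real shader): an ALU instruction
 * whose two 1-component sources are each at their last remaining use and
 * which writes a new 1-component SSA def reports 1 + 1 - 1 = 1 channel freed
 * by scheduling it.
 */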

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs. This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block us from freeing that value. This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use. Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear if that would help or not. Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled()
    */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow. We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}
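
/* A sketch of the timing model with made-up numbers: scheduling a texture
 * fetch with delay 100 at scoreboard->time T gives each of its children in
 * the DAG ready_time >= T + 100, so the CSP heuristic above will keep
 * preferring other ready work until enough instructions have been emitted (or
 * time has been bumped) to cover that latency.
 */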

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
      return 1;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
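
/* Worked example with made-up numbers: max_delay accumulates delay along a
 * dependency chain, so a chain of two ALU ops (delay 1 each) and one tex
 * (delay 100) ends up with max_delay 102 at the far end of the chain, which
 * puts the whole chain on the critical path for the CSP heuristic.
 */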

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help. That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling. "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard). This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately. The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}
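
/* Example of how a backend might call this pass (a sketch: the shader
 * variable name and the threshold value are made up and would need tuning per
 * architecture):
 *
 *    nir_schedule_options schedule_options = {
 *       .threshold = 64,
 *    };
 *    nir_schedule(s, &schedule_options);
 */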