mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-29 01:30:08 +01:00
brw: add scheduler support for address registers
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
This commit is contained in:
parent
0a5bdf1199
commit
aac906c16c
1 changed files with 216 additions and 3 deletions
|
|
@ -95,6 +95,14 @@ public:
|
|||
*/
|
||||
int issue_time;
|
||||
|
||||
/**
|
||||
* Nodes whose address register writes this instruction reads (the count also
|
||||
* tells whether it reads the address register at all, to speed up checks).
|
||||
*/
|
||||
schedule_node **address_read;
|
||||
int address_read_count;
|
||||
int address_read_cap;
|
||||
|
||||
/* Temporary data used during the scheduling process. */
|
||||
struct {
|
||||
int parent_count;
|
||||
|
|
@ -579,6 +587,7 @@ public:
|
|||
void add_cross_lane_deps(schedule_node *n);
|
||||
void add_dep(schedule_node *before, schedule_node *after, int latency);
|
||||
void add_dep(schedule_node *before, schedule_node *after);
|
||||
void add_address_dep(schedule_node *before, schedule_node *after);
|
||||
|
||||
void set_current_block(bblock_t *block);
|
||||
void compute_delays();
|
||||
|
|
@ -590,6 +599,7 @@ public:
|
|||
void calculate_deps();
|
||||
bool is_compressed(const fs_inst *inst);
|
||||
bool register_needs_barrier(const brw_reg &reg);
|
||||
bool address_register_interfere(const schedule_node *n);
|
||||
schedule_node *choose_instruction_to_schedule();
|
||||
int calculate_issue_time(const fs_inst *inst);
|
||||
|
||||
|
|
@ -626,6 +636,9 @@ public:
|
|||
unsigned cand_generation;
|
||||
int time;
|
||||
exec_list available;
|
||||
|
||||
/* Currently used address register */
|
||||
uint32_t address_register[16];
|
||||
} current;
|
||||
|
||||
bool post_reg_alloc;
|
||||
|
|
@ -946,8 +959,19 @@ instruction_scheduler::compute_delays()
|
|||
n->delay = n->issue_time;
|
||||
} else {
|
||||
for (int i = 0; i < n->children_count; i++) {
|
||||
assert(n->children[i].n->delay);
|
||||
n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
|
||||
if (n->children[i].n->delay == 0) {
|
||||
/* This is a special case for address register, where a child
|
||||
* could be a prior instruction.
|
||||
*
|
||||
* This ensures that an address register write instruction will
|
||||
* always unblock the reader of the address register. Otherwise
|
||||
* we could end up with scheduling deadlocks.
|
||||
*/
|
||||
assert(n->children[i].n->inst->dst.is_address());
|
||||
n->delay = MAX2(n->delay, 1);
|
||||
} else {
|
||||
n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1023,6 +1047,10 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
|
|||
child->effective_latency = latency;
|
||||
before->children_count++;
|
||||
after->initial_parent_count++;
|
||||
|
||||
/* Propagate the dependency to the address register instructions. */
|
||||
for (int i = 0; i < after->address_read_count; i++)
|
||||
add_dep(before, after->address_read[i]);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -1034,6 +1062,24 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
|
|||
add_dep(before, after, before->latency);
|
||||
}
|
||||
|
||||
/**
 * Register `before` as an address-register dependency of `after`.
 *
 * Adds a regular dependency edge from `before` to `after` using
 * before->latency, then appends `before` to after->address_read. add_dep()
 * walks that array to propagate any dependency added on `after` to the nodes
 * stored in it.
 *
 * Both `before` and `after` must be non-NULL (asserted).
 */
void
|
||||
instruction_scheduler::add_address_dep(schedule_node *before, schedule_node *after)
|
||||
{
|
||||
assert(before && after);
|
||||
|
||||
add_dep(before, after, before->latency);
|
||||
|
||||
/* The array is full: double its capacity (starting at 1) and reallocate it
 * with reralloc() on mem_ctx before appending.
 */
if (after->address_read_cap <= after->address_read_count) {
|
||||
after->address_read_cap = MAX2(2 * after->address_read_cap, 1);
|
||||
|
||||
after->address_read = reralloc(mem_ctx, after->address_read,
|
||||
schedule_node *,
|
||||
after->address_read_cap);
|
||||
}
|
||||
|
||||
/* Remember the node that `after` depends on for its address register. */
after->address_read[after->address_read_count++] = before;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_scheduling_barrier(const fs_inst *inst)
|
||||
{
|
||||
|
|
@ -1199,8 +1245,76 @@ instruction_scheduler::calculate_deps()
|
|||
* granular level.
|
||||
*/
|
||||
schedule_node *last_fixed_grf_write = NULL;
|
||||
schedule_node *last_address_write[16] = {};
|
||||
|
||||
/* top-to-bottom dependencies: RAW and WAW. */
|
||||
|
||||
if (!post_reg_alloc) {
|
||||
/* Address registers have virtual identifiers, allowing us to identify
|
||||
* which instructions need the values written to the register. The
|
||||
* address register is written/read in pairs of instructions (enforced
|
||||
* by brw_fs_validate.cpp).
|
||||
*
|
||||
* To allow scheduling of SEND messages, out of order, without the
|
||||
* address register tracking generating serialized dependency between
|
||||
* all the messages, we first track all the dependencies of the address
|
||||
* register. Those dependencies are added to the instructions consuming
|
||||
* the address register value. Then when doing the normal dependency
|
||||
* tracking, any node adding a dependency to an instruction consuming
|
||||
* the address register is also added as dependency to the instruction
|
||||
* writing the value to the address register.
|
||||
*
|
||||
* This scheme allows the scheduling done by
|
||||
* choose_instruction_to_schedule() to ensure that once an instruction
|
||||
* writing the address register is scheduled, we can always schedule all
|
||||
* instructions making use of the address register value. Otherwise we
|
||||
* could run into scheduling deadlocks.
|
||||
*
|
||||
* Here is a deadlock example :
|
||||
*
|
||||
* mov a0, 0x42
|
||||
* send grf1, ..., a0
|
||||
* mov a0, 0x43
|
||||
* send grf2, grf1, a0
|
||||
*
|
||||
* Let's say choose_instruction_to_schedule() chooses the second mov
|
||||
* instruction first (mov a0, 0x43). Then it cannot schedule the second
|
||||
* send instruction because the first send instruction populating grf1
|
||||
* has not been scheduled, and we cannot schedule the first mov
|
||||
* either because the address register is already in use for another
|
||||
* message.
|
||||
*
|
||||
* In post-register-allocation mode, this scheme cannot work as all GRFs
|
||||
* can get reused and we have to serialize all address register usages
|
||||
* (like the accumulator, flag, etc...).
|
||||
*/
|
||||
for (schedule_node *n = current.start; n < current.end; n++) {
|
||||
fs_inst *inst = (fs_inst *)n->inst;
|
||||
|
||||
/* Pre-pass going over instructions using the address register as a
|
||||
* source.
|
||||
*/
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (!inst->src[i].is_address())
|
||||
continue;
|
||||
|
||||
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
|
||||
assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write));
|
||||
schedule_node *write_addr_node =
|
||||
last_address_write[inst->src[i].address_slot(byte)];
|
||||
assert(write_addr_node->inst->dst.nr == inst->src[i].nr);
|
||||
add_address_dep(write_addr_node, n);
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->dst.is_address()) {
|
||||
for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
|
||||
last_address_write[inst->dst.address_slot(byte)] = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (schedule_node *n = current.start; n < current.end; n++) {
|
||||
fs_inst *inst = (fs_inst *)n->inst;
|
||||
|
||||
|
|
@ -1225,12 +1339,16 @@ instruction_scheduler::calculate_deps()
|
|||
}
|
||||
} else if (inst->src[i].is_accumulator()) {
|
||||
add_dep(last_accumulator_write, n);
|
||||
} else if (inst->src[i].is_address()) {
|
||||
if (post_reg_alloc) {
|
||||
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2)
|
||||
add_dep(last_address_write[inst->src[i].address_slot(byte)], n);
|
||||
}
|
||||
} else if (register_needs_barrier(inst->src[i])) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (const unsigned mask = inst->flags_read(s->devinfo)) {
|
||||
assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
|
||||
|
||||
|
|
@ -1264,6 +1382,13 @@ instruction_scheduler::calculate_deps()
|
|||
} else if (inst->dst.is_accumulator()) {
|
||||
add_dep(last_accumulator_write, n);
|
||||
last_accumulator_write = n;
|
||||
} else if (inst->dst.is_address()) {
|
||||
if (post_reg_alloc) {
|
||||
for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
|
||||
add_dep(last_address_write[inst->dst.address_slot(byte)], n);
|
||||
last_address_write[inst->dst.address_slot(byte)] = n;
|
||||
}
|
||||
}
|
||||
} else if (register_needs_barrier(inst->dst)) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
|
|
@ -1284,6 +1409,13 @@ instruction_scheduler::calculate_deps()
|
|||
add_dep(last_accumulator_write, n);
|
||||
last_accumulator_write = n;
|
||||
}
|
||||
|
||||
if (post_reg_alloc && inst->uses_address_register_implicitly()) {
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++) {
|
||||
add_dep(last_address_write[i], n);
|
||||
last_address_write[i] = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
clear_last_grf_write();
|
||||
|
|
@ -1292,6 +1424,7 @@ instruction_scheduler::calculate_deps()
|
|||
memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
|
||||
last_accumulator_write = NULL;
|
||||
last_fixed_grf_write = NULL;
|
||||
memset(last_address_write, 0, sizeof(last_address_write));
|
||||
|
||||
for (schedule_node *n = current.end - 1; n >= current.start; n--) {
|
||||
fs_inst *inst = (fs_inst *)n->inst;
|
||||
|
|
@ -1310,6 +1443,12 @@ instruction_scheduler::calculate_deps()
|
|||
}
|
||||
} else if (inst->src[i].is_accumulator()) {
|
||||
add_dep(n, last_accumulator_write, 0);
|
||||
} else if (inst->src[i].is_address()) {
|
||||
if (post_reg_alloc) {
|
||||
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
|
||||
add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0);
|
||||
}
|
||||
}
|
||||
} else if (register_needs_barrier(inst->src[i])) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
|
|
@ -1328,6 +1467,11 @@ instruction_scheduler::calculate_deps()
|
|||
add_dep(n, last_accumulator_write);
|
||||
}
|
||||
|
||||
if (post_reg_alloc && inst->uses_address_register_implicitly()) {
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++)
|
||||
last_address_write[i] = n;
|
||||
}
|
||||
|
||||
/* Update the things this instruction wrote, so earlier reads
|
||||
* can mark this as WAR dependency.
|
||||
*/
|
||||
|
|
@ -1343,6 +1487,11 @@ instruction_scheduler::calculate_deps()
|
|||
}
|
||||
} else if (inst->dst.is_accumulator()) {
|
||||
last_accumulator_write = n;
|
||||
} else if (inst->dst.is_address()) {
|
||||
if (post_reg_alloc) {
|
||||
for (unsigned byte = 0; byte < inst->size_written; byte += 2)
|
||||
last_address_write[inst->dst.address_slot(byte)] = n;
|
||||
}
|
||||
} else if (register_needs_barrier(inst->dst)) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
|
|
@ -1364,6 +1513,39 @@ instruction_scheduler::calculate_deps()
|
|||
clear_last_grf_write();
|
||||
}
|
||||
|
||||
/**
 * Tell whether scheduling `n` now would conflict with the address register
 * contents tracked in current.address_register.
 *
 * Returns true when:
 *  - `n` uses the address register implicitly and any tracked slot is
 *    non-zero (if every slot is zero this case returns false right away,
 *    without looking at the checks below);
 *  - `n` writes the address register and one of the written slots holds a
 *    non-zero value different from n->inst->dst.nr;
 *  - `n` has address_read entries and one of the slots read through an
 *    address source does not hold that source's nr.
 * Returns false otherwise.
 *
 * NOTE(review): a slot value of 0 appears to mean "not in use" --
 * update_children() stores dst.nr when a writer is scheduled and resets the
 * slot to 0 when a reader is scheduled; confirm that nr is never 0 for
 * address registers.
 */
bool
|
||||
instruction_scheduler::address_register_interfere(const schedule_node *n)
|
||||
{
|
||||
/* Implicit users conflict with any slot that is currently non-zero. */
if (n->inst->uses_address_register_implicitly()) {
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(current.address_register); i++)
|
||||
if (current.address_register[i] != 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Writers: each written slot (stepping 2 bytes at a time) must be either
 * zero or already hold this instruction's dst.nr.
 */
if (n->inst->dst.is_address()) {
|
||||
for (unsigned byte = 0; byte < n->inst->size_written; byte += 2) {
|
||||
if (current.address_register[n->inst->dst.address_slot(byte)] != 0 &&
|
||||
current.address_register[n->inst->dst.address_slot(byte)] != n->inst->dst.nr)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Readers: every slot read through an address source must currently hold
 * that source's nr. The address_read_count test skips the source scan for
 * nodes with no recorded address register dependency.
 */
if (n->address_read_count > 0) {
|
||||
for (unsigned i = 0; i < n->inst->sources; i++) {
|
||||
if (!n->inst->src[i].is_address())
|
||||
continue;
|
||||
for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) {
|
||||
if (current.address_register[n->inst->src[i].address_slot(byte)] !=
|
||||
n->inst->src[i].nr)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
schedule_node *
|
||||
instruction_scheduler::choose_instruction_to_schedule()
|
||||
{
|
||||
|
|
@ -1377,6 +1559,9 @@ instruction_scheduler::choose_instruction_to_schedule()
|
|||
* otherwise the oldest one.
|
||||
*/
|
||||
foreach_in_list(schedule_node, n, &current.available) {
|
||||
if (!post_reg_alloc && address_register_interfere(n))
|
||||
continue;
|
||||
|
||||
if (!chosen ||
|
||||
exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen) ||
|
||||
(exit_tmp_unblocked_time(n) == exit_tmp_unblocked_time(chosen) &&
|
||||
|
|
@ -1395,6 +1580,9 @@ instruction_scheduler::choose_instruction_to_schedule()
|
|||
* latency.
|
||||
*/
|
||||
foreach_in_list(schedule_node, n, &current.available) {
|
||||
if (!post_reg_alloc && address_register_interfere(n))
|
||||
continue;
|
||||
|
||||
if (!chosen) {
|
||||
chosen = n;
|
||||
chosen_register_pressure_benefit =
|
||||
|
|
@ -1512,6 +1700,29 @@ instruction_scheduler::schedule(schedule_node *chosen)
|
|||
void
|
||||
instruction_scheduler::update_children(schedule_node *chosen)
|
||||
{
|
||||
if (chosen->address_read_count > 0) {
|
||||
for (unsigned i = 0; i < chosen->inst->sources; i++) {
|
||||
if (!chosen->inst->src[i].is_address())
|
||||
continue;
|
||||
for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) {
|
||||
assert(chosen->inst->src[i].address_slot(byte) <
|
||||
ARRAY_SIZE(current.address_register));
|
||||
current.address_register[chosen->inst->src[i].address_slot(byte)] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (chosen->inst->dst.is_address()) {
|
||||
for (unsigned byte = 0; byte < chosen->inst->size_written; byte += 2) {
|
||||
assert(chosen->inst->dst.address_slot(byte) <
|
||||
ARRAY_SIZE(current.address_register));
|
||||
current.address_register[
|
||||
chosen->inst->dst.address_slot(byte)] = chosen->inst->dst.nr;
|
||||
}
|
||||
} else if (chosen->inst->uses_address_register_implicitly()) {
|
||||
memset(current.address_register, 0, sizeof(current.address_register));
|
||||
}
|
||||
|
||||
/* Now that we've scheduled a new instruction, some of its
|
||||
* children can be promoted to the list of instructions ready to
|
||||
* be scheduled. Update the children's unblocked time for this
|
||||
|
|
@ -1557,6 +1768,8 @@ instruction_scheduler::schedule_instructions()
|
|||
|
||||
current.block->instructions.make_empty();
|
||||
|
||||
memset(current.address_register, 0, sizeof(current.address_register));
|
||||
|
||||
while (!current.available.is_empty()) {
|
||||
schedule_node *chosen = choose_instruction_to_schedule();
|
||||
schedule(chosen);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue