diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index eaa894ff041..6990e061a3c 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -95,6 +95,14 @@ public: */ int issue_time; + /** + * Nodes recorded by add_address_dep(): the writers of the address register + * values read by this instruction (also speeds up instruction checks). + */ + schedule_node **address_read; + int address_read_count; + int address_read_cap; + /* Temporary data used during the scheduling process. */ struct { int parent_count; @@ -579,6 +587,7 @@ public: void add_cross_lane_deps(schedule_node *n); void add_dep(schedule_node *before, schedule_node *after, int latency); void add_dep(schedule_node *before, schedule_node *after); + void add_address_dep(schedule_node *before, schedule_node *after); void set_current_block(bblock_t *block); void compute_delays(); @@ -590,6 +599,7 @@ public: void calculate_deps(); bool is_compressed(const fs_inst *inst); bool register_needs_barrier(const brw_reg ®); + bool address_register_interfere(const schedule_node *n); schedule_node *choose_instruction_to_schedule(); int calculate_issue_time(const fs_inst *inst); @@ -626,6 +636,9 @@ public: unsigned cand_generation; int time; exec_list available; + + /* Virtual address register currently occupying each slot (0 if free) */ + uint32_t address_register[16]; } current; bool post_reg_alloc; @@ -946,8 +959,19 @@ instruction_scheduler::compute_delays() n->delay = n->issue_time; } else { for (int i = 0; i < n->children_count; i++) { - assert(n->children[i].n->delay); - n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay); + if (n->children[i].n->delay == 0) { + /* This is a special case for address register, where a child + * could be a prior instruction. + * + * This ensures that an address register write instruction will + * always unblock the reader of the address register. Otherwise + * we could end up with scheduling deadlocks. 
+ */ + assert(n->children[i].n->inst->dst.is_address()); + n->delay = MAX2(n->delay, 1); + } else { + n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay); + } } } } @@ -1023,6 +1047,10 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after, child->effective_latency = latency; before->children_count++; after->initial_parent_count++; + + /* Propagate the dependency to after's address register writers. */ + for (int i = 0; i < after->address_read_count; i++) + add_dep(before, after->address_read[i]); } void @@ -1034,6 +1062,24 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after) add_dep(before, after, before->latency); } +void +instruction_scheduler::add_address_dep(schedule_node *before, schedule_node *after) +{ + assert(before && after); + + add_dep(before, after, before->latency); + + if (after->address_read_cap <= after->address_read_count) { + after->address_read_cap = MAX2(2 * after->address_read_cap, 1); + + after->address_read = reralloc(mem_ctx, after->address_read, + schedule_node *, + after->address_read_cap); + } + + after->address_read[after->address_read_count++] = before; +} + static bool is_scheduling_barrier(const fs_inst *inst) { @@ -1199,8 +1245,76 @@ instruction_scheduler::calculate_deps() * granular level. */ schedule_node *last_fixed_grf_write = NULL; + schedule_node *last_address_write[16] = {}; /* top-to-bottom dependencies: RAW and WAW. */ + + if (!post_reg_alloc) { + /* Address registers have a virtual identifier, allowing us to identify + * which instructions need the values written to the register. The + * address register is written/read in pairs of instructions (enforced + * by brw_fs_validate.cpp). + * + * To allow scheduling of SEND messages, out of order, without the + * address register tracking generating serialized dependencies between + * all the messages, we first track all the dependencies of the address + * register. 
Those dependencies are added to the instructions consuming + * the address register value. Then when doing the normal dependency + * tracking, any node adding a dependency to an instruction consuming + * the address register is also added as a dependency to the instruction + * writing the value to the address register. + * + * This scheme allows the scheduling done by + * choose_instruction_to_schedule() to ensure that once an instruction + * writing the address register is scheduled, we can always schedule all + * instructions making use of the address register value. Otherwise we + * could run into scheduling deadlocks. + * + * Here is a deadlock example: + * + * mov a0, 0x42 + * send grf1, ..., a0 + * mov a0, 0x43 + * send grf2, grf1, a0 + * + * Let's say choose_instruction_to_schedule() chooses the second mov + * instruction first (mov a0, 0x43). Then it cannot schedule the second + * send instruction because the first send instruction populating grf1 + * has not been scheduled, and we cannot schedule the first mov + * either because the address register is already in use for another + * message. + * + * In post-register-allocation mode, this scheme cannot work as all GRFs + * can get reused and we have to serialize all address register usages + * (like the accumulator, flag, etc...). + */ + for (schedule_node *n = current.start; n < current.end; n++) { + fs_inst *inst = (fs_inst *)n->inst; + + /* Pre-pass going over instructions using the address register as a + * source. 
+ */ + for (int i = 0; i < inst->sources; i++) { + if (!inst->src[i].is_address()) + continue; + + for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) { + assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write)); + schedule_node *write_addr_node = + last_address_write[inst->src[i].address_slot(byte)]; + assert(write_addr_node->inst->dst.nr == inst->src[i].nr); + add_address_dep(write_addr_node, n); + } + } + + if (inst->dst.is_address()) { + for (unsigned byte = 0; byte < inst->size_written; byte += 2) { + last_address_write[inst->dst.address_slot(byte)] = n; + } + } + } + } + for (schedule_node *n = current.start; n < current.end; n++) { fs_inst *inst = (fs_inst *)n->inst; @@ -1225,12 +1339,16 @@ instruction_scheduler::calculate_deps() } } else if (inst->src[i].is_accumulator()) { add_dep(last_accumulator_write, n); + } else if (inst->src[i].is_address()) { + if (post_reg_alloc) { + for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) + add_dep(last_address_write[inst->src[i].address_slot(byte)], n); + } } else if (register_needs_barrier(inst->src[i])) { add_barrier_deps(n); } } - if (const unsigned mask = inst->flags_read(s->devinfo)) { assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); @@ -1264,6 +1382,13 @@ instruction_scheduler::calculate_deps() } else if (inst->dst.is_accumulator()) { add_dep(last_accumulator_write, n); last_accumulator_write = n; + } else if (inst->dst.is_address()) { + if (post_reg_alloc) { + for (unsigned byte = 0; byte < inst->size_written; byte += 2) { + add_dep(last_address_write[inst->dst.address_slot(byte)], n); + last_address_write[inst->dst.address_slot(byte)] = n; + } + } } else if (register_needs_barrier(inst->dst)) { add_barrier_deps(n); } @@ -1284,6 +1409,13 @@ instruction_scheduler::calculate_deps() add_dep(last_accumulator_write, n); last_accumulator_write = n; } + + if (post_reg_alloc && inst->uses_address_register_implicitly()) { + for (unsigned i = 0; i < 
ARRAY_SIZE(last_address_write); i++) { + add_dep(last_address_write[i], n); + last_address_write[i] = n; + } + } } clear_last_grf_write(); @@ -1292,6 +1424,7 @@ instruction_scheduler::calculate_deps() memset(last_conditional_mod, 0, sizeof(last_conditional_mod)); last_accumulator_write = NULL; last_fixed_grf_write = NULL; + memset(last_address_write, 0, sizeof(last_address_write)); for (schedule_node *n = current.end - 1; n >= current.start; n--) { fs_inst *inst = (fs_inst *)n->inst; @@ -1310,6 +1443,12 @@ instruction_scheduler::calculate_deps() } } else if (inst->src[i].is_accumulator()) { add_dep(n, last_accumulator_write, 0); + } else if (inst->src[i].is_address()) { + if (post_reg_alloc) { + for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) { + add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0); + } + } } else if (register_needs_barrier(inst->src[i])) { add_barrier_deps(n); } @@ -1328,6 +1467,11 @@ instruction_scheduler::calculate_deps() add_dep(n, last_accumulator_write); } + if (post_reg_alloc && inst->uses_address_register_implicitly()) { + for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++) + last_address_write[i] = n; + } + /* Update the things this instruction wrote, so earlier reads * can mark this as WAR dependency. 
*/ @@ -1343,6 +1487,11 @@ instruction_scheduler::calculate_deps() } } else if (inst->dst.is_accumulator()) { last_accumulator_write = n; + } else if (inst->dst.is_address()) { + if (post_reg_alloc) { + for (unsigned byte = 0; byte < inst->size_written; byte += 2) + last_address_write[inst->dst.address_slot(byte)] = n; + } } else if (register_needs_barrier(inst->dst)) { add_barrier_deps(n); } @@ -1364,6 +1513,39 @@ instruction_scheduler::calculate_deps() clear_last_grf_write(); } +bool +instruction_scheduler::address_register_interfere(const schedule_node *n) +{ + if (n->inst->uses_address_register_implicitly()) { + for (unsigned i = 0; i < ARRAY_SIZE(current.address_register); i++) + if (current.address_register[i] != 0) + return true; + return false; + } + + if (n->inst->dst.is_address()) { + for (unsigned byte = 0; byte < n->inst->size_written; byte += 2) { + if (current.address_register[n->inst->dst.address_slot(byte)] != 0 && + current.address_register[n->inst->dst.address_slot(byte)] != n->inst->dst.nr) + return true; + } + } + + if (n->address_read_count > 0) { + for (unsigned i = 0; i < n->inst->sources; i++) { + if (!n->inst->src[i].is_address()) + continue; + for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) { + if (current.address_register[n->inst->src[i].address_slot(byte)] != + n->inst->src[i].nr) + return true; + } + } + } + + return false; +} + schedule_node * instruction_scheduler::choose_instruction_to_schedule() { @@ -1377,6 +1559,9 @@ instruction_scheduler::choose_instruction_to_schedule() * otherwise the oldest one. */ foreach_in_list(schedule_node, n, ¤t.available) { + if (!post_reg_alloc && address_register_interfere(n)) + continue; + if (!chosen || exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen) || (exit_tmp_unblocked_time(n) == exit_tmp_unblocked_time(chosen) && @@ -1395,6 +1580,9 @@ instruction_scheduler::choose_instruction_to_schedule() * latency. 
*/ foreach_in_list(schedule_node, n, ¤t.available) { + if (!post_reg_alloc && address_register_interfere(n)) + continue; + if (!chosen) { chosen = n; chosen_register_pressure_benefit = @@ -1512,6 +1700,29 @@ instruction_scheduler::schedule(schedule_node *chosen) void instruction_scheduler::update_children(schedule_node *chosen) { + if (chosen->address_read_count > 0) { + for (unsigned i = 0; i < chosen->inst->sources; i++) { + if (!chosen->inst->src[i].is_address()) + continue; + for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) { + assert(chosen->inst->src[i].address_slot(byte) < + ARRAY_SIZE(current.address_register)); + current.address_register[chosen->inst->src[i].address_slot(byte)] = 0; + } + } + } + + if (chosen->inst->dst.is_address()) { + for (unsigned byte = 0; byte < chosen->inst->size_written; byte += 2) { + assert(chosen->inst->dst.address_slot(byte) < + ARRAY_SIZE(current.address_register)); + current.address_register[ + chosen->inst->dst.address_slot(byte)] = chosen->inst->dst.nr; + } + } else if (chosen->inst->uses_address_register_implicitly()) { + memset(current.address_register, 0, sizeof(current.address_register)); + } + /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to * be scheduled. Update the children's unblocked time for this @@ -1557,6 +1768,8 @@ instruction_scheduler::schedule_instructions() current.block->instructions.make_empty(); + memset(current.address_register, 0, sizeof(current.address_register)); + while (!current.available.is_empty()) { schedule_node *chosen = choose_instruction_to_schedule(); schedule(chosen);