From dcefa0e6b351e8957034d7e1b2cd2ae8d55e95eb Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Mon, 8 Dec 2025 09:58:47 -0800 Subject: [PATCH] brw: Rework UIP and JIP setting code The current code walks the instructions, and when needed, it will scan to find the next "end of scope" and sometimes the next "end of block". It also has a separate patching logic for HALTs. The new code collects the necessary scope information up front, then walks the instruction backwards, making avoiding the need to scan for the end of scope. It will also walk only the relevant instructions that were previously collected. It also replaces the previous HALT-specific patching logic. With this new change, many cases that were jumping to intermediate HALTs, will now jump straight to the end of scope (or the "end of the program" section). E.g. in ``` if ... (...) HALT ... (...) HALT endif ``` both HALTs now will jump to the end of the scope, instead of the first HALT jumping into the second one. Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw/brw_eu.h | 2 +- src/intel/compiler/brw/brw_eu_emit.c | 157 ------------- src/intel/compiler/brw/brw_generator.cpp | 272 +++++++++++++++++------ src/intel/compiler/brw/brw_generator.h | 6 +- 4 files changed, 208 insertions(+), 229 deletions(-) diff --git a/src/intel/compiler/brw/brw_eu.h b/src/intel/compiler/brw/brw_eu.h index 4e1606da3bb..77cfdaecb28 100644 --- a/src/intel/compiler/brw/brw_eu.h +++ b/src/intel/compiler/brw/brw_eu.h @@ -1589,7 +1589,7 @@ brw_set_desc(struct brw_codegen *p, brw_eu_inst *insn, unsigned desc, bool gathe brw_set_desc_ex(p, insn, desc, 0, gather); } -void brw_set_uip_jip(struct brw_codegen *p, int start_offset); +void brw_set_uip_jip(struct brw_codegen *p, int start_offset, int final_halt_offset); enum brw_conditional_mod brw_negate_cmod(enum brw_conditional_mod cmod); enum brw_conditional_mod brw_swap_cmod(enum brw_conditional_mod cmod); diff --git a/src/intel/compiler/brw/brw_eu_emit.c b/src/intel/compiler/brw/brw_eu_emit.c index a2ccb23736e..07852db8c4c 100644 --- a/src/intel/compiler/brw/brw_eu_emit.c +++ b/src/intel/compiler/brw/brw_eu_emit.c @@ -1411,163 +1411,6 @@ brw_send_indirect_split_message(struct brw_codegen *p, brw_eu_inst_set_eot(devinfo, send, eot); } -static bool -while_jumps_before_offset(const struct intel_device_info *devinfo, - brw_eu_inst *insn, int while_offset, int start_offset) -{ - int scale = 16 / brw_jump_scale(devinfo); - int jip = brw_eu_inst_jip(devinfo, insn); - assert(jip < 0); - return while_offset + jip * scale <= start_offset; -} - - -static int -brw_find_next_block_end(struct brw_codegen *p, int start_offset) -{ - int offset; - void *store = p->store; - const struct intel_device_info *devinfo = p->devinfo; - - int depth = 0; - - for (offset = next_offset(p, store, start_offset); - offset < p->next_insn_offset; - offset = next_offset(p, store, offset)) { - brw_eu_inst *insn = store + offset; - - switch (brw_eu_inst_opcode(p->isa, insn)) { - case BRW_OPCODE_IF: - depth++; - break; - case BRW_OPCODE_ENDIF: - if (depth == 0) - return offset; - depth--; - break; - case BRW_OPCODE_WHILE: - /* If the while doesn't jump before our instruction, it's the end - * of a sibling do...while loop. Ignore it. - */ - if (!while_jumps_before_offset(devinfo, insn, offset, start_offset)) - continue; - FALLTHROUGH; - case BRW_OPCODE_ELSE: - case BRW_OPCODE_HALT: - if (depth == 0) - return offset; - break; - default: - break; - } - } - - return 0; -} - -/* There is no DO instruction on gfx6, so to find the end of the loop - * we have to see if the loop is jumping back before our start - * instruction. - */ -static int -brw_find_loop_end(struct brw_codegen *p, int start_offset) -{ - const struct intel_device_info *devinfo = p->devinfo; - int offset; - void *store = p->store; - - /* Always start after the instruction (such as a WHILE) we're trying to fix - * up. - */ - for (offset = next_offset(p, store, start_offset); - offset < p->next_insn_offset; - offset = next_offset(p, store, offset)) { - brw_eu_inst *insn = store + offset; - - if (brw_eu_inst_opcode(p->isa, insn) == BRW_OPCODE_WHILE) { - if (while_jumps_before_offset(devinfo, insn, offset, start_offset)) - return offset; - } - } - UNREACHABLE("not reached"); -} - -/* After program generation, go back and update the UIP and JIP of - * BREAK, CONT, and HALT instructions to their correct locations. - */ -void -brw_set_uip_jip(struct brw_codegen *p, int start_offset) -{ - const struct intel_device_info *devinfo = p->devinfo; - int offset; - int br = brw_jump_scale(devinfo); - int scale = 16 / br; - void *store = p->store; - - for (offset = start_offset; offset < p->next_insn_offset; offset += 16) { - brw_eu_inst *insn = store + offset; - assert(brw_eu_inst_cmpt_control(devinfo, insn) == 0); - - switch (brw_eu_inst_opcode(p->isa, insn)) { - case BRW_OPCODE_BREAK: { - int block_end_offset = brw_find_next_block_end(p, offset); - assert(block_end_offset != 0); - brw_eu_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); - /* Gfx7 UIP points to WHILE; Gfx6 points just after it */ - brw_eu_inst_set_uip(devinfo, insn, - (brw_find_loop_end(p, offset) - offset) / scale); - break; - } - - case BRW_OPCODE_CONTINUE: { - int block_end_offset = brw_find_next_block_end(p, offset); - assert(block_end_offset != 0); - brw_eu_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); - brw_eu_inst_set_uip(devinfo, insn, - (brw_find_loop_end(p, offset) - offset) / scale); - - assert(brw_eu_inst_uip(devinfo, insn) != 0); - assert(brw_eu_inst_jip(devinfo, insn) != 0); - break; - } - - case BRW_OPCODE_ENDIF: { - int block_end_offset = brw_find_next_block_end(p, offset); - int32_t jump = (block_end_offset == 0) ? - 1 * br : (block_end_offset - offset) / scale; - brw_eu_inst_set_jip(devinfo, insn, jump); - break; - } - - case BRW_OPCODE_HALT: { - /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): - * - * "In case of the halt instruction not inside any conditional - * code block, the value of and should be the - * same. In case of the halt instruction inside conditional code - * block, the should be the end of the program, and the - * should be end of the most inner conditional code block." - * - * The uip will have already been set by whoever set up the - * instruction. - */ - int block_end_offset = brw_find_next_block_end(p, offset); - if (block_end_offset == 0) { - brw_eu_inst_set_jip(devinfo, insn, brw_eu_inst_uip(devinfo, insn)); - } else { - brw_eu_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); - } - assert(brw_eu_inst_uip(devinfo, insn) != 0); - assert(brw_eu_inst_jip(devinfo, insn) != 0); - break; - } - - default: - break; - } - } -} - void brw_broadcast(struct brw_codegen *p, struct brw_reg dst, diff --git a/src/intel/compiler/brw/brw_generator.cpp b/src/intel/compiler/brw/brw_generator.cpp index c475f2fb354..766060ff8c5 100644 --- a/src/intel/compiler/brw/brw_generator.cpp +++ b/src/intel/compiler/brw/brw_generator.cpp @@ -27,6 +27,9 @@ * native instructions. */ +#include +#include + #include "brw_eu.h" #include "brw_disasm_info.h" #include "brw_shader.h" @@ -110,56 +113,6 @@ brw_generator::~brw_generator() { } -class ip_record : public brw_exec_node { -public: - DECLARE_RALLOC_CXX_OPERATORS(ip_record) - - ip_record(int ip) - { - this->ip = ip; - } - - int ip; -}; - -bool -brw_generator::patch_halt_jumps() -{ - if (this->discard_halt_patches.is_empty()) - return false; - - int scale = brw_jump_scale(p->devinfo); - - /* There is a somewhat strange undocumented requirement of using - * HALT, according to the simulator. If some channel has HALTed to - * a particular UIP, then by the end of the program, every channel - * must have HALTed to that UIP. Furthermore, the tracking is a - * stack, so you can't do the final halt of a UIP after starting - * halting to a new UIP. - * - * Symptoms of not emitting this instruction on actual hardware - * included GPU hangs and sparkly rendering on the piglit discard - * tests. - */ - brw_eu_inst *last_halt = brw_HALT(p); - brw_eu_inst_set_uip(p->devinfo, last_halt, 1 * scale); - brw_eu_inst_set_jip(p->devinfo, last_halt, 1 * scale); - - int ip = p->nr_insn; - - brw_foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { - brw_eu_inst *patch = &p->store[patch_ip->ip]; - - assert(brw_eu_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT); - /* HALT takes a half-instruction distance from the pre-incremented IP. */ - brw_eu_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); - } - - this->discard_halt_patches.make_empty(); - - return true; -} - void brw_generator::generate_send(brw_send_inst *inst, struct brw_reg dst, @@ -650,17 +603,6 @@ brw_generator::generate_ddy(const brw_inst *inst, } } -void -brw_generator::generate_halt(brw_inst *) -{ - /* This HALT will be patched up at FB write time to point UIP at the end of - * the program, and at brw_uip_jip() JIP will be set to the end of the - * current block (or the program). - */ - this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); - brw_HALT(p); -} - DEBUG_GET_ONCE_OPTION(shader_bin_override_path, "INTEL_SHADER_ASM_READ_PATH", NULL); @@ -755,6 +697,8 @@ brw_generator::generate_code(const brw_shader &s, brw_realign(p, 64); this->dispatch_width = dispatch_width; + this->final_halt_offset = -1; + this->needs_final_halt = false; int start_offset = p->next_insn_offset; @@ -1120,7 +1064,9 @@ brw_generator::generate_code(const brw_shader &s, break; case BRW_OPCODE_HALT: - generate_halt(inst); + /* This HALT will be patched by brw_set_uip_jip(). */ + this->needs_final_halt = true; + brw_HALT(p); break; case FS_OPCODE_SCHEDULING_FENCE: @@ -1236,11 +1182,23 @@ brw_generator::generate_code(const brw_shader &s, /* This is the place where the final HALT needs to be inserted if * we've emitted any discards. If not, this will emit no code. */ - if (!patch_halt_jumps()) { - if (unlikely(annotate)) { - disasm_info->use_tail = true; - } - } else if (devinfo->ver >= 12) { + if (!this->needs_final_halt) { + disasm_info->use_tail = true; + break; + } + + /* HALT temporarily disables channels, and the same instruction + * is used to re-enable them: once all channels are + * disabled, then they are re-enabled again immediately. + * + * So put a HALT right before the "epilogue" of the shader to make + * sure all channels get HALTed, so that this last HALT will re-enable + * them again. + */ + final_halt_offset = p->next_insn_offset; + brw_HALT(p); + + if (devinfo->ver >= 12) { /* This works around synchronization issues consequence of the * HALT instruction not being considered a control flow * instruction by the back-end -- The fact that it doesn't @@ -1374,7 +1332,7 @@ brw_generator::generate_code(const brw_shader &s, } } - brw_set_uip_jip(p, start_offset); + brw_set_uip_jip(p, start_offset, final_halt_offset); /* end of program sentinel */ disasm_new_inst_group(disasm_info, p->next_insn_offset); @@ -1582,3 +1540,181 @@ void brw_prog_data_init(struct brw_stage_prog_data *prog_data, prog_data->total_scratch = 0; prog_data->total_shared = params->nir->info.shared_size; } + +/* After program generation, go back and update the UIP and JIP of + * BREAK, CONT, ENDIF and HALT instructions to their correct locations. + */ +void +brw_set_uip_jip(struct brw_codegen *p, int start_offset, int final_halt_offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + const int end_offset = p->next_insn_offset; + brw_eu_inst *store = p->store; + + struct branch_info { + enum opcode opcode; + int offset; + + /* For loop headers. */ + int loop_end_offset; + }; + + /* Collect information about the control flow instructions and any + * instruction that are loop headers. There might be multiple entries + * for instructions that act as loop header for multiple loops and/or that + * are control flow instruction themselves (e.g. IF as the loop header). + */ + std::vector infos; + for (int offset = start_offset; offset < end_offset; offset += 16) { + brw_eu_inst *insn = store + (offset / 16); + assert(brw_eu_inst_cmpt_control(devinfo, insn) == 0); + + const enum opcode opcode = brw_eu_inst_opcode(p->isa, insn); + switch (opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_HALT: + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_WHILE: + infos.push_back({ + .opcode = opcode, + .offset = offset, + }); + if (opcode == BRW_OPCODE_WHILE) { + /* Also add an entry for the loop header. */ + const int jip = brw_eu_inst_jip(devinfo, insn); + assert(jip < 0); + infos.push_back({ + /* Use NOP to indicate this is a loop header entry. */ + .opcode = BRW_OPCODE_NOP, + .offset = offset + jip, + .loop_end_offset = offset, + }); + } + break; + + default: + /* Nothing to do. */ + break; + } + } + + /* Sort in scope order. */ + std::sort(infos.begin(), infos.end(), [](const auto &a, const auto &b) { + if (a.offset != b.offset) + return a.offset < b.offset; + /* Note the flipped comparison: want to see the largest scope first, + * since it contains the other. + */ + return a.loop_end_offset > b.loop_end_offset; + }); + + struct scope { + int end_offset; + + /* End of current loop if exists. */ + int loop_end_offset; + }; + + std::vector scopes; + scopes.push_back({-1, -1}); + + /* Walk backwards keeping track of the scopes. This make easy to + * get the innermost end of scope and the innermost end of loop. + */ + for (int i = infos.size() - 1; i >= 0; i--) { + const branch_info &info = infos[i]; + + brw_eu_inst *insn = store + (info.offset / 16); + + switch (info.opcode) { + case BRW_OPCODE_NOP: + case BRW_OPCODE_IF: + /* Pop the scope. NOP here is a stand in for loop headers. */ + scopes.pop_back(); + break; + + case BRW_OPCODE_ELSE: + /* For instructions before the ELSE in the conditional (i.e. the + * then-part of the loop), the scope ends here. + */ + scopes.back().end_offset = info.offset; + break; + + case BRW_OPCODE_ENDIF: { + const int innermost_end_offset = scopes.back().end_offset; + int jip_offset; + + if (innermost_end_offset != -1) + jip_offset = innermost_end_offset; + else if (final_halt_offset != -1) + jip_offset = final_halt_offset + 16; + else + jip_offset = info.offset + 16; + + brw_eu_inst_set_jip(devinfo, insn, jip_offset - info.offset); + + scopes.push_back({ + .end_offset = info.offset, + .loop_end_offset = scopes.back().loop_end_offset, + }); + break; + } + + case BRW_OPCODE_WHILE: + scopes.push_back({ + .end_offset = info.offset, + .loop_end_offset = info.offset, + }); + break; + + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: { + const int innermost_end_offset = scopes.back().end_offset; + brw_eu_inst_set_jip(devinfo, insn, innermost_end_offset - info.offset); + + const int loop_end_offset = scopes.back().loop_end_offset; + assert(loop_end_offset != -1); + assert(loop_end_offset > info.offset); + brw_eu_inst_set_uip(devinfo, insn, loop_end_offset - info.offset); + break; + } + + case BRW_OPCODE_HALT: { + /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): + * + * "In case of the halt instruction not inside any conditional + * code block, the value of and should be the + * same. In case of the halt instruction inside conditional code + * block, the should be the end of the program, and the + * should be end of the most inner conditional code block." + */ + const int innermost_end_offset = scopes.back().end_offset; + + /* If present, use the final HALT to infer the "end of the program". + * + * See also SHADER_OPCODE_HALT_TARGET. + */ + if (final_halt_offset != -1) { + if (final_halt_offset == info.offset) + assert(innermost_end_offset == -1); + + const int uip_offset = final_halt_offset + 16; + brw_eu_inst_set_uip(devinfo, insn, uip_offset - info.offset); + } + + if (innermost_end_offset != -1) + brw_eu_inst_set_jip(devinfo, insn, innermost_end_offset - info.offset); + else + brw_eu_inst_set_jip(devinfo, insn, brw_eu_inst_uip(devinfo, insn)); + break; + } + + default: + /* Nothing to do. */ + break; + } + } +} diff --git a/src/intel/compiler/brw/brw_generator.h b/src/intel/compiler/brw/brw_generator.h index 753f47d9d09..3fcbe785259 100644 --- a/src/intel/compiler/brw/brw_generator.h +++ b/src/intel/compiler/brw/brw_generator.h @@ -40,8 +40,6 @@ private: void generate_scratch_header(brw_inst *inst, struct brw_reg dst, struct brw_reg src); - void generate_halt(brw_inst *inst); - void generate_mov_indirect(brw_inst *inst, struct brw_reg dst, struct brw_reg reg, @@ -68,7 +66,9 @@ private: unsigned dispatch_width; /**< 8, 16 or 32 */ - brw_exec_list discard_halt_patches; + int final_halt_offset; + bool needs_final_halt; + bool debug_flag; const char *shader_name; mesa_shader_stage stage;