From 97bf3d3b2d4a2c01cafff413a44ec42002ba9956 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 28 Feb 2024 02:58:00 -0800 Subject: [PATCH] intel/brw: Replace CS_OPCODE_CS_TERMINATE with SHADER_OPCODE_SEND There's no need for special handling here, it's just a send message with a trivial g0 header and descriptor. Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_eu_defines.h | 5 --- src/intel/compiler/brw_fs.cpp | 3 -- src/intel/compiler/brw_fs.h | 1 - src/intel/compiler/brw_fs_generator.cpp | 45 ------------------- src/intel/compiler/brw_fs_reg_allocate.cpp | 25 ++++------- src/intel/compiler/brw_fs_visitor.cpp | 36 ++++++++++++--- src/intel/compiler/brw_inst.h | 9 ---- src/intel/compiler/brw_ir_performance.cpp | 7 +-- .../compiler/brw_schedule_instructions.cpp | 3 +- 9 files changed, 42 insertions(+), 92 deletions(-) diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 9ffb0cb9a58..8e54d72eea6 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -506,11 +506,6 @@ enum opcode { FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, - /** - * Terminate the compute shader. - */ - CS_OPCODE_CS_TERMINATE, - /** * GLSL barrier() */ diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 23fe5ffcfe2..9794c84d078 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -890,7 +890,6 @@ fs_inst::size_read(int arg) const return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8); break; - case CS_OPCODE_CS_TERMINATE: case SHADER_OPCODE_BARRIER: return REG_SIZE; @@ -2440,8 +2439,6 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op) case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: return "interp_per_slot_offset"; - case CS_OPCODE_CS_TERMINATE: - return "cs_terminate"; case SHADER_OPCODE_BARRIER: return "barrier"; case SHADER_OPCODE_MULH: diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index f7e948d06a9..895a449b1ac 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -498,7 +498,6 @@ private: struct brw_reg payload2); void generate_fb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload); - void generate_cs_terminate(fs_inst *inst, struct brw_reg payload); void generate_barrier(fs_inst *inst, struct brw_reg src); bool generate_linterp(fs_inst *inst, struct brw_reg dst, struct brw_reg *src); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index b41fb9c46e7..03a5b2fe53f 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -611,46 +611,6 @@ fs_generator::generate_quad_swizzle(const fs_inst *inst, } } -void -fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) -{ - struct brw_inst *insn; - - insn = brw_next_insn(p, BRW_OPCODE_SEND); - - brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); - brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); - if (devinfo->ver < 12) - brw_set_src1(p, insn, brw_imm_ud(0u)); - - /* For XeHP and newer send a message to the message gateway to terminate a - * compute shader. For older devices, a message is sent to the thread - * spawner. - */ - if (devinfo->verx10 >= 125) - brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY); - else - brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); - brw_inst_set_mlen(devinfo, insn, 1); - brw_inst_set_rlen(devinfo, insn, 0); - brw_inst_set_eot(devinfo, insn, inst->eot); - brw_inst_set_header_present(devinfo, insn, false); - - brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ - - if (devinfo->ver < 11) { - brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ - - /* Note that even though the thread has a URB resource associated with it, - * we set the "do not dereference URB" bit, because the URB resource is - * managed by the fixed-function unit, so it will free it automatically. - */ - brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ - } - - brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); -} - void fs_generator::generate_barrier(fs_inst *, struct brw_reg src) { @@ -1469,11 +1429,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, } break; - case CS_OPCODE_CS_TERMINATE: - generate_cs_terminate(inst, src[0]); - send_count++; - break; - case SHADER_OPCODE_BARRIER: generate_barrier(inst, src[0]); send_count++; diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 990ada7293b..9a48af3ae51 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -221,24 +221,15 @@ void fs_visitor::calculate_payload_ranges(unsigned payload_node_count, } } - /* Special case instructions which have extra implied registers used. */ - switch (inst->opcode) { - case CS_OPCODE_CS_TERMINATE: + if (inst->eot) { + /* We could omit this for the !inst->header_present case, except + * that the simulator apparently incorrectly reads from g0/g1 + * instead of sideband. It also really freaks out driver + * developers to see g0 used in unusual places, so just always + * reserve it. + */ payload_last_use_ip[0] = use_ip; - break; - - default: - if (inst->eot) { - /* We could omit this for the !inst->header_present case, except - * that the simulator apparently incorrectly reads from g0/g1 - * instead of sideband. It also really freaks out driver - * developers to see g0 used in unusual places, so just always - * reserve it. - */ - payload_last_use_ip[0] = use_ip; - payload_last_use_ip[1] = use_ip; - } - break; + payload_last_use_ip[1] = use_ip; } ip++; diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 2d237561253..8efb89d1244 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -970,7 +970,7 @@ fs_visitor::emit_urb_fence() void fs_visitor::emit_cs_terminate() { - const fs_builder bld = fs_builder(this).at_end(); + const fs_builder ubld = fs_builder(this).at_end().exec_all(); /* We can't directly send from g0, since sends with EOT have to use * g112-127. So, copy it to a virtual register, The register allocator will @@ -978,12 +978,36 @@ fs_visitor::emit_cs_terminate() */ struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - bld.group(8, 0).exec_all().MOV(payload, g0); + ubld.group(8, 0).MOV(payload, g0); - /* Send a message to the thread spawner to terminate the thread. */ - fs_inst *inst = bld.exec_all() - .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload); - inst->eot = true; + /* Set the descriptor to "Dereference Resource" and "Root Thread" */ + unsigned desc = 0; + + /* Set Resource Select to "Do not dereference URB" on Gfx < 11. + * + * Note that even though the thread has a URB resource associated with it, + * we set the "do not dereference URB" bit, because the URB resource is + * managed by the fixed-function unit, so it will free it automatically. + */ + if (devinfo->ver < 11) + desc |= (1 << 4); /* Do not dereference URB */ + + fs_reg srcs[4] = { + brw_imm_ud(desc), /* desc */ + brw_imm_ud(0), /* ex_desc */ + payload, /* payload */ + fs_reg(), /* payload2 */ + }; + + fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, reg_undef, srcs, 4); + + /* On Alchemist and later, send an EOT message to the message gateway to + * terminate a compute shader. For older GPUs, send to the thread spawner. + */ + send->sfid = devinfo->verx10 >= 125 ? BRW_SFID_MESSAGE_GATEWAY + : BRW_SFID_THREAD_SPAWNER; + send->mlen = 1; + send->eot = true; } fs_visitor::fs_visitor(const struct brw_compiler *compiler, diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h index c5aaae5c38c..308525e6234 100644 --- a/src/intel/compiler/brw_inst.h +++ b/src/intel/compiler/brw_inst.h @@ -860,15 +860,6 @@ F(rt_slot_group, /* 9+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11)) F(rt_message_type, /* 9+ */ MD(10), MD( 8), /* 12+ */ MD12(10), MD12(8)) /** @} */ -/** - * Thread Spawn message function control bits: - * @{ - */ -FC(ts_resource_select, /* 9+ */ MD( 4), MD( 4), /* 12+ */ -1, -1, devinfo->ver < 11) -FC(ts_request_type, /* 9+ */ MD( 1), MD( 1), /* 12+ */ -1, -1, devinfo->ver < 11) -F(ts_opcode, /* 9+ */ MD( 0), MD( 0), /* 12+ */ MD12(0), MD12(0)) -/** @} */ - /** * Pixel Interpolator message function control bits: * @{ diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index daf5985d2a5..34efa3e6645 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -594,10 +594,6 @@ namespace { 0 /* XXX */, 0, 0, 0, 0, 0, 0, 0); - case CS_OPCODE_CS_TERMINATE: - return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, - 10 /* XXX */, 0, 0, 0, 0, 0); - case SHADER_OPCODE_SEND: switch (info.sfid) { case GFX6_SFID_DATAPORT_CONSTANT_CACHE: @@ -685,7 +681,8 @@ namespace { abort(); } - case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: + case BRW_SFID_MESSAGE_GATEWAY: + case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */ case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, 10 /* XXX */, 0, 0, 0, 0, 0); diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 20638456677..807473d809f 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -522,7 +522,8 @@ schedule_node::set_latency(const struct brw_isa_info *isa) } break; - case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: + case BRW_SFID_MESSAGE_GATEWAY: + case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */ case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: /* TODO. *