From 13cca489203bbfb30050fb79bcaff92f5691c7d8 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 21 Dec 2022 20:16:27 +0200 Subject: [PATCH] intel/fs: drop FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7 We can lower FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD into other more generic sends and drop this internal opcode. The idea behind this change is to allow bindless surfaces to be used for UBO pulls and why it's interesting to be able to reuse setup_surface_descriptors(). But that will come in a later change. No shader-db changes on TGL & DG2. Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_eu_defines.h | 12 +- src/intel/compiler/brw_fs.cpp | 120 ++---------------- src/intel/compiler/brw_fs.h | 4 - src/intel/compiler/brw_fs_generator.cpp | 67 ---------- src/intel/compiler/brw_fs_nir.cpp | 10 +- src/intel/compiler/brw_ir_performance.cpp | 9 +- .../compiler/brw_lower_logical_sends.cpp | 115 +++++++++++++++++ .../compiler/brw_schedule_instructions.cpp | 6 +- src/intel/compiler/brw_shader.cpp | 2 - 9 files changed, 154 insertions(+), 191 deletions(-) diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 537203556c9..f40e87be456 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -559,7 +559,6 @@ enum opcode { FS_OPCODE_PIXEL_X, FS_OPCODE_PIXEL_Y, FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7, FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4, FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, FS_OPCODE_SET_SAMPLE_ID, @@ -888,6 +887,17 @@ enum tex_logical_srcs { TEX_LOGICAL_NUM_SRCS, }; +enum pull_uniform_constant_srcs { + /** Surface binding table index */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE, + /** Surface offset */ + PULL_UNIFORM_CONSTANT_SRC_OFFSET, + /** Pull size */ + PULL_UNIFORM_CONSTANT_SRC_SIZE, + + PULL_UNIFORM_CONSTANT_SRCS, +}; + enum surface_logical_srcs { /** Surface binding table index */ SURFACE_LOGICAL_SRC_SURFACE, diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 50191448d3d..e1d4420b23b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -248,7 +248,6 @@ fs_inst::is_control_source(unsigned arg) const { switch (opcode) { case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: return arg == 0; @@ -306,9 +305,6 @@ fs_inst::is_payload(unsigned arg) const case SHADER_OPCODE_BARRIER: return arg == 0; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - return arg == 1; - case SHADER_OPCODE_SEND: return arg == 2 || arg == 3; @@ -864,12 +860,6 @@ fs_inst::size_read(int arg) const return 1; break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - /* The payload is actually stored in src1 */ - if (arg == 1) - return mlen * REG_SIZE; - break; - case FS_OPCODE_LINTERP: if (arg == 1) return 16; @@ -2460,8 +2450,14 @@ fs_visitor::lower_constant_loads() const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); const unsigned base = pull_index * 4; - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; @@ -3678,106 +3674,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds() invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } -/** - * Turns the generic expression-style uniform pull constant load instruction - * into a hardware-specific series of instructions for loading a pull - * constant. - * - * The expression style allows the CSE pass before this to optimize out - * repeated loads from the same offset, and gives the pre-register-allocation - * scheduling full flexibility, while the conversion to native instructions - * allows the post-register-allocation scheduler the best information - * possible. - * - * Note that execution masking for setting up pull constant loads is special: - * the channels that need to be written are unrelated to the current execution - * mask, since a later instruction will use one of the result channels as a - * source operand for all 8 or 16 of its channels. - */ -void -fs_visitor::lower_uniform_pull_constant_loads() -{ - foreach_block_and_inst (block, fs_inst, inst, cfg) { - if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) - continue; - - const fs_reg& surface = inst->src[0]; - const fs_reg& offset_B = inst->src[1]; - assert(offset_B.file == IMM); - - if (devinfo->has_lsc) { - const fs_builder ubld = - fs_builder(this, block, inst).group(8, 0).exec_all(); - - const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.MOV(payload, offset_B); - - inst->sfid = GFX12_SFID_UGM; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - 1 /* simd_size */, - LSC_ADDR_SURFTYPE_BTI, - LSC_ADDR_SIZE_A32, - 1 /* num_coordinates */, - LSC_DATA_SIZE_D32, - inst->size_written / 4, - true /* transpose */, - LSC_CACHE_LOAD_L1STATE_L3MOCS, - true /* has_dest */); - - fs_reg ex_desc; - if (surface.file == IMM) { - ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); - } else { - /* We only need the first component for the payload so we can use - * one of the other components for the extended descriptor - */ - ex_desc = component(payload, 1); - ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24)); - } - - /* Update the original instruction. */ - inst->opcode = SHADER_OPCODE_SEND; - inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); - inst->ex_mlen = 0; - inst->header_size = 0; - inst->send_has_side_effects = false; - inst->send_is_volatile = true; - inst->exec_size = 1; - - /* Finally, the payload */ - inst->resize_sources(3); - inst->src[0] = brw_imm_ud(0); /* desc */ - inst->src[1] = ex_desc; - inst->src[2] = payload; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else if (devinfo->ver >= 7) { - const fs_builder ubld = fs_builder(this, block, inst).exec_all(); - const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); - - ubld.group(8, 0).MOV(payload, - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - ubld.group(1, 0).MOV(component(payload, 2), - brw_imm_ud(offset_B.ud / 16)); - - inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7; - inst->src[1] = payload; - inst->header_size = 1; - inst->mlen = 1; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else { - /* Before register allocation, we didn't tell the scheduler about the - * MRF we use. We know it's safe to use this MRF because nothing - * else does except for register spill/unspill, which generates and - * uses its MRF within a single IR instruction. - */ - inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; - inst->mlen = 1; - } - } -} - bool fs_visitor::lower_load_payload() { diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 581cafab1fc..e09e0703d9b 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -621,10 +621,6 @@ private: void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst, struct brw_reg index, struct brw_reg offset); - void generate_uniform_pull_constant_load_gfx7(fs_inst *inst, - struct brw_reg dst, - struct brw_reg surf_index, - struct brw_reg payload); void generate_varying_pull_constant_load_gfx4(fs_inst *inst, struct brw_reg dst, struct brw_reg index); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 1d079d22ee4..895816b4b9d 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1553,67 +1553,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, read_offset, surf_index); } -void -fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst, - struct brw_reg dst, - struct brw_reg index, - struct brw_reg payload) -{ - assert(index.type == BRW_REGISTER_TYPE_UD); - assert(payload.file == BRW_GENERAL_REGISTER_FILE); - assert(type_sz(dst.type) == 4); - assert(!devinfo->has_lsc); - - if (index.file == BRW_IMMEDIATE_VALUE) { - const uint32_t surf_index = index.ud; - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_pop_insn_state(p); - - brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE); - brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); - brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); - brw_set_desc(p, send, - brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written, - REG_SIZE), true) | - brw_dp_desc(devinfo, surf_index, - GFX7_DATAPORT_DC_OWORD_BLOCK_READ, - BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size))); - - } else { - const tgl_swsb swsb = brw_get_default_swsb(p); - struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - /* a0.0 = surf_index & 0xff */ - brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); - brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); - brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); - brw_set_dest(p, insn_and, addr); - brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); - brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); - - /* dst = send(payload, a0.0 | ) */ - brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); - brw_send_indirect_message( - p, GFX6_SFID_DATAPORT_CONSTANT_CACHE, - retype(dst, BRW_REGISTER_TYPE_UD), - retype(payload, BRW_REGISTER_TYPE_UD), addr, - brw_message_desc(devinfo, 1, - DIV_ROUND_UP(inst->size_written, REG_SIZE), true) | - brw_dp_desc(devinfo, 0 /* surface */, - GFX7_DATAPORT_DC_OWORD_BLOCK_READ, - BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)), - false /* EOT */); - - brw_pop_insn_state(p); - } -} - void fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst, struct brw_reg dst, @@ -2294,12 +2233,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, send_count++; break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - assert(inst->force_writemask_all); - generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]); - send_count++; - break; - case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: generate_varying_pull_constant_load_gfx4(inst, dst, src[0]); send_count++; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 8da05b87b1c..3ebbca88d9e 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4776,9 +4776,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr const unsigned count = MIN2(instr->num_components - c, (block_sz - base % block_sz) / type_size); - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - packed_consts, surf_index, - brw_imm_ud(base & ~(block_sz - 1))); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surf_index; + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, + srcs, PULL_UNIFORM_CONSTANT_SRCS); const fs_reg consts = retype(byte_offset(packed_consts, base & (block_sz - 1)), diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index 3548cef79ce..3c7f8e46f59 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -1001,7 +1001,6 @@ namespace { abort(); case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); @@ -1036,6 +1035,14 @@ namespace { case SHADER_OPCODE_SEND: switch (info.sfid) { + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + if (devinfo->ver >= 7) { + /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */ + return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } case GFX6_SFID_DATAPORT_RENDER_CACHE: if (devinfo->ver >= 7) { switch (brw_dp_desc_msg_type(devinfo, info.desc)) { diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index a536d1a4f76..7bda05bd716 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -2778,3 +2778,118 @@ fs_visitor::lower_logical_sends() return progress; } + +/** + * Turns the generic expression-style uniform pull constant load instruction + * into a hardware-specific series of instructions for loading a pull + * constant. + * + * The expression style allows the CSE pass before this to optimize out + * repeated loads from the same offset, and gives the pre-register-allocation + * scheduling full flexibility, while the conversion to native instructions + * allows the post-register-allocation scheduler the best information + * possible. + * + * Note that execution masking for setting up pull constant loads is special: + * the channels that need to be written are unrelated to the current execution + * mask, since a later instruction will use one of the result channels as a + * source operand for all 8 or 16 of its channels. + */ +void +fs_visitor::lower_uniform_pull_constant_loads() +{ + foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) + continue; + + const fs_reg& surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE]; + const fs_reg& offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET]; + const fs_reg& size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE]; + assert(offset_B.file == IMM); + assert(size_B.file == IMM); + + if (devinfo->has_lsc) { + const fs_builder ubld = + fs_builder(this, block, inst).group(8, 0).exec_all(); + + const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(payload, offset_B); + + inst->sfid = GFX12_SFID_UGM; + inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, + 1 /* simd_size */, + LSC_ADDR_SURFTYPE_BTI, + LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, + inst->size_written / 4, + true /* transpose */, + LSC_CACHE_LOAD_L1STATE_L3MOCS, + true /* has_dest */); + + fs_reg ex_desc; + if (surface.file == IMM) { + ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); + } else { + /* We only need the first component for the payload so we can use + * one of the other components for the extended descriptor + */ + ex_desc = component(payload, 1); + ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24)); + } + + /* Update the original instruction. */ + inst->opcode = SHADER_OPCODE_SEND; + inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); + inst->ex_mlen = 0; + inst->header_size = 0; + inst->send_has_side_effects = false; + inst->send_is_volatile = true; + inst->exec_size = 1; + + /* Finally, the payload */ + inst->resize_sources(3); + inst->src[0] = brw_imm_ud(0); /* desc */ + inst->src[1] = ex_desc; + inst->src[2] = payload; + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + } else if (devinfo->ver >= 7) { + const fs_builder ubld = fs_builder(this, block, inst).exec_all(); + fs_reg header = bld.exec_all().group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); + + ubld.group(8, 0).MOV(header, + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + ubld.group(1, 0).MOV(component(header, 2), + brw_imm_ud(offset_B.ud / 16)); + + inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE; + inst->opcode = SHADER_OPCODE_SEND; + inst->header_size = 1; + inst->mlen = 1; + + uint32_t desc = + brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */, + size_B.ud / 4, false /* write */); + + inst->resize_sources(4); + + setup_surface_descriptors(ubld, inst, desc, + inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE], + fs_reg() /* surface_handle */); + + inst->src[2] = header; + inst->src[3] = fs_reg(); /* unused for reads */ + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + } else { + /* Before register allocation, we didn't tell the scheduler about the + * MRF we use. We know it's safe to use this MRF because nothing + * else does except for register spill/unspill, which generates and + * uses its MRF within a single IR instruction. + */ + inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; + inst->mlen = 1; + } + } +} diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 3286e3f83b9..26e30a63534 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -326,7 +326,6 @@ schedule_node::set_latency_gfx7(bool is_haswell) case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: case VS_OPCODE_PULL_CONSTANT_LOAD: /* testing using varying-index pull constants: * @@ -399,6 +398,11 @@ schedule_node::set_latency_gfx7(bool is_haswell) break; } + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */ + latency = 200; + break; + case GFX6_SFID_DATAPORT_RENDER_CACHE: switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) { case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE: diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index d4839cb22c5..8bead7fc84f 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -430,8 +430,6 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op) case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: return "uniform_pull_const"; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - return "uniform_pull_const_gfx7"; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: return "varying_pull_const_gfx4"; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: