diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 537203556c9..f40e87be456 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -559,7 +559,6 @@ enum opcode { FS_OPCODE_PIXEL_X, FS_OPCODE_PIXEL_Y, FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7, FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4, FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, FS_OPCODE_SET_SAMPLE_ID, @@ -888,6 +887,17 @@ enum tex_logical_srcs { TEX_LOGICAL_NUM_SRCS, }; +enum pull_uniform_constant_srcs { + /** Surface binding table index */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE, + /** Surface offset */ + PULL_UNIFORM_CONSTANT_SRC_OFFSET, + /** Pull size */ + PULL_UNIFORM_CONSTANT_SRC_SIZE, + + PULL_UNIFORM_CONSTANT_SRCS, +}; + enum surface_logical_srcs { /** Surface binding table index */ SURFACE_LOGICAL_SRC_SURFACE, diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 50191448d3d..e1d4420b23b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -248,7 +248,6 @@ fs_inst::is_control_source(unsigned arg) const { switch (opcode) { case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: return arg == 0; @@ -306,9 +305,6 @@ fs_inst::is_payload(unsigned arg) const case SHADER_OPCODE_BARRIER: return arg == 0; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - return arg == 1; - case SHADER_OPCODE_SEND: return arg == 2 || arg == 3; @@ -864,12 +860,6 @@ fs_inst::size_read(int arg) const return 1; break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - /* The payload is actually stored in src1 */ - if (arg == 1) - return mlen * REG_SIZE; - break; - case FS_OPCODE_LINTERP: if (arg == 1) return 16; @@ -2460,8 +2450,14 @@ fs_visitor::lower_constant_loads() const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); const unsigned base = pull_index * 4; - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; @@ -3678,106 +3674,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds() invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } -/** - * Turns the generic expression-style uniform pull constant load instruction - * into a hardware-specific series of instructions for loading a pull - * constant. - * - * The expression style allows the CSE pass before this to optimize out - * repeated loads from the same offset, and gives the pre-register-allocation - * scheduling full flexibility, while the conversion to native instructions - * allows the post-register-allocation scheduler the best information - * possible. - * - * Note that execution masking for setting up pull constant loads is special: - * the channels that need to be written are unrelated to the current execution - * mask, since a later instruction will use one of the result channels as a - * source operand for all 8 or 16 of its channels. - */ -void -fs_visitor::lower_uniform_pull_constant_loads() -{ - foreach_block_and_inst (block, fs_inst, inst, cfg) { - if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) - continue; - - const fs_reg& surface = inst->src[0]; - const fs_reg& offset_B = inst->src[1]; - assert(offset_B.file == IMM); - - if (devinfo->has_lsc) { - const fs_builder ubld = - fs_builder(this, block, inst).group(8, 0).exec_all(); - - const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.MOV(payload, offset_B); - - inst->sfid = GFX12_SFID_UGM; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - 1 /* simd_size */, - LSC_ADDR_SURFTYPE_BTI, - LSC_ADDR_SIZE_A32, - 1 /* num_coordinates */, - LSC_DATA_SIZE_D32, - inst->size_written / 4, - true /* transpose */, - LSC_CACHE_LOAD_L1STATE_L3MOCS, - true /* has_dest */); - - fs_reg ex_desc; - if (surface.file == IMM) { - ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); - } else { - /* We only need the first component for the payload so we can use - * one of the other components for the extended descriptor - */ - ex_desc = component(payload, 1); - ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24)); - } - - /* Update the original instruction. */ - inst->opcode = SHADER_OPCODE_SEND; - inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); - inst->ex_mlen = 0; - inst->header_size = 0; - inst->send_has_side_effects = false; - inst->send_is_volatile = true; - inst->exec_size = 1; - - /* Finally, the payload */ - inst->resize_sources(3); - inst->src[0] = brw_imm_ud(0); /* desc */ - inst->src[1] = ex_desc; - inst->src[2] = payload; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else if (devinfo->ver >= 7) { - const fs_builder ubld = fs_builder(this, block, inst).exec_all(); - const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); - - ubld.group(8, 0).MOV(payload, - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - ubld.group(1, 0).MOV(component(payload, 2), - brw_imm_ud(offset_B.ud / 16)); - - inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7; - inst->src[1] = payload; - inst->header_size = 1; - inst->mlen = 1; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else { - /* Before register allocation, we didn't tell the scheduler about the - * MRF we use. We know it's safe to use this MRF because nothing - * else does except for register spill/unspill, which generates and - * uses its MRF within a single IR instruction. - */ - inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; - inst->mlen = 1; - } - } -} - bool fs_visitor::lower_load_payload() { diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 581cafab1fc..e09e0703d9b 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -621,10 +621,6 @@ private: void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst, struct brw_reg index, struct brw_reg offset); - void generate_uniform_pull_constant_load_gfx7(fs_inst *inst, - struct brw_reg dst, - struct brw_reg surf_index, - struct brw_reg payload); void generate_varying_pull_constant_load_gfx4(fs_inst *inst, struct brw_reg dst, struct brw_reg index); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 1d079d22ee4..895816b4b9d 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1553,67 +1553,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, read_offset, surf_index); } -void -fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst, - struct brw_reg dst, - struct brw_reg index, - struct brw_reg payload) -{ - assert(index.type == BRW_REGISTER_TYPE_UD); - assert(payload.file == BRW_GENERAL_REGISTER_FILE); - assert(type_sz(dst.type) == 4); - assert(!devinfo->has_lsc); - - if (index.file == BRW_IMMEDIATE_VALUE) { - const uint32_t surf_index = index.ud; - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_pop_insn_state(p); - - brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE); - brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); - brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); - brw_set_desc(p, send, - brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written, - REG_SIZE), true) | - brw_dp_desc(devinfo, surf_index, - GFX7_DATAPORT_DC_OWORD_BLOCK_READ, - BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size))); - - } else { - const tgl_swsb swsb = brw_get_default_swsb(p); - struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - /* a0.0 = surf_index & 0xff */ - brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); - brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); - brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); - brw_set_dest(p, insn_and, addr); - brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); - brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); - - /* dst = send(payload, a0.0 | ) */ - brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); - brw_send_indirect_message( - p, GFX6_SFID_DATAPORT_CONSTANT_CACHE, - retype(dst, BRW_REGISTER_TYPE_UD), - retype(payload, BRW_REGISTER_TYPE_UD), addr, - brw_message_desc(devinfo, 1, - DIV_ROUND_UP(inst->size_written, REG_SIZE), true) | - brw_dp_desc(devinfo, 0 /* surface */, - GFX7_DATAPORT_DC_OWORD_BLOCK_READ, - BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)), - false /* EOT */); - - brw_pop_insn_state(p); - } -} - void fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst, struct brw_reg dst, @@ -2294,12 +2233,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, send_count++; break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - assert(inst->force_writemask_all); - generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]); - send_count++; - break; - case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: generate_varying_pull_constant_load_gfx4(inst, dst, src[0]); send_count++; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 8da05b87b1c..3ebbca88d9e 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4776,9 +4776,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr const unsigned count = MIN2(instr->num_components - c, (block_sz - base % block_sz) / type_size); - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - packed_consts, surf_index, - brw_imm_ud(base & ~(block_sz - 1))); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surf_index; + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, + srcs, PULL_UNIFORM_CONSTANT_SRCS); const fs_reg consts = retype(byte_offset(packed_consts, base & (block_sz - 1)), diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index 3548cef79ce..3c7f8e46f59 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -1001,7 +1001,6 @@ namespace { abort(); case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); @@ -1036,6 +1035,14 @@ namespace { case SHADER_OPCODE_SEND: switch (info.sfid) { + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + if (devinfo->ver >= 7) { + /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */ + return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } case GFX6_SFID_DATAPORT_RENDER_CACHE: if (devinfo->ver >= 7) { switch (brw_dp_desc_msg_type(devinfo, info.desc)) { diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index a536d1a4f76..7bda05bd716 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -2778,3 +2778,118 @@ fs_visitor::lower_logical_sends() return progress; } + +/** + * Turns the generic expression-style uniform pull constant load instruction + * into a hardware-specific series of instructions for loading a pull + * constant. + * + * The expression style allows the CSE pass before this to optimize out + * repeated loads from the same offset, and gives the pre-register-allocation + * scheduling full flexibility, while the conversion to native instructions + * allows the post-register-allocation scheduler the best information + * possible. + * + * Note that execution masking for setting up pull constant loads is special: + * the channels that need to be written are unrelated to the current execution + * mask, since a later instruction will use one of the result channels as a + * source operand for all 8 or 16 of its channels. + */ +void +fs_visitor::lower_uniform_pull_constant_loads() +{ + foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) + continue; + + const fs_reg& surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE]; + const fs_reg& offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET]; + const fs_reg& size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE]; + assert(offset_B.file == IMM); + assert(size_B.file == IMM); + + if (devinfo->has_lsc) { + const fs_builder ubld = + fs_builder(this, block, inst).group(8, 0).exec_all(); + + const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(payload, offset_B); + + inst->sfid = GFX12_SFID_UGM; + inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, + 1 /* simd_size */, + LSC_ADDR_SURFTYPE_BTI, + LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, + inst->size_written / 4, + true /* transpose */, + LSC_CACHE_LOAD_L1STATE_L3MOCS, + true /* has_dest */); + + fs_reg ex_desc; + if (surface.file == IMM) { + ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); + } else { + /* We only need the first component for the payload so we can use + * one of the other components for the extended descriptor + */ + ex_desc = component(payload, 1); + ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24)); + } + + /* Update the original instruction. */ + inst->opcode = SHADER_OPCODE_SEND; + inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); + inst->ex_mlen = 0; + inst->header_size = 0; + inst->send_has_side_effects = false; + inst->send_is_volatile = true; + inst->exec_size = 1; + + /* Finally, the payload */ + inst->resize_sources(3); + inst->src[0] = brw_imm_ud(0); /* desc */ + inst->src[1] = ex_desc; + inst->src[2] = payload; + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + } else if (devinfo->ver >= 7) { + const fs_builder ubld = fs_builder(this, block, inst).exec_all(); + fs_reg header = bld.exec_all().group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); + + ubld.group(8, 0).MOV(header, + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + ubld.group(1, 0).MOV(component(header, 2), + brw_imm_ud(offset_B.ud / 16)); + + inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE; + inst->opcode = SHADER_OPCODE_SEND; + inst->header_size = 1; + inst->mlen = 1; + + uint32_t desc = + brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */, + size_B.ud / 4, false /* write */); + + inst->resize_sources(4); + + setup_surface_descriptors(ubld, inst, desc, + inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE], + fs_reg() /* surface_handle */); + + inst->src[2] = header; + inst->src[3] = fs_reg(); /* unused for reads */ + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + } else { + /* Before register allocation, we didn't tell the scheduler about the + * MRF we use. We know it's safe to use this MRF because nothing + * else does except for register spill/unspill, which generates and + * uses its MRF within a single IR instruction. + */ + inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; + inst->mlen = 1; + } + } +} diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 3286e3f83b9..26e30a63534 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -326,7 +326,6 @@ schedule_node::set_latency_gfx7(bool is_haswell) case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: case VS_OPCODE_PULL_CONSTANT_LOAD: /* testing using varying-index pull constants: * @@ -399,6 +398,11 @@ schedule_node::set_latency_gfx7(bool is_haswell) break; } + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */ + latency = 200; + break; + case GFX6_SFID_DATAPORT_RENDER_CACHE: switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) { case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE: diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index d4839cb22c5..8bead7fc84f 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -430,8 +430,6 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op) case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: return "uniform_pull_const"; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - return "uniform_pull_const_gfx7"; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: return "varying_pull_const_gfx4"; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: