diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index ed4b497f492..8ac46a36ef2 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -1311,6 +1311,14 @@ enum brw_message_target { #define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 #define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 #define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 +#define GEN12_DATAPORT_OWORD_BLOCK_16_OWORDS 5 +#define BRW_DATAPORT_OWORD_BLOCK_OWORDS(n) \ + ((n) == 1 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ + (n) == 2 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ + (n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \ + (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \ + (n) == 16 ? GEN12_DATAPORT_OWORD_BLOCK_16_OWORDS : \ + (abort(), ~0)) #define BRW_DATAPORT_OWORD_BLOCK_DWORDS(n) \ ((n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 5a7679ee5e1..46354cae314 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1621,6 +1621,74 @@ fs_visitor::assign_curb_setup() uint64_t used = 0; + if (stage == MESA_SHADER_COMPUTE && + (devinfo->gen > 12 || gen_device_info_is_12hp(devinfo))) { + fs_builder ubld = bld.exec_all().group(8, 0).at( + cfg->first_block(), cfg->first_block()->start()); + + /* The base address for our push data is passed in as R0.0[31:6]. We + * have to mask off the bottom 6 bits. + */ + fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.group(1, 0).AND(base_addr, + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xffffffc0)); + + fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(header0, brw_imm_ud(0)); + ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4)); + + /* On Gen12-HP we load constants at the start of the program using A32 + * stateless messages. + */ + for (unsigned i = 0; i < uniform_push_length;) { + unsigned num_regs = MIN2(uniform_push_length - i, 8); + assert(num_regs > 0); + num_regs = 1 << util_logbase2(num_regs); + + fs_reg header; + if (i == 0) { + header = header0; + } else { + header = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(header, brw_imm_ud(0)); + ubld.group(1, 0).ADD(component(header, 2), + component(header0, 2), + brw_imm_ud(i * 2)); + } + + fs_reg srcs[4] = { + brw_imm_ud(0), /* desc */ + brw_imm_ud(0), /* ex_desc */ + header, /* payload */ + fs_reg(), /* payload2 */ + }; + + fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0), + BRW_REGISTER_TYPE_UD); + + /* This instruction has to be run SIMD16 if we're filling more than a + * single register. + */ + unsigned send_width = MIN2(16, num_regs * 8); + + fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND, + dest, srcs, 4); + send->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + send->desc = brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + GEN7_DATAPORT_DC_OWORD_BLOCK_READ, + BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2)); + send->header_size = 1; + send->mlen = 1; + send->size_written = num_regs * REG_SIZE; + send->send_is_volatile = true; + + i += num_regs; + } + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + /* Map the offsets in the UNIFORM file to fixed HW regs. */ foreach_block_and_inst(block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) {