diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c index 87723678d60..375c51abf3c 100644 --- a/src/intel/compiler/brw_disasm.c +++ b/src/intel/compiler/brw_disasm.c @@ -443,6 +443,8 @@ static const char *const dp_dc1_msg_type_hsw[32] = { [GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read", [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read", [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP] = "DC A64 untyped atomic op", + [GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ] = "DC A64 oword block read", + [GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE] = "DC A64 oword block write", [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write", [GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write", [GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] = diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index 88cf9d1d674..cbb6e51694b 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -806,6 +806,27 @@ brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo, msg_type, msg_control); } +static inline uint32_t /* Builds the message descriptor for an A64 OWORD block read or write: picks the read/write message type, packs the message control field and hands both to brw_dp_desc(). */ +brw_dp_a64_oword_block_rw_desc(const struct gen_device_info *devinfo, + bool align_16B, + unsigned num_dwords, + bool write) +{ + /* Writes can only have addresses aligned by OWORDs (16 Bytes). */ + assert(!write || align_16B); /* so align_16B == false is only accepted for reads */ + + unsigned msg_type = + write ? GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE : + GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ; + + unsigned msg_control = + SET_BITS(!align_16B, 4, 3) | /* non-zero only for the unaligned variant; presumably lands in bits 4:3 (SET_BITS is defined elsewhere -- confirm) */ + SET_BITS(BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords), 2, 0); /* block-size encoding derived from num_dwords; presumably bits 2:0 -- confirm */ + + return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + /** * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the * Skylake PRM). 
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index b4128d2bf52..986b876c251 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -427,6 +427,9 @@ enum opcode { SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL, SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL, @@ -1410,6 +1413,8 @@ enum brw_message_target { #define GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10 #define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11 #define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP 0x12 +#define GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ 0x14 +#define GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE 0x15 #define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19 #define GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a #define GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 06a3dbe0890..42f20695433 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -824,9 +824,21 @@ fs_inst::components_read(unsigned i) const return 1; case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: assert(src[2].file == IMM); return 1; + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + assert(src[2].file == IMM); + if (i == 1) { /* data to write */ + const unsigned comps = src[2].ud / exec_size; /* per-channel component count; presumably src[2] is the total dword count that lowering passes as num_dwords -- confirm */ + assert(comps > 0); + return comps; + } else { + return 1; + } + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: assert(src[2].file == IMM); return i == 1 ? 
src[2].ud : 1; @@ -5626,6 +5638,23 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->resize_sources(4); } +static fs_reg /* Builds the header for A64 OWORD block messages: an 8-channel UD VGRF, zeroed, with the 64-bit address copied into its first two channels. */ +emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr) +{ + const fs_builder ubld = bld.exec_all().group(8, 0); + fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(header, brw_imm_ud(0)); /* clear every channel of the header first */ + + /* Use a 2-wide MOV to fill out the address */ + assert(type_sz(addr.type) == 8 && addr.stride == 0); + fs_reg addr_vec2 = addr; + addr_vec2.type = BRW_REGISTER_TYPE_UD; + addr_vec2.stride = 1; /* view the 8-byte, stride-0 address as two consecutive UD channels */ + ubld.group(2, 0).MOV(header, addr_vec2); + + return header; +} + static void lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5645,8 +5674,23 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) emit_predicate_on_sample_mask(bld, inst); fs_reg payload, payload2; - unsigned mlen, ex_mlen = 0; - if (devinfo->gen >= 9) { + unsigned mlen, ex_mlen = 0, header_size = 0; + if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL || + inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL || + inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) { + assert(devinfo->gen >= 9); + + /* OWORD messages only take a scalar address in a header */ + mlen = 1; + header_size = 1; + payload = emit_a64_oword_block_header(bld, addr); /* the header is the whole first payload (mlen = 1) */ + + if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) { + ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; + payload2 = retype(bld.move_to_vgrf(src, src_comps), + BRW_REGISTER_TYPE_UD); /* only writes get a second payload; it carries the data */ + } + } else if (devinfo->gen >= 9) { /* On Skylake and above, we have SENDS */ mlen = 2 * (inst->exec_size / 8); ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; @@ -5683,6 +5727,27 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) true /* write */); break; + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + desc = brw_dp_a64_oword_block_rw_desc(devinfo, + true, /* align_16B */ + arg, /* num_dwords 
*/ + false /* write */); + break; + + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + desc = brw_dp_a64_oword_block_rw_desc(devinfo, + false, /* align_16B */ + arg, /* num_dwords */ + false /* write */); + break; + + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + desc = brw_dp_a64_oword_block_rw_desc(devinfo, + true, /* align_16B */ + arg, /* num_dwords */ + true /* write */); + break; + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, arg, /* bit_size */ @@ -5722,7 +5787,7 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) inst->opcode = SHADER_OPCODE_SEND; inst->mlen = mlen; inst->ex_mlen = ex_mlen; - inst->header_size = 0; + inst->header_size = header_size; /* 1 for the OWORD block opcodes, still 0 for every other A64 message */ inst->send_has_side_effects = has_side_effects; inst->send_is_volatile = !has_side_effects; @@ -5956,6 +6021,9 @@ fs_visitor::lower_logical_sends() case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: @@ -6557,6 +6625,12 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: return devinfo->gen <= 8 ? 
8 : MIN2(16, inst->exec_size); + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + assert(inst->exec_size <= 16); + return inst->exec_size; /* returns the instruction's own width unchanged (asserted above to be at most 16) */ + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL: diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 380081c1b71..bed9e793d59 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -501,6 +501,8 @@ schedule_node::set_latency_gen7(bool is_haswell) case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE: case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ: + case GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ: + case GEN9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE: /* both new message types share the latency value assigned to the cases listed with them */ /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */ latency = 300; break; diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index d137c7dcf58..c6d997287c0 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -303,6 +303,12 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "untyped_surface_write_logical"; case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: return "a64_untyped_read_logical"; + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + return "a64_oword_block_read_logical"; + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + return "a64_unaligned_oword_block_read_logical"; + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + return "a64_oword_block_write_logical"; case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: return "a64_untyped_write_logical"; case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: @@ -1088,6 +1094,7 @@ backend_instruction::has_side_effects() const case 
SHADER_OPCODE_RND_MODE: case SHADER_OPCODE_FLOAT_CONTROL_MODE: case FS_OPCODE_SCHEDULING_FENCE: + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: /* joins the opcodes for which this function returns true */ return true; default: return eot;