intel/fs: Make logical URB write instructions more like other logical instructions

The changes to fs_visitor::validate() helped track down a place where I
initially forgot to convert a message to the new sources layout.  This
had caused a different validation failure in
dEQP-GLES31.functional.tessellation.tesscoord.triangles_equal_spacing,
but this was not detected until after SENDs were lowered.

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 19951145 -> 19951133 (<.01%)
instructions in affected programs: 2429 -> 2417 (-0.49%)
helped: 8 / HURT: 0

total cycles in shared programs: 858904152 -> 858862331 (<.01%)
cycles in affected programs: 5702652 -> 5660831 (-0.73%)
helped: 2138 / HURT: 1255

Broadwell
total cycles in shared programs: 904869459 -> 904835501 (<.01%)
cycles in affected programs: 7686744 -> 7652786 (-0.44%)
helped: 2861 / HURT: 2050

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 141442369 -> 141442032 (-0.0%)
Instructions helped: 337

Cycles in all programs: 9099270231 -> 9099036492 (-0.0%)
Cycles helped: 40661
Cycles hurt: 28606

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17605>
This commit is contained in:
Ian Romanick 2022-07-12 15:32:01 -07:00 committed by Marge Bot
parent 5dab077824
commit 349a040f68
7 changed files with 174 additions and 119 deletions

View file

@ -950,6 +950,17 @@ enum rt_logical_srcs {
RT_LOGICAL_NUM_SRCS RT_LOGICAL_NUM_SRCS
}; };
/**
 * Source slots for logical URB instructions.
 *
 * Emitters fill an fs_reg array of URB_LOGICAL_NUM_SRCS entries indexed by
 * these values.  An optional slot whose file is BAD_FILE is treated as
 * absent when the message header size is computed.
 */
enum urb_logical_srcs {
/** URB handle. Always counted as one header register. */
URB_LOGICAL_SRC_HANDLE,
/** Per-slot offsets. BAD_FILE when the message has none. */
URB_LOGICAL_SRC_PER_SLOT_OFFSETS,
/** Channel mask. BAD_FILE when the message has none. */
URB_LOGICAL_SRC_CHANNEL_MASK,
/** Data to be written. BAD_FILE for reads. */
URB_LOGICAL_SRC_DATA,
/** Number of source slots; used to size the srcs[] arrays. */
URB_LOGICAL_NUM_SRCS
};
#ifdef __cplusplus #ifdef __cplusplus
/** /**
* Allow brw_urb_write_flags enums to be ORed together. * Allow brw_urb_write_flags enums to be ORed together.

View file

@ -863,6 +863,17 @@ fs_inst::components_read(unsigned i) const
return 1; return 1;
} }
case SHADER_OPCODE_URB_WRITE_LOGICAL:
case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
if (i == URB_LOGICAL_SRC_DATA)
return mlen - 1 -
unsigned(src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) -
unsigned(src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE);
else
return 1;
default: default:
return 1; return 1;
} }
@ -891,10 +902,6 @@ fs_inst::size_read(int arg) const
break; break;
case FS_OPCODE_FB_READ: case FS_OPCODE_FB_READ:
case SHADER_OPCODE_URB_WRITE_LOGICAL:
case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
case SHADER_OPCODE_URB_READ_LOGICAL: case SHADER_OPCODE_URB_READ_LOGICAL:
case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL: case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE: case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@ -1546,17 +1553,17 @@ fs_visitor::emit_gs_thread_end()
break; break;
} }
} }
fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, hdr); inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->mlen = 1; inst->mlen = 1;
} else { } else {
fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
sources[1] = this->final_gs_vertex_count; inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
abld.LOAD_PAYLOAD(payload, sources, 2, 2); srcs, ARRAY_SIZE(srcs));
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload);
inst->mlen = 2; inst->mlen = 2;
} }
inst->eot = true; inst->eot = true;
@ -6676,16 +6683,12 @@ fs_visitor::run_tcs()
} }
/* Emit EOT write; set TR DS Cache bit */ /* Emit EOT write; set TR DS Cache bit */
fs_reg srcs[3] = { fs_reg srcs[URB_LOGICAL_NUM_SRCS];
fs_reg(get_tcs_output_urb_handle()), srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
fs_reg(brw_imm_ud(WRITEMASK_X << 16)), srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
fs_reg(brw_imm_ud(0)), srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
};
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
bld.null_reg_ud(), payload); reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 3; inst->mlen = 3;
inst->eot = true; inst->eot = true;

View file

@ -2341,27 +2341,27 @@ fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
} }
/* Store the control data bits in the message payload and send it. */ /* Store the control data bits in the message payload and send it. */
unsigned mlen = 2; const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) +
if (channel_mask.file != BAD_FILE) unsigned(per_slot_offset.file != BAD_FILE);
mlen += 4; /* channel masks, plus 3 extra copies of the data */
if (per_slot_offset.file != BAD_FILE)
mlen++;
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); /* If there are channel masks, add 3 extra copies of the data. */
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
unsigned i = 0;
sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
if (per_slot_offset.file != BAD_FILE)
sources[i++] = per_slot_offset;
if (channel_mask.file != BAD_FILE)
sources[i++] = channel_mask;
while (i < mlen) {
sources[i++] = this->control_data_bits;
}
abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); fs_reg sources[4];
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
inst->mlen = mlen; for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
sources[i] = this->control_data_bits;
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = header_size + length;
/* We need to increment Global Offset by 256-bits to make room for /* We need to increment Global Offset by 256-bits to make room for
* Broadwell's extra "Vertex Count" payload at the beginning of the * Broadwell's extra "Vertex Count" payload at the beginning of the
* URB entry. Since this is an OWord message, Global Offset is counted * URB entry. Since this is an OWord message, Global Offset is counted
@ -3046,15 +3046,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
fs_reg indirect_offset = get_indirect_offset(instr); fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0]; unsigned imm_offset = instr->const_index[0];
unsigned mask = instr->const_index[1]; unsigned mask = instr->const_index[1];
unsigned header_regs = 0;
struct brw_reg output_handles = get_tcs_output_urb_handle();
fs_reg srcs[7];
srcs[header_regs++] = output_handles;
if (indirect_offset.file != BAD_FILE) {
srcs[header_regs++] = indirect_offset;
}
if (mask == 0) if (mask == 0)
break; break;
@ -3068,8 +3059,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
unsigned first_component = nir_intrinsic_component(instr); unsigned first_component = nir_intrinsic_component(instr);
mask = mask << first_component; mask = mask << first_component;
fs_reg mask_reg;
if (mask != WRITEMASK_XYZW) { if (mask != WRITEMASK_XYZW) {
srcs[header_regs++] = brw_imm_ud(mask << 16); mask_reg = brw_imm_ud(mask << 16);
opcode = indirect_offset.file != BAD_FILE ? opcode = indirect_offset.file != BAD_FILE ?
SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL : SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL :
SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL; SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL;
@ -3079,21 +3071,30 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
SHADER_OPCODE_URB_WRITE_LOGICAL; SHADER_OPCODE_URB_WRITE_LOGICAL;
} }
fs_reg sources[4];
for (unsigned i = 0; i < num_components; i++) { for (unsigned i = 0; i < num_components; i++) {
if (!(mask & (1 << (i + first_component)))) if (!(mask & (1 << (i + first_component))))
continue; continue;
srcs[header_regs + i + first_component] = offset(value, bld, i); sources[i + first_component] = offset(value, bld, i);
} }
unsigned mlen = header_regs + num_components + first_component; unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) +
fs_reg payload = unsigned(mask != WRITEMASK_XYZW);
bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); const unsigned length = num_components + first_component;
bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
fs_inst *inst = bld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
inst->offset = imm_offset; inst->offset = imm_offset;
inst->mlen = mlen; inst->mlen = header_size + length;
break; break;
} }

View file

@ -43,6 +43,20 @@ fs_visitor::validate()
{ {
#ifndef NDEBUG #ifndef NDEBUG
foreach_block_and_inst (block, fs_inst, inst, cfg) { foreach_block_and_inst (block, fs_inst, inst, cfg) {
if (inst->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
const unsigned header_size = 1 +
unsigned(inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) +
unsigned(inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE);
unsigned data_size = 0;
for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++) {
fsv_assert(type_sz(offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j).type) == 4);
data_size++;
}
fsv_assert(header_size + data_size == inst->mlen);
}
if (inst->dst.file == VGRF) { if (inst->dst.file == VGRF) {
fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <= fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
alloc.sizes[inst->dst.nr]); alloc.sizes[inst->dst.nr]);

View file

@ -935,22 +935,15 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
if (length == 8 || (length > 0 && slot == last_slot)) if (length == 8 || (length > 0 && slot == last_slot))
flush = true; flush = true;
if (flush) { if (flush) {
fs_reg *payload_sources = fs_reg srcs[URB_LOGICAL_NUM_SRCS];
ralloc_array(mem_ctx, fs_reg, length + header_size);
fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
BRW_REGISTER_TYPE_F);
payload_sources[0] = urb_handle;
if (opcode == SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL) srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
payload_sources[1] = per_slot_offsets; srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
memcpy(&payload_sources[header_size], sources, fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
length * sizeof sources[0]);
abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
header_size);
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
/* For ICL WA 1805992985 one needs additional write in the end. */ /* For ICL WA 1805992985 one needs additional write in the end. */
if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
@ -985,10 +978,17 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
if (stage == MESA_SHADER_GEOMETRY) if (stage == MESA_SHADER_GEOMETRY)
return; return;
fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
bld.exec_all().MOV(payload, urb_handle); fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload); bld.exec_all().MOV(uniform_urb_handle, urb_handle);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_DATA] = payload;
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->eot = true; inst->eot = true;
inst->mlen = 2; inst->mlen = 2;
inst->offset = 1; inst->offset = 1;
@ -1002,14 +1002,16 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
* all 8 lanes must be valid. * all 8 lanes must be valid.
*/ */
if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) { if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD); fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD);
/* Workaround requires all 8 channels (lanes) to be valid. This is /* Workaround requires all 8 channels (lanes) to be valid. This is
* understood to mean they all need to be alive. First trick is to find * understood to mean they all need to be alive. First trick is to find
* a live channel and copy its urb handle for all the other channels to * a live channel and copy its urb handle for all the other channels to
* make sure all handles are valid. * make sure all handles are valid.
*/ */
bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle)); bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
/* Second trick is to use masked URB write where one can tell the HW to /* Second trick is to use masked URB write where one can tell the HW to
* actually write data only for selected channels even though all are * actually write data only for selected channels even though all are
@ -1025,14 +1027,19 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
* 4 slots data. All are explicitly zeros in order to keep the MBZ * 4 slots data. All are explicitly zeros in order to keep the MBZ
* area written as zeros. * area written as zeros.
*/ */
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u)); bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u)); bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u)); bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u)); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
srcs[URB_LOGICAL_SRC_DATA] = payload;
fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload); reg_undef, srcs, ARRAY_SIZE(srcs));
inst->eot = true; inst->eot = true;
inst->mlen = 6; inst->mlen = 6;
inst->offset = 0; inst->offset = 0;

View file

@ -73,8 +73,27 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
assert(inst->header_size == 0); assert(inst->header_size == 0);
fs_reg *payload_sources = new fs_reg[inst->mlen];
fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
BRW_REGISTER_TYPE_F);
unsigned header_size = 0;
payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
if (per_slot_present)
payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
if (channel_mask_present)
payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);
delete [] payload_sources;
inst->opcode = SHADER_OPCODE_SEND; inst->opcode = SHADER_OPCODE_SEND;
inst->header_size = 1; inst->header_size = header_size;
inst->dst = brw_null_reg(); inst->dst = brw_null_reg();
inst->sfid = BRW_SFID_URB; inst->sfid = BRW_SFID_URB;
@ -88,13 +107,11 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
inst->ex_mlen = 0; inst->ex_mlen = 0;
inst->send_has_side_effects = true; inst->send_has_side_effects = true;
fs_reg tmp = inst->src[0];
inst->resize_sources(4); inst->resize_sources(4);
inst->src[0] = brw_imm_ud(0); /* desc */ inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = brw_imm_ud(0); /* ex_desc */ inst->src[1] = brw_imm_ud(0); /* ex_desc */
inst->src[2] = tmp; inst->src[2] = payload;
inst->src[3] = brw_null_reg(); inst->src[3] = brw_null_reg();
} }

View file

@ -892,25 +892,25 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
fs_builder bld8 = bld.group(8, q); fs_builder bld8 = bld.group(8, q);
fs_reg payload_srcs[6]; fs_reg payload_srcs[4];
unsigned p = 0; unsigned length = 0;
payload_srcs[p++] = urb_handle;
payload_srcs[p++] = brw_imm_ud(first_mask << 16);
const unsigned header_size = p;
for (unsigned i = 0; i < comp_shift; i++) for (unsigned i = 0; i < comp_shift; i++)
payload_srcs[p++] = reg_undef; payload_srcs[length++] = reg_undef;
for (unsigned c = 0; c < first_comps; c++) for (unsigned c = 0; c < first_comps; c++)
payload_srcs[p++] = quarter(offset(src, bld, c), q); payload_srcs[length++] = quarter(offset(src, bld, c), q);
fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size); srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16);
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload); reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = p; inst->mlen = 2 + length;
inst->offset = urb_global_offset; inst->offset = urb_global_offset;
assert(inst->offset < 2048); assert(inst->offset < 2048);
} }
@ -923,22 +923,22 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
fs_builder bld8 = bld.group(8, q); fs_builder bld8 = bld.group(8, q);
fs_reg payload_srcs[6]; fs_reg payload_srcs[4];
unsigned p = 0; unsigned length = 0;
payload_srcs[p++] = urb_handle;
payload_srcs[p++] = brw_imm_ud(second_mask << 16);
const unsigned header_size = p;
for (unsigned c = 0; c < second_comps; c++) for (unsigned c = 0; c < second_comps; c++)
payload_srcs[p++] = quarter(offset(src, bld, c + first_comps), q); payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q);
fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size); srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16);
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload); reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = p; inst->mlen = 2 + length;
inst->offset = urb_global_offset; inst->offset = urb_global_offset;
assert(inst->offset < 2048); assert(inst->offset < 2048);
} }
@ -988,21 +988,23 @@ emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
bld8.SHR(off, off, brw_imm_ud(2)); bld8.SHR(off, off, brw_imm_ud(2));
fs_reg payload_srcs[7]; fs_reg payload_srcs[4];
int x = 0; unsigned length = 0;
payload_srcs[x++] = urb_handle;
payload_srcs[x++] = off;
payload_srcs[x++] = mask;
for (unsigned j = 0; j < 4; j++) for (unsigned j = 0; j < 4; j++)
payload_srcs[x++] = quarter(src_comp, q); payload_srcs[length++] = quarter(src_comp, q);
fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, x); fs_reg srcs[URB_LOGICAL_NUM_SRCS];
bld8.LOAD_PAYLOAD(payload, payload_srcs, x, 3); srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL, fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload); reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = x; inst->mlen = 3 + length;
inst->offset = 0; inst->offset = 0;
} }
} }