brw: enable A64 loads source rematerialization

Allows to avoid Wa_1407528679 on A64 loads

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29663>
This commit is contained in:
Lionel Landwerlin 2024-06-13 20:51:24 +03:00 committed by Marge Bot
parent f482fc33cf
commit 339630ab05

View file

@ -57,7 +57,7 @@ struct nir_to_brw_state {
fs_reg *ssa_values;
fs_inst **resource_insts;
struct brw_fs_bind_info *ssa_bind_infos;
fs_reg *resource_values;
fs_reg *uniform_values;
fs_reg *system_values;
};
@ -395,7 +395,7 @@ fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl)
ntb.ssa_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc);
ntb.resource_insts = rzalloc_array(ntb.mem_ctx, fs_inst *, impl->ssa_alloc);
ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc);
ntb.resource_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc);
ntb.uniform_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc);
fs_nir_emit_cf_list(ntb, &impl->body);
}
@ -1863,7 +1863,7 @@ get_resource_nir_src(nir_to_brw_state &ntb, const nir_src &src)
{
if (!is_resource_src(src))
return fs_reg();
return ntb.resource_values[src.ssa->index];
return ntb.uniform_values[src.ssa->index];
}
static fs_reg
@ -4578,7 +4578,8 @@ add_rebuild_src(nir_src *src, void *state)
}
static fs_reg
try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def *resource_def)
try_rebuild_source(nir_to_brw_state &ntb, const brw::fs_builder &bld,
nir_def *resource_def, bool a64 = false)
{
/* Create a build at the location of the resource_intel intrinsic */
fs_builder ubld8 = bld.exec_all().group(8, 0);
@ -4605,11 +4606,32 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
unsigned base_offset = nir_intrinsic_base(intrin);
unsigned load_offset = nir_src_as_uint(intrin->src[0]);
fs_reg src(UNIFORM, base_offset / 4,
brw_type_with_size(BRW_TYPE_UD, intrin->def.bit_size));
brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size));
src.offset = load_offset + base_offset % 4;
return src;
}
case nir_intrinsic_load_mesh_inline_data_intel: {
assert(ntb.s.stage == MESA_SHADER_MESH ||
ntb.s.stage == MESA_SHADER_TASK);
const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload();
fs_reg data = offset(payload.inline_parameter, 1,
nir_intrinsic_align_offset(intrin));
return retype(data, brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size));
}
case nir_intrinsic_load_btd_local_arg_addr_intel: {
assert(brw_shader_stage_is_bindless(ntb.s.stage));
const bs_thread_payload &payload = ntb.s.bs_payload();
return retype(payload.local_arg_ptr, BRW_TYPE_Q);
}
case nir_intrinsic_load_btd_global_arg_addr_intel: {
assert(brw_shader_stage_is_bindless(ntb.s.stage));
const bs_thread_payload &payload = ntb.s.bs_payload();
return retype(payload.global_arg_ptr, BRW_TYPE_Q);
}
default:
/* Execute the code below, since we have to generate new
* instructions.
@ -4620,7 +4642,7 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
}
#if 0
fprintf(stderr, "Trying remat : ");
fprintf(stderr, "Trying remat :\n");
for (unsigned i = 0; i < resources.array.size(); i++) {
fprintf(stderr, " ");
nir_print_instr(resources.array[i]->parent_instr, stderr);
@ -4636,7 +4658,7 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
case nir_instr_type_load_const: {
nir_load_const_instr *load_const =
nir_instr_as_load_const(instr);
ubld8.MOV(brw_imm_ud(load_const->value[0].i32),
ubld8.MOV(brw_imm_d(load_const->value[0].i32),
&ntb.resource_insts[def->index]);
break;
}
@ -4680,11 +4702,45 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
&ntb.resource_insts[def->index]);
break;
}
case nir_op_iand:
ubld8.AND(srcs[0], srcs[1], &ntb.resource_insts[def->index]);
break;
case nir_op_ishl:
ubld8.SHL(srcs[0], srcs[1], &ntb.resource_insts[def->index]);
break;
case nir_op_mov:
break;
case nir_op_ult32: {
if (brw_type_size_bits(srcs[0].type) != 32)
break;
fs_reg dst = ubld8.vgrf(srcs[0].type);
enum brw_reg_type utype =
brw_type_with_size(srcs[0].type,
brw_type_size_bits(srcs[0].type));
ntb.resource_insts[def->index] =
ubld8.CMP(dst,
retype(srcs[0], utype),
retype(srcs[1], utype),
brw_cmod_for_nir_comparison(alu->op));
break;
}
case nir_op_b2i32:
ubld8.MOV(negate(retype(srcs[0], BRW_TYPE_D)),
&ntb.resource_insts[def->index]);
break;
case nir_op_unpack_64_2x32_split_x:
ubld8.MOV(subscript(srcs[0], BRW_TYPE_D, 0),
&ntb.resource_insts[def->index]);
break;
case nir_op_unpack_64_2x32_split_y:
ubld8.MOV(subscript(srcs[0], BRW_TYPE_D, 1),
&ntb.resource_insts[def->index]);
break;
case nir_op_pack_64_2x32_split: {
fs_reg dst = ubld8.vgrf(BRW_TYPE_Q);
ntb.resource_insts[def->index] =
ubld8.emit(FS_OPCODE_PACK, dst, srcs[0], srcs[1]);
}
default:
break;
}
@ -4712,9 +4768,37 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
break;
}
case nir_intrinsic_load_mesh_inline_data_intel: {
assert(ntb.s.stage == MESA_SHADER_MESH ||
ntb.s.stage == MESA_SHADER_TASK);
const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload();
fs_reg data = retype(
offset(payload.inline_parameter, 1,
nir_intrinsic_align_offset(intrin)),
brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size));
ubld8.MOV(data, &ntb.resource_insts[def->index]);
break;
}
case nir_intrinsic_load_btd_local_arg_addr_intel: {
assert(brw_shader_stage_is_bindless(ntb.s.stage));
const bs_thread_payload &payload = ntb.s.bs_payload();
ubld8.MOV(retype(payload.local_arg_ptr, BRW_TYPE_Q),
&ntb.resource_insts[def->index]);
break;
}
case nir_intrinsic_load_btd_global_arg_addr_intel: {
assert(brw_shader_stage_is_bindless(ntb.s.stage));
const bs_thread_payload &payload = ntb.s.bs_payload();
ubld8.MOV(retype(payload.global_arg_ptr, BRW_TYPE_Q),
&ntb.resource_insts[def->index]);
break;
}
case nir_intrinsic_load_reloc_const_intel: {
uint32_t id = nir_intrinsic_param_idx(intrin);
fs_reg dst = ubld8.vgrf(BRW_TYPE_UD);
fs_reg dst = ubld8.vgrf(BRW_TYPE_D);
ntb.resource_insts[def->index] =
ubld8.emit(SHADER_OPCODE_MOV_RELOC_IMM, dst,
brw_imm_ud(id), brw_imm_ud(0));
@ -4749,7 +4833,8 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
if (ntb.resource_insts[def->index] == NULL) {
#if 0
fprintf(stderr, "Tried remat : ");
if (a64) {
fprintf(stderr, "Tried remat :\n");
for (unsigned i = 0; i < resources.array.size(); i++) {
fprintf(stderr, " ");
nir_print_instr(resources.array[i]->parent_instr, stderr);
@ -4758,6 +4843,7 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def
fprintf(stderr, "failed at! : ");
nir_print_instr(instr, stderr);
fprintf(stderr, "\n");
}
#endif
return fs_reg();
}
@ -4907,8 +4993,10 @@ choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
}
static fs_reg
increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)
increment_a64_address(const fs_builder &_bld, fs_reg address, uint32_t v, bool use_no_mask)
{
const fs_builder bld = use_no_mask ? _bld.exec_all().group(8, 0) : _bld;
if (bld.shader->devinfo->has_64bit_int) {
struct brw_reg imm = brw_imm_reg(address.type);
imm.u64 = v;
@ -5724,10 +5812,10 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
if (nir_intrinsic_resource_access_intel(instr) &
nir_resource_intel_non_uniform) {
ntb.resource_values[instr->def.index] = fs_reg();
ntb.uniform_values[instr->def.index] = fs_reg();
} else {
ntb.resource_values[instr->def.index] =
try_rebuild_resource(ntb, bld, instr->src[1].ssa);
ntb.uniform_values[instr->def.index] =
try_rebuild_source(ntb, bld, instr->src[1].ssa);
}
ntb.ssa_values[instr->def.index] =
ntb.ssa_values[instr->src[1].ssa->index];
@ -6529,9 +6617,16 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
ntb.uniform_values[instr->src[0].ssa->index] =
try_rebuild_source(ntb, bld, instr->src[0].ssa, true);
bool no_mask = ntb.uniform_values[instr->src[0].ssa->index].file != BAD_FILE;
fs_reg address =
ntb.uniform_values[instr->src[0].ssa->index].file != BAD_FILE ?
ntb.uniform_values[instr->src[0].ssa->index] :
bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
const fs_reg packed_consts =
ubld1.vgrf(BRW_TYPE_UD, total_dwords);
fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
while (loaded_dwords < total_dwords) {
const unsigned block =
@ -6546,12 +6641,15 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_TYPE_UD),
srcs, A64_LOGICAL_NUM_SRCS)->size_written =
fs_inst *inst =
ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_TYPE_UD),
srcs, A64_LOGICAL_NUM_SRCS);
inst->size_written =
align(block_bytes, REG_SIZE * reg_unit(devinfo));
inst->has_no_mask_send_params = no_mask;
address = increment_a64_address(ubld1, address, block_bytes);
address = increment_a64_address(ubld1, address, block_bytes, no_mask);
loaded_dwords += block;
}
@ -7397,7 +7495,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
retype(byte_offset(dest, loaded * 4), BRW_TYPE_UD),
srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
address = increment_a64_address(ubld1, address, block_bytes);
address = increment_a64_address(ubld1, address, block_bytes, false);
loaded += block;
}
@ -7434,7 +7532,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
srcs, A64_LOGICAL_NUM_SRCS);
const unsigned block_bytes = block * 4;
address = increment_a64_address(ubld1, address, block_bytes);
address = increment_a64_address(ubld1, address, block_bytes, false);
written += block;
}