From 339630ab05abcaf11a9f67b2dd42ef793d2f689a Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 13 Jun 2024 20:51:24 +0300 Subject: [PATCH] brw: enable A64 loads source rematerialization Allows to avoid Wa_1407528679 on A64 loads Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 138 +++++++++++++++++++++++++----- 1 file changed, 118 insertions(+), 20 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index d36f86fc6c3..5f0753c75de 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -57,7 +57,7 @@ struct nir_to_brw_state { fs_reg *ssa_values; fs_inst **resource_insts; struct brw_fs_bind_info *ssa_bind_infos; - fs_reg *resource_values; + fs_reg *uniform_values; fs_reg *system_values; }; @@ -395,7 +395,7 @@ fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl) ntb.ssa_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); ntb.resource_insts = rzalloc_array(ntb.mem_ctx, fs_inst *, impl->ssa_alloc); ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc); - ntb.resource_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); + ntb.uniform_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); fs_nir_emit_cf_list(ntb, &impl->body); } @@ -1863,7 +1863,7 @@ get_resource_nir_src(nir_to_brw_state &ntb, const nir_src &src) { if (!is_resource_src(src)) return fs_reg(); - return ntb.resource_values[src.ssa->index]; + return ntb.uniform_values[src.ssa->index]; } static fs_reg @@ -4578,7 +4578,8 @@ add_rebuild_src(nir_src *src, void *state) } static fs_reg -try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def *resource_def) +try_rebuild_source(nir_to_brw_state &ntb, const brw::fs_builder &bld, + nir_def *resource_def, bool a64 = false) { /* Create a build at the location of the resource_intel intrinsic */ fs_builder ubld8 = bld.exec_all().group(8, 0); @@ -4605,11 +4606,32 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def unsigned base_offset = nir_intrinsic_base(intrin); unsigned load_offset = nir_src_as_uint(intrin->src[0]); fs_reg src(UNIFORM, base_offset / 4, - brw_type_with_size(BRW_TYPE_UD, intrin->def.bit_size)); + brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size)); src.offset = load_offset + base_offset % 4; return src; } + case nir_intrinsic_load_mesh_inline_data_intel: { + assert(ntb.s.stage == MESA_SHADER_MESH || + ntb.s.stage == MESA_SHADER_TASK); + const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload(); + fs_reg data = offset(payload.inline_parameter, 1, + nir_intrinsic_align_offset(intrin)); + return retype(data, brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size)); + } + + case nir_intrinsic_load_btd_local_arg_addr_intel: { + assert(brw_shader_stage_is_bindless(ntb.s.stage)); + const bs_thread_payload &payload = ntb.s.bs_payload(); + return retype(payload.local_arg_ptr, BRW_TYPE_Q); + } + + case nir_intrinsic_load_btd_global_arg_addr_intel: { + assert(brw_shader_stage_is_bindless(ntb.s.stage)); + const bs_thread_payload &payload = ntb.s.bs_payload(); + return retype(payload.global_arg_ptr, BRW_TYPE_Q); + } + default: /* Execute the code below, since we have to generate new * instructions. @@ -4620,7 +4642,7 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def } #if 0 - fprintf(stderr, "Trying remat : "); + fprintf(stderr, "Trying remat :\n"); for (unsigned i = 0; i < resources.array.size(); i++) { fprintf(stderr, " "); nir_print_instr(resources.array[i]->parent_instr, stderr); @@ -4636,7 +4658,7 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def case nir_instr_type_load_const: { nir_load_const_instr *load_const = nir_instr_as_load_const(instr); - ubld8.MOV(brw_imm_ud(load_const->value[0].i32), + ubld8.MOV(brw_imm_d(load_const->value[0].i32), &ntb.resource_insts[def->index]); break; } @@ -4680,11 +4702,45 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def &ntb.resource_insts[def->index]); break; } + case nir_op_iand: + ubld8.AND(srcs[0], srcs[1], &ntb.resource_insts[def->index]); + break; case nir_op_ishl: ubld8.SHL(srcs[0], srcs[1], &ntb.resource_insts[def->index]); break; case nir_op_mov: break; + case nir_op_ult32: { + if (brw_type_size_bits(srcs[0].type) != 32) + break; + fs_reg dst = ubld8.vgrf(srcs[0].type); + enum brw_reg_type utype = + brw_type_with_size(srcs[0].type, + brw_type_size_bits(srcs[0].type)); + ntb.resource_insts[def->index] = + ubld8.CMP(dst, + retype(srcs[0], utype), + retype(srcs[1], utype), + brw_cmod_for_nir_comparison(alu->op)); + break; + } + case nir_op_b2i32: + ubld8.MOV(negate(retype(srcs[0], BRW_TYPE_D)), + &ntb.resource_insts[def->index]); + break; + case nir_op_unpack_64_2x32_split_x: + ubld8.MOV(subscript(srcs[0], BRW_TYPE_D, 0), + &ntb.resource_insts[def->index]); + break; + case nir_op_unpack_64_2x32_split_y: + ubld8.MOV(subscript(srcs[0], BRW_TYPE_D, 1), + &ntb.resource_insts[def->index]); + break; + case nir_op_pack_64_2x32_split: { + fs_reg dst = ubld8.vgrf(BRW_TYPE_Q); + ntb.resource_insts[def->index] = + ubld8.emit(FS_OPCODE_PACK, dst, srcs[0], srcs[1]); + } default: break; } @@ -4712,9 +4768,37 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def break; } + case nir_intrinsic_load_mesh_inline_data_intel: { + assert(ntb.s.stage == MESA_SHADER_MESH || + ntb.s.stage == MESA_SHADER_TASK); + const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload(); + fs_reg data = retype( + offset(payload.inline_parameter, 1, + nir_intrinsic_align_offset(intrin)), + brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size)); + ubld8.MOV(data, &ntb.resource_insts[def->index]); + break; + } + + case nir_intrinsic_load_btd_local_arg_addr_intel: { + assert(brw_shader_stage_is_bindless(ntb.s.stage)); + const bs_thread_payload &payload = ntb.s.bs_payload(); + ubld8.MOV(retype(payload.local_arg_ptr, BRW_TYPE_Q), + &ntb.resource_insts[def->index]); + break; + } + + case nir_intrinsic_load_btd_global_arg_addr_intel: { + assert(brw_shader_stage_is_bindless(ntb.s.stage)); + const bs_thread_payload &payload = ntb.s.bs_payload(); + ubld8.MOV(retype(payload.global_arg_ptr, BRW_TYPE_Q), + &ntb.resource_insts[def->index]); + break; + } + case nir_intrinsic_load_reloc_const_intel: { uint32_t id = nir_intrinsic_param_idx(intrin); - fs_reg dst = ubld8.vgrf(BRW_TYPE_UD); + fs_reg dst = ubld8.vgrf(BRW_TYPE_D); ntb.resource_insts[def->index] = ubld8.emit(SHADER_OPCODE_MOV_RELOC_IMM, dst, brw_imm_ud(id), brw_imm_ud(0)); @@ -4749,7 +4833,8 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def if (ntb.resource_insts[def->index] == NULL) { #if 0 - fprintf(stderr, "Tried remat : "); + if (a64) { + fprintf(stderr, "Tried remat :\n"); for (unsigned i = 0; i < resources.array.size(); i++) { fprintf(stderr, " "); nir_print_instr(resources.array[i]->parent_instr, stderr); @@ -4758,6 +4843,7 @@ try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def fprintf(stderr, "failed at! : "); nir_print_instr(instr, stderr); fprintf(stderr, "\n"); + } #endif return fs_reg(); } @@ -4907,8 +4993,10 @@ choose_oword_block_size_dwords(const struct intel_device_info *devinfo, } static fs_reg -increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v) +increment_a64_address(const fs_builder &_bld, fs_reg address, uint32_t v, bool use_no_mask) { + const fs_builder bld = use_no_mask ? _bld.exec_all().group(8, 0) : _bld; + if (bld.shader->devinfo->has_64bit_int) { struct brw_reg imm = brw_imm_reg(address.type); imm.u64 = v; @@ -5724,10 +5812,10 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb, if (nir_intrinsic_resource_access_intel(instr) & nir_resource_intel_non_uniform) { - ntb.resource_values[instr->def.index] = fs_reg(); + ntb.uniform_values[instr->def.index] = fs_reg(); } else { - ntb.resource_values[instr->def.index] = - try_rebuild_resource(ntb, bld, instr->src[1].ssa); + ntb.uniform_values[instr->def.index] = + try_rebuild_source(ntb, bld, instr->src[1].ssa); } ntb.ssa_values[instr->def.index] = ntb.ssa_values[instr->src[1].ssa->index]; @@ -6529,9 +6617,16 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder ubld8 = bld.exec_all().group(8, 0); const fs_builder ubld16 = bld.exec_all().group(16, 0); + ntb.uniform_values[instr->src[0].ssa->index] = + try_rebuild_source(ntb, bld, instr->src[0].ssa, true); + bool no_mask = ntb.uniform_values[instr->src[0].ssa->index].file != BAD_FILE; + fs_reg address = + ntb.uniform_values[instr->src[0].ssa->index].file != BAD_FILE ? + ntb.uniform_values[instr->src[0].ssa->index] : + bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + const fs_reg packed_consts = ubld1.vgrf(BRW_TYPE_UD, total_dwords); - fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); while (loaded_dwords < total_dwords) { const unsigned block = @@ -6546,12 +6641,15 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb, srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); - ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, - retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_TYPE_UD), - srcs, A64_LOGICAL_NUM_SRCS)->size_written = + fs_inst *inst = + ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_TYPE_UD), + srcs, A64_LOGICAL_NUM_SRCS); + inst->size_written = align(block_bytes, REG_SIZE * reg_unit(devinfo)); + inst->has_no_mask_send_params = no_mask; - address = increment_a64_address(ubld1, address, block_bytes); + address = increment_a64_address(ubld1, address, block_bytes, no_mask); loaded_dwords += block; } @@ -7397,7 +7495,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb, retype(byte_offset(dest, loaded * 4), BRW_TYPE_UD), srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; - address = increment_a64_address(ubld1, address, block_bytes); + address = increment_a64_address(ubld1, address, block_bytes, false); loaded += block; } @@ -7434,7 +7532,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb, srcs, A64_LOGICAL_NUM_SRCS); const unsigned block_bytes = block * 4; - address = increment_a64_address(ubld1, address, block_bytes); + address = increment_a64_address(ubld1, address, block_bytes, false); written += block; }