diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index 9405a01b871..29e9b631585 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -138,6 +138,7 @@ struct isel_context { /* NIR range analysis. */ struct hash_table* range_ht; + struct hash_table* numlsb_ht; Temp arg_temps[AC_MAX_ARGS]; Operand workgroup_id[3]; diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index 7746b05f418..d57105cec31 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -375,6 +375,7 @@ init_context(isel_context* ctx, nir_shader* shader) /* Init NIR range analysis. */ ctx->range_ht = _mesa_pointer_hash_table_create(NULL); + ctx->numlsb_ht = _mesa_pointer_hash_table_create(NULL); uint32_t options = shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs; @@ -728,6 +729,7 @@ init_context(isel_context* ctx, nir_shader* shader) void cleanup_context(isel_context* ctx) { + _mesa_hash_table_destroy(ctx->numlsb_ht, NULL); _mesa_hash_table_destroy(ctx->range_ht, NULL); } diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 1d3c6f9b9ec..01bc8ff9774 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -9,6 +9,8 @@ #include "aco_instruction_selection.h" #include "aco_ir.h" +#include "nir_range_analysis.h" + #include "ac_descriptors.h" #include "ac_nir.h" #include "amdgfxregs.h" @@ -281,6 +283,7 @@ struct LoadEmitInfo { unsigned align_mul = 0; unsigned align_offset = 0; pipe_format format; + nir_src* resource_src = NULL; /* should be equal to resource or NULL */ nir_src* offset_src = NULL; /* should be equal to offset or NULL */ isel_context* ctx; @@ -350,6 +353,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, if (new_info.resource.id() && new_info.resource.size() == 2 && add_might_overflow(ctx, info.offset_src, to_add)) { new_info.resource = add64_32(bld, new_info.resource, Operand::c32(to_add)); + new_info.resource_src = NULL; offset_changed = false; } else if (offset.isConstant()) { offset = Operand::c32(offset.constantValue() + to_add); @@ -502,6 +506,12 @@ get_smem_opcode(amd_gfx_level level, unsigned bytes, bool buffer, bool round_dow return {buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16, 64}; } +unsigned +src_has_req_lsb(isel_context* ctx, nir_src* src, unsigned req) +{ + return src && nir_def_num_lsb_zero(ctx->numlsb_ht, nir_get_scalar(src->ssa, 0)) >= req; +} + Temp smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed, unsigned align) { @@ -540,6 +550,30 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed */ RegClass rc(RegType::sgpr, DIV_ROUND_UP(util_next_power_of_two(bytes_needed), 4u)); + unsigned req_lsb_zero = bytes_needed == 1 ? 0 : (bytes_needed == 2 ? 1 : 2); + if (!buffer) { + /* We require each offset source and the final address to be aligned, so ensure at least + * two sources are aligned. The remaining one can then be assumed to be aligned, otherwise the + * final address is unaligned. */ + // TODO: lower in NIR + bool addr_aligned = src_has_req_lsb(info.ctx, info.resource_src, req_lsb_zero); + bool offset_aligned = + !offset.id() || src_has_req_lsb(info.ctx, info.offset_src, req_lsb_zero); + bool const_aligned = !const_offset || ffs(const_offset) > req_lsb_zero; + + if (!offset_aligned && (!addr_aligned || !const_aligned)) { + addr = add64_32(bld, addr, Operand(offset)); + offset = Temp(); + } + if (!const_aligned && (!addr_aligned || !offset_aligned)) { + addr = add64_32(bld, addr, Operand::c32(const_offset)); + const_offset = 0; + } + } else { + /* We assume the buffer resource is also aligned. */ + assert(!const_offset || ffs(const_offset) > req_lsb_zero); + } + bool soe = !buffer && offset.id() && const_offset && bld.program->gfx_level >= GFX9; aco_ptr load{create_instruction(op, Format::SMEM, 2 + soe, 1)}; if (buffer) { @@ -2334,7 +2368,8 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); info.sync = get_memory_sync_info(instr, storage_buffer, 0); - info.offset_src = &instr->src[1]; + info.resource_src = &instr->src[0]; + info.offset_src = offset.id() ? &instr->src[1] : NULL; info.cache = get_cache_flags(ctx, access, ac_access_type_load); info.disable_wqm = access & ACCESS_SKIP_HELPERS;