diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c
index 0afe9d43b80..91ec355cbda 100644
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -1706,3 +1706,53 @@ ac_nir_varying_estimate_instr_cost(nir_instr *instr)
       unreachable("unexpected instr type");
    }
 }
+
+typedef struct {
+   enum amd_gfx_level gfx_level;
+   bool use_llvm;
+   bool after_lowering;
+} mem_access_cb_data;
+
+static bool
+use_smem_for_load(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data_)
+{
+   const mem_access_cb_data *cb_data = (mem_access_cb_data *)cb_data_;
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_load_global:
+   case nir_intrinsic_load_global_constant:
+   case nir_intrinsic_load_global_amd:
+   case nir_intrinsic_load_constant:
+      if (cb_data->use_llvm)
+         return false;
+      break;
+   case nir_intrinsic_load_ubo:
+      break;
+   default:
+      return false;
+   }
+
+   if (intrin->def.divergent || (cb_data->after_lowering && intrin->def.bit_size < 32))
+      return false;
+
+   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
+   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   bool reorder = nir_intrinsic_can_reorder(intrin) || ((access & ACCESS_NON_WRITEABLE) && !(access & ACCESS_VOLATILE));
+   if (!reorder || (glc && cb_data->gfx_level < GFX8))
+      return false;
+
+   nir_intrinsic_set_access(intrin, access | ACCESS_SMEM_AMD);
+   return true;
+}
+
+bool
+ac_nir_flag_smem_for_loads(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm, bool after_lowering)
+{
+   mem_access_cb_data cb_data = {
+      .gfx_level = gfx_level,
+      .use_llvm = use_llvm,
+      .after_lowering = after_lowering,
+   };
+   return nir_shader_intrinsics_pass(shader, &use_smem_for_load, nir_metadata_all, &cb_data);
+}
diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index 15d5555bef1..166da2f378a 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -323,6 +323,9 @@ ac_nir_varying_estimate_instr_cost(nir_instr *instr);
 bool
 ac_nir_opt_shared_append(nir_shader *shader);
 
+bool
+ac_nir_flag_smem_for_loads(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm, bool after_lowering);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index e614e0c53d1..9fcbd4cfe18 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -6020,13 +6020,10 @@ load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size,
 {
    Builder bld(ctx->program, ctx->block);
 
-   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
-
-   bool use_smem = dst.type() != RegType::vgpr && (ctx->options->gfx_level >= GFX8 || !glc) &&
-                   (access & ACCESS_CAN_REORDER);
-   if (use_smem)
+   bool use_smem = access & ACCESS_SMEM_AMD;
+   if (use_smem) {
       offset = bld.as_uniform(offset);
-   else {
+   } else {
       /* GFX6-7 are affected by a hw bug that prevents address clamping to
        * work correctly when the SGPR offset is used.
        */
@@ -6054,7 +6051,8 @@ visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
 
    unsigned size = instr->def.bit_size / 8;
    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
-               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
+               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr),
+               nir_intrinsic_access(instr) | ACCESS_CAN_REORDER);
 }
 
 void
@@ -6084,7 +6082,7 @@ visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
                           Operand::c32(desc[3]));
    unsigned size = instr->def.bit_size / 8;
    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, nir_intrinsic_align_mul(instr),
-               nir_intrinsic_align_offset(instr));
+               nir_intrinsic_align_offset(instr), nir_intrinsic_access(instr) | ACCESS_CAN_REORDER);
 }
 
 /* Packs multiple Temps of different sizes in to a vector of v1 Temps.
@@ -6921,23 +6919,17 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
                       num_components, component_size, align, false);
 
    unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD;
-   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
-
-   /* VMEM stores don't update the SMEM cache and it's difficult to prove that
-    * it's safe to use SMEM */
-   bool can_use_smem = (access & ACCESS_NON_WRITEABLE) && byte_align_for_smem;
-   if (info.dst.type() == RegType::vgpr || (ctx->options->gfx_level < GFX8 && glc) ||
-       !can_use_smem) {
-      EmitLoadParameters params = global_load_params;
-      params.byte_align_loads = byte_align_for_vmem;
-      info.cache = get_cache_flags(ctx, access);
-      emit_load(ctx, bld, info, params);
-   } else {
+   if ((access & ACCESS_SMEM_AMD) && byte_align_for_smem) {
       if (info.resource.id())
          info.resource = bld.as_uniform(info.resource);
       info.offset = Operand(bld.as_uniform(info.offset));
       info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_SMEM);
       emit_load(ctx, bld, info, smem_load_params);
+   } else {
+      EmitLoadParameters params = global_load_params;
+      params.byte_align_loads = byte_align_for_vmem;
+      info.cache = get_cache_flags(ctx, access);
+      emit_load(ctx, bld, info, params);
    }
 }
 
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 91cac51ebc4..f4632f3fbef 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -376,6 +376,7 @@ init_context(isel_context* ctx, nir_shader* shader)
    }
 
    apply_nuw_to_offsets(ctx, impl);
+   ac_nir_flag_smem_for_loads(shader, ctx->program->gfx_level, false, true);
 
    /* sanitize control flow */
    sanitize_cf_list(impl, &impl->body);
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 60ac99159ae..06fa6ecceef 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1186,7 +1186,7 @@ load("task_payload", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { offset }.
 load("push_constant", [1], [BASE, RANGE, ALIGN_MUL, ALIGN_OFFSET],
      [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { offset }.
-load("constant", [1], [BASE, RANGE, ALIGN_MUL, ALIGN_OFFSET],
+load("constant", [1], [BASE, RANGE, ACCESS, ALIGN_MUL, ALIGN_OFFSET],
      [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { address }.
 load("global", [1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index b1953994ce0..86453542905 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -810,6 +810,7 @@ print_access(enum gl_access_qualifier access, print_state *state, const char *se
       { ACCESS_CP_GE_COHERENT_AMD, "cp-ge-coherent-amd" },
       { ACCESS_IN_BOUNDS_AGX, "in-bounds-agx" },
       { ACCESS_KEEP_SCALAR, "keep-scalar" },
+      { ACCESS_SMEM_AMD, "smem-amd" },
    };
 
    bool first = true;
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index a1df5a5c96b..a9a38bf3149 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -1159,6 +1159,11 @@ enum gl_access_qualifier
     * shader where the API wants to copy all bytes that are resident.
     */
    ACCESS_KEEP_SCALAR = (1 << 15),
+
+   /**
+    * Indicates that this load will use SMEM.
+    */
+   ACCESS_SMEM_AMD = (1 << 16),
 };
 
 /**