aco: don't use SMEM for SSBO stores

fossil-db (Navi):
Totals from 70 (0.05% of 138791) affected shaders:
SGPRs: 2324 -> 2097 (-9.77%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 157872 -> 154836 (-1.92%); split: -1.93%, +0.01%
MaxWaves: 1288 -> 1260 (-2.17%)
Instrs: 29730 -> 29108 (-2.09%); split: -2.13%, +0.04%
Cycles: 394944 -> 391280 (-0.93%); split: -0.94%, +0.01%
VMEM: 5288 -> 5695 (+7.70%); split: +11.97%, -4.27%
SMEM: 2680 -> 2444 (-8.81%); split: +1.34%, -10.15%
VClause: 291 -> 502 (+72.51%)
SClause: 1176 -> 918 (-21.94%)
Copies: 3549 -> 3517 (-0.90%); split: -1.80%, +0.90%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1675 -> 1491 (-10.99%)
PreVGPRs: 1101 -> 1223 (+11.08%)

Totals from 70 (0.05% of 139517) affected shaders (RAVEN):
SGPRs: 2368 -> 2121 (-10.43%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 156664 -> 153252 (-2.18%)
MaxWaves: 636 -> 622 (-2.20%)
Instrs: 29968 -> 29226 (-2.48%)
Cycles: 398284 -> 393492 (-1.20%)
VMEM: 5544 -> 5930 (+6.96%); split: +11.72%, -4.76%
SMEM: 2752 -> 2502 (-9.08%); split: +1.20%, -10.28%
VClause: 292 -> 504 (+72.60%)
SClause: 1236 -> 940 (-23.95%)
Copies: 3907 -> 3852 (-1.41%); split: -2.20%, +0.79%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1671 -> 1487 (-11.01%)
PreVGPRs: 1102 -> 1225 (+11.16%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6143>
This commit is contained in:
Rhys Perry 2020-07-30 19:44:28 +01:00 committed by Marge Bot
parent 2410def98f
commit 867323379e
9 changed files with 27 additions and 143 deletions

View file

@@ -146,7 +146,7 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
return flat->disable_wqm;
} else {
return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
return instr->format == Format::EXP;
}
}
@@ -648,43 +648,6 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
return i;
}
/* Lowers the p_fs_buffer_store_smem pseudo-instruction into a real SMEM
 * s_buffer_store_dword{,x2,x4}. This runs after exec-mask handling so the
 * store is emitted in exact mode.
 *
 * If need_check is set, the previous exec mask might be empty; in that case
 * the store must become a no-op, which is done by selecting UINT32_MAX as the
 * buffer offset (see the inline comment below). */
void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
{
Operand offset = instr->operands[1];
if (need_check) {
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));
/* a literal can't be used directly as an s_cselect_b32 operand, so copy it
 * into an SGPR first */
if (offset.isLiteral())
offset = bld.copy(bld.def(s1), offset);
offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
offset, Operand(UINT32_MAX), bld.scc(nonempty));
} else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
/* constants above 0xFFFFF are materialized in an SGPR (hinted to m0) —
 * presumably they don't fit the SMEM immediate offset field; TODO confirm
 * against the ISA docs */
offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
}
/* non-constant offsets are supplied to the SMEM instruction through m0 */
if (!offset.isConstant())
offset.setFixed(m0);
/* pick the real opcode from the store data size in dwords */
switch (instr->operands[2].size()) {
case 1:
instr->opcode = aco_opcode::s_buffer_store_dword;
break;
case 2:
instr->opcode = aco_opcode::s_buffer_store_dwordx2;
break;
case 4:
instr->opcode = aco_opcode::s_buffer_store_dwordx4;
break;
default:
unreachable("Invalid SMEM buffer store size");
}
instr->operands[1] = offset;
/* as_uniform() needs to be done here so it's done in exact mode and helper
 * lanes don't contribute. */
instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
}
void process_instructions(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
@@ -831,10 +794,6 @@ void process_instructions(exec_ctx& ctx, Block* block,
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
} else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
bool need_check = ctx.info[block->index].exec.size() != 1 &&
!(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
}
bld.insert(std::move(instr));

View file

@@ -3801,24 +3801,21 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
}
aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
aco_opcode get_buffer_store_op(unsigned bytes)
{
switch (bytes) {
case 1:
assert(!smem);
return aco_opcode::buffer_store_byte;
case 2:
assert(!smem);
return aco_opcode::buffer_store_short;
case 4:
return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
return aco_opcode::buffer_store_dword;
case 8:
return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
return aco_opcode::buffer_store_dwordx2;
case 12:
assert(!smem);
return aco_opcode::buffer_store_dwordx3;
case 16:
return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
return aco_opcode::buffer_store_dwordx4;
}
unreachable("Unexpected store size");
return aco_opcode::num_opcodes;
@@ -3943,7 +3940,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
assert(vdata.size() >= 1 && vdata.size() <= 4);
Builder bld(ctx->program, ctx->block);
aco_opcode op = get_buffer_store_op(false, vdata.bytes());
aco_opcode op = get_buffer_store_op(vdata.bytes());
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
@@ -6198,70 +6195,29 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
* TODO: this optimization is disabled for now because we still need to ensure correct ordering
*/
bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
ctx->options->chip_class < GFX10_3 &&
(elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
allow_smem;
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
split_buffer_store(ctx, instr, false, RegType::vgpr,
data, writemask, 16, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
if (smem && ctx->stage == fragment_fs)
op = aco_opcode::p_fs_buffer_store_smem;
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
if (smem) {
aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
if (offsets[i]) {
Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
}
if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
store->operands[2] = Operand(write_datas[i]);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->sync = sync;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
if (op == aco_opcode::p_fs_buffer_store_smem) {
ctx->block->kind |= block_kind_needs_lowering;
ctx->program->needs_exact = true;
}
} else {
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
store->operands[0] = Operand(rsrc);
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
store->operands[0] = Operand(rsrc);
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
}
@@ -6469,7 +6425,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
} else {
assert(ctx->options->chip_class == GFX6);
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
@@ -6906,7 +6862,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
swizzle_component_size, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
}
@@ -11863,8 +11819,6 @@ void select_program(Program *program,
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);

View file

@@ -46,8 +46,6 @@ enum resource_flags {
has_nonglc_vmem_store = 0x8,
has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
buffer_is_restrict = 0x10,
};
@@ -200,20 +198,6 @@ inline uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def
return res;
}
/* Returns whether a sub-dword (8/16-bit component) SSBO store can be
 * implemented with SMEM. This requires that:
 *  - the write mask is a contiguous run of components starting at component 0
 *    (util_last_bit == util_bitcount) covering every component of the source,
 *  - the total number of bits written is a multiple of 32, and
 *  - the store is 4-byte aligned (both align_mul and align_offset). */
inline bool can_subdword_ssbo_store_use_smem(nir_intrinsic_instr *intrin)
{
unsigned wrmask = nir_intrinsic_write_mask(intrin);
if (util_last_bit(wrmask) != util_bitcount(wrmask) ||
util_bitcount(wrmask) * intrin->src[0].ssa->bit_size % 32 ||
util_bitcount(wrmask) != intrin->src[0].ssa->num_components)
return false;
if (nir_intrinsic_align_mul(intrin) % 4 || nir_intrinsic_align_offset(intrin) % 4)
return false;
return true;
}
void init_context(isel_context *ctx, nir_shader *shader);
void cleanup_context(isel_context *ctx);

View file

@@ -189,10 +189,7 @@ void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
res = intrin->src[0].ssa;
break;
case nir_intrinsic_store_ssbo:
if (nir_src_is_divergent(intrin->src[2]) ||
ctx->program->chip_class < GFX8 || ctx->program->chip_class >= GFX10_3 ||
(intrin->src[0].ssa->bit_size < 32 && !can_subdword_ssbo_store_use_smem(intrin)))
flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
res = intrin->src[1].ssa;
break;
case nir_intrinsic_load_global:

View file

@@ -1640,7 +1640,6 @@ public:
Stage stage;
bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
bool needs_wqm = false; /* there exists a p_wqm instruction */
bool wb_smem_l1_on_end = false;
std::vector<uint8_t> constant_data;
Temp private_segment_buffer;

View file

@@ -1875,8 +1875,6 @@ void lower_to_hw_instr(Program* program)
bld.reset(discard_block);
bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
0, V_008DFC_SQ_EXP_NULL, false, true, true);
if (program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb);
bld.sopp(aco_opcode::s_endpgm);
bld.reset(&ctx.instructions);

View file

@@ -285,8 +285,6 @@ opcode("p_demote_to_helper")
opcode("p_is_helper")
opcode("p_exit_early_if")
opcode("p_fs_buffer_store_smem", format=Format.SMEM)
# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
opcode("p_bpermute")

View file

@@ -136,10 +136,7 @@ void finish_program(Program *program)
for (Block& block : program->blocks) {
if (block.linear_succs.size() == 0) {
block.kind |= block_kind_uniform;
Builder bld(program, &block);
if (program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb, false);
bld.sopp(aco_opcode::s_endpgm);
Builder(program, &block).sopp(aco_opcode::s_endpgm);
}
}
}

View file

@@ -67,10 +67,8 @@ BEGIN_TEST(isel.compute.simple)
uint res;
};
void main() {
//~gfx7>> v1: %data = p_parallelcopy 42
//~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
//~gfx8>> s1: %data = p_parallelcopy 42
//~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation
//>> v1: %data = p_parallelcopy 42
//buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
res = 42;
}
);