aco: force uniform result for LDS load with uniform address if it can be non-uniform

Because an LDS load is 2 separate loads on gfx10+ with wave64, a different wave
can write LDS in between and cause a non-uniform result. Use v_readfirstlane
instead of p_as_uniform because it cannot be copy-propagated.

Fixes an OpenCL CTS test with zink+rusticl.

Totals from 136 (0.17% of 78196) affected shaders:
MaxWaves: 3236 -> 3244 (+0.25%)
Instrs: 130069 -> 131221 (+0.89%)
CodeSize: 698048 -> 703436 (+0.77%)
VGPRs: 5464 -> 5440 (-0.44%)
SpillSGPRs: 94 -> 96 (+2.13%)
Latency: 5361017 -> 5363781 (+0.05%); split: -0.00%, +0.05%
InvThroughput: 883010 -> 884100 (+0.12%)
SClause: 3822 -> 3821 (-0.03%); split: -0.05%, +0.03%
Copies: 14220 -> 14314 (+0.66%); split: -0.01%, +0.68%
Branches: 4549 -> 4551 (+0.04%)
PreSGPRs: 4934 -> 4940 (+0.12%)
PreVGPRs: 4666 -> 4655 (-0.24%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25973>
This commit is contained in:
Georg Lehmann 2023-10-31 15:42:31 +01:00 committed by Marge Bot
parent a4597777fe
commit 04956d54ce

View file

@ -3941,6 +3941,44 @@ visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
} }
} }
/* Copies src into the uniform (SGPR) temp dst using v_readfirstlane_b32 and
 * returns dst. Unlike p_as_uniform, the readfirstlane result cannot be
 * copy-propagated back to the VGPR source (see commit message above), which
 * is required when the VGPR value itself may be non-uniform even though the
 * logical value should be treated as uniform (gfx10+ wave64 LDS loads).
 *
 * Preconditions (from the call sites in emit_load): dst is an SGPR temp of
 * the same size as src.
 */
Temp
emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
{
Builder bld(ctx->program, ctx->block);
if (src.regClass().type() == RegType::sgpr) {
/* Already uniform: a plain copy is enough. */
bld.copy(Definition(dst), src);
} else if (src.size() == 1) {
/* Single dword: one readfirstlane writes dst directly. */
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
} else {
/* Multi-dword VGPR: split into per-dword pieces (the last piece may be
 * smaller than 4 bytes), readfirstlane each piece, then rebuild dst. */
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
split->operands[0] = Operand(src);
for (unsigned i = 0; i < src.size(); i++) {
/* MIN2 clamps the final piece to the remaining byte count. */
split->definitions[i] =
bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
}
/* Keep a raw pointer: ownership moves into the block below, but the
 * definitions are still read when filling vec's operands. */
Instruction* split_raw = split.get();
ctx->block->instructions.emplace_back(std::move(split));
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
vec->definitions[0] = Definition(dst);
for (unsigned i = 0; i < src.size(); i++) {
/* Each bld.vop1 call emits a v_readfirstlane_b32 into the block now,
 * i.e. before vec is appended — emission order matters here. */
vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
split_raw->definitions[i].getTemp());
}
ctx->block->instructions.emplace_back(std::move(vec));
/* Only register the split components when every piece is a full dword;
 * presumably emit_split_vector's cache assumes dword-aligned elements —
 * NOTE(review): confirm against emit_split_vector's definition. */
if (src.bytes() % 4 == 0)
emit_split_vector(ctx, dst, src.size());
}
return dst;
}
bool bool
can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size, can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
unsigned align_, bool support_12_byte) unsigned align_, bool support_12_byte)
@ -3974,6 +4012,7 @@ struct LoadEmitInfo {
bool glc = false; bool glc = false;
bool slc = false; bool slc = false;
bool split_by_component_stride = true; bool split_by_component_stride = true;
bool readfirstlane_for_uniform = false;
unsigned swizzle_component_size = 0; unsigned swizzle_component_size = 0;
memory_sync_info sync; memory_sync_info sync;
Temp soffset = Temp(0, s1); Temp soffset = Temp(0, s1);
@ -4220,8 +4259,14 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
/* try to p_as_uniform early so we can create more optimizable code and /* try to p_as_uniform early so we can create more optimizable code and
* also update allocated_vec */ * also update allocated_vec */
for (unsigned j = start; j < components_split; j++) { for (unsigned j = start; j < components_split; j++) {
if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
allocated_vec[j] = bld.as_uniform(allocated_vec[j]); if (info.readfirstlane_for_uniform) {
allocated_vec[j] = emit_readfirstlane(
ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
} else {
allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
}
}
has_vgprs |= allocated_vec[j].type() == RegType::vgpr; has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
} }
} }
@ -4243,7 +4288,10 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
Temp tmp = bld.tmp(RegType::vgpr, info.dst.size()); Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
vec->definitions[0] = Definition(tmp); vec->definitions[0] = Definition(tmp);
bld.insert(std::move(vec)); bld.insert(std::move(vec));
bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp); if (info.readfirstlane_for_uniform)
emit_readfirstlane(ctx, tmp, info.dst);
else
bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
} else { } else {
vec->definitions[0] = Definition(info.dst); vec->definitions[0] = Definition(info.dst);
bld.insert(std::move(vec)); bld.insert(std::move(vec));
@ -4771,6 +4819,13 @@ load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, T
info.align_offset = 0; info.align_offset = 0;
info.sync = memory_sync_info(storage_shared); info.sync = memory_sync_info(storage_shared);
info.const_offset = base_offset; info.const_offset = base_offset;
/* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
* if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
* to avoid copy-propagation.
*/
info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
ctx->program->wave_size == 64 &&
ctx->program->workgroup_size > 64;
emit_load(ctx, bld, info, lds_load_params); emit_load(ctx, bld, info, lds_load_params);
return dst; return dst;
@ -8496,22 +8551,13 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_read_first_invocation: { case nir_intrinsic_read_first_invocation: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->def); Temp dst = get_ssa_temp(ctx, &instr->def);
if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { if (instr->def.bit_size == 1) {
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
} else if (src.regClass() == v2) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
lo = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo);
hi = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else if (instr->def.bit_size == 1) {
assert(src.regClass() == bld.lm); assert(src.regClass() == bld.lm);
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
bool_to_vector_condition(ctx, tmp, dst); bool_to_vector_condition(ctx, tmp, dst);
} else { } else {
bld.copy(Definition(dst), src); emit_readfirstlane(ctx, src, dst);
} }
set_wqm(ctx); set_wqm(ctx);
break; break;