nak: Loop to ensure we get accurate shader clocks

Even though CS2R can fetch a whole 64 bits at a time, that doesn't mean it does so atomically. Instead, we need to loop, alternating high and low until we fetch the same high value twice. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27303>
2026-02-19 01:40:43 +01:00 · 2024-01-26 09:13:28 -06:00 · 2024-01-26 09:13:28 -06:00 · 6260fa47ff
commit 6260fa47ff
parent 48ebfeba34
2 changed files with 57 additions and 6 deletions
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@ -509,10 +509,60 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
      break;
   }

-   case nir_intrinsic_shader_clock:
-      val = nir_load_sysval_nv(b, 64, .base = NAK_SV_CLOCK);
-      val = nir_unpack_64_2x32(b, val);
+   case nir_intrinsic_shader_clock: {
+      /* The CS2R opcode can load 64 bits worth of sysval data at a time but
+       * it's not actually atomic.  In order to get correct shader clocks, we
+       * need to do a loop where we do
+       *
+       *    CS2R SV_CLOCK_HI
+       *    CS2R SV_CLOCK_LO
+       *    CS2R SV_CLOCK_HI
+       *    CS2R SV_CLOCK_LO
+       *    CS2R SV_CLOCK_HI
+       *    ...
+       *
+       * The moment two high values are the same, we take the low value
+       * between them and that gives us our clock.
+       *
+       * In order to make sure we don't run into any weird races, we also need
+       * to insert a barrier after every load to ensure the one load completes
+       * before we kick off the next load.  That way, even if one load happens
+       * to be faster than the other (they are variable latency, after all),
+       * we're still guaranteed that the loads happen in the order we want.
+       */
+      nir_variable *clock =
+         nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
+
+      nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
+      nir_ssa_bar_nv(b, clock_hi);
+
+      nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
+
+      nir_push_loop(b);
+      {
+         nir_def *last_clock = nir_load_var(b, clock);
+
+         nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
+         nir_ssa_bar_nv(b, clock_lo);
+
+         clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
+         nir_ssa_bar_nv(b, clock_hi);
+
+         nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
+
+         nir_push_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
+         {
+            nir_jump(b, nir_jump_break);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_pop_loop(b, NULL);
+
+      val = nir_load_var(b, clock);
+      if (intrin->def.bit_size == 64)
+         val = nir_pack_64_2x32(b, val);
      break;
+   }

   case nir_intrinsic_load_warps_per_sm_nv:
      val = nir_imm_int(b, nak->warps_per_sm);
@ -549,8 +599,7 @@ static bool
 nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
 {
   return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
-                                     nir_metadata_block_index |
-                                     nir_metadata_dominance,
+                                     nir_metadata_none,
                                     (void *)nak);
 }

--- a/src/nouveau/compiler/nak_private.h
+++ b/src/nouveau/compiler/nak_private.h
@ -108,7 +108,9 @@ enum ENUM_PACKED nak_sv {
   NAK_SV_LANEMASK_LE      = 0x3a,
   NAK_SV_LANEMASK_GT      = 0x3b,
   NAK_SV_LANEMASK_GE      = 0x3c,
-   NAK_SV_CLOCK            = 0x50,
+   NAK_SV_CLOCK_LO         = 0x50,
+   NAK_SV_CLOCK_HI         = 0x51,
+   NAK_SV_CLOCK            = NAK_SV_CLOCK_LO,
 };

 bool nak_nir_workgroup_has_one_subgroup(const nir_shader *nir);