nak: Loop to ensure we get accurate shader clocks

Even though CS2R can fetch a whole 64 bits at a time, that doesn't mean
it does so atomically.  Instead, we need to loop, alternating high and
low reads until two consecutive high reads return the same value.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27303>
This commit is contained in:
Faith Ekstrand 2024-01-26 09:13:28 -06:00 committed by Marge Bot
parent 48ebfeba34
commit 6260fa47ff
2 changed files with 57 additions and 6 deletions

View file

@ -509,10 +509,60 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
break;
}
case nir_intrinsic_shader_clock:
val = nir_load_sysval_nv(b, 64, .base = NAK_SV_CLOCK);
val = nir_unpack_64_2x32(b, val);
case nir_intrinsic_shader_clock: {
/* The CS2R opcode can load 64 bits worth of sysval data at a time but
* it's not actually atomic. In order to get correct shader clocks, we
* need to do a loop where we do
*
* CS2R SV_CLOCK_HI
* CS2R SV_CLOCK_LO
* CS2R SV_CLOCK_HI
* CS2R SV_CLOCK_LO
* CS2R SV_CLOCK_HI
* ...
*
* The moment two high values are the same, we take the low value
* between them and that gives us our clock.
*
* In order to make sure we don't run into any weird races, we also need
* to insert a barrier after every load to ensure the one load completes
* before we kick off the next load. That way, even if one load happens to
* be faster than the other (they are variable latency, after all), we're
* still guaranteed that the loads happen in the order we want.
*/
nir_variable *clock =
nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
nir_ssa_bar_nv(b, clock_hi);
nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
nir_push_loop(b);
{
nir_def *last_clock = nir_load_var(b, clock);
nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
nir_ssa_bar_nv(b, clock_lo);
clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK + 1);
nir_ssa_bar_nv(b, clock_hi);
nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
nir_push_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
{
nir_jump(b, nir_jump_break);
}
nir_pop_if(b, NULL);
}
nir_pop_loop(b, NULL);
val = nir_load_var(b, clock);
if (intrin->def.bit_size == 64)
val = nir_pack_64_2x32(b, val);
break;
}
case nir_intrinsic_load_warps_per_sm_nv:
val = nir_imm_int(b, nak->warps_per_sm);
@ -549,8 +599,7 @@ static bool
nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
{
return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
nir_metadata_block_index |
nir_metadata_dominance,
nir_metadata_none,
(void *)nak);
}

View file

@ -108,7 +108,9 @@ enum ENUM_PACKED nak_sv {
NAK_SV_LANEMASK_LE = 0x3a,
NAK_SV_LANEMASK_GT = 0x3b,
NAK_SV_LANEMASK_GE = 0x3c,
NAK_SV_CLOCK = 0x50,
NAK_SV_CLOCK_LO = 0x50,
NAK_SV_CLOCK_HI = 0x51,
NAK_SV_CLOCK = NAK_SV_CLOCK_LO,
};
bool nak_nir_workgroup_has_one_subgroup(const nir_shader *nir);