mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-19 01:40:43 +01:00
nak: Loop to ensure we get accurate shader clocks
Even though CS2R can fetch a whole 64 bits at a time, that doesn't mean it does so atomically. Instead, we need to loop, alternately reading the high and low halves until we read the same high value twice in a row. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27303>
This commit is contained in:
parent
48ebfeba34
commit
6260fa47ff
2 changed files with 57 additions and 6 deletions
|
|
@ -509,10 +509,60 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_shader_clock:
|
||||
val = nir_load_sysval_nv(b, 64, .base = NAK_SV_CLOCK);
|
||||
val = nir_unpack_64_2x32(b, val);
|
||||
case nir_intrinsic_shader_clock: {
|
||||
/* The CS2R opcode can load 64 bits worth of sysval data at a time but
|
||||
* it's not actually atomic. In order to get correct shader clocks, we
|
||||
* need to do a loop where we do
|
||||
*
|
||||
* CS2R SV_CLOCK_HI
|
||||
* CS2R SV_CLOCK_LO
|
||||
* CS2R SV_CLOCK_HI
|
||||
* CS2R SV_CLOCK_LO
|
||||
* CS2R SV_CLOCK_HI
|
||||
* ...
|
||||
*
|
||||
* The moment two consecutive high values are the same, we take the low value
|
||||
* between them and that gives us our clock.
|
||||
*
|
||||
* In order to make sure we don't run into any weird races, we also need
|
||||
* to insert a barrier after every load to ensure the one load completes
|
||||
* before we kick off the next load. That way, even if one load happens to
|
||||
* be faster than the other (they are variable latency, after all) we're
|
||||
* still guaranteed that the loads happen in the order we want.
|
||||
*/
|
||||
nir_variable *clock =
|
||||
nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
|
||||
|
||||
nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
|
||||
nir_ssa_bar_nv(b, clock_hi);
|
||||
|
||||
nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
|
||||
|
||||
nir_push_loop(b);
|
||||
{
|
||||
nir_def *last_clock = nir_load_var(b, clock);
|
||||
|
||||
nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
|
||||
nir_ssa_bar_nv(b, clock_lo);
|
||||
|
||||
clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK + 1);
|
||||
nir_ssa_bar_nv(b, clock_hi);
|
||||
|
||||
nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
|
||||
|
||||
nir_push_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
|
||||
{
|
||||
nir_jump(b, nir_jump_break);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_loop(b, NULL);
|
||||
|
||||
val = nir_load_var(b, clock);
|
||||
if (intrin->def.bit_size == 64)
|
||||
val = nir_pack_64_2x32(b, val);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_warps_per_sm_nv:
|
||||
val = nir_imm_int(b, nak->warps_per_sm);
|
||||
|
|
@ -549,8 +599,7 @@ static bool
|
|||
nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
|
||||
{
|
||||
return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
|
||||
nir_metadata_block_index |
|
||||
nir_metadata_dominance,
|
||||
nir_metadata_none,
|
||||
(void *)nak);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -108,7 +108,9 @@ enum ENUM_PACKED nak_sv {
|
|||
NAK_SV_LANEMASK_LE = 0x3a,
|
||||
NAK_SV_LANEMASK_GT = 0x3b,
|
||||
NAK_SV_LANEMASK_GE = 0x3c,
|
||||
NAK_SV_CLOCK = 0x50,
|
||||
NAK_SV_CLOCK_LO = 0x50,
|
||||
NAK_SV_CLOCK_HI = 0x51,
|
||||
NAK_SV_CLOCK = NAK_SV_CLOCK_LO,
|
||||
};
|
||||
|
||||
bool nak_nir_workgroup_has_one_subgroup(const nir_shader *nir);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue