diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 9513d93de8f..75055b7982d 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -509,10 +509,60 @@ nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
       break;
    }
 
-   case nir_intrinsic_shader_clock:
-      val = nir_load_sysval_nv(b, 64, .base = NAK_SV_CLOCK);
-      val = nir_unpack_64_2x32(b, val);
+   case nir_intrinsic_shader_clock: {
+      /* The CS2R opcode can load 64 bits worth of sysval data at a time but
+       * it's not actually atomic.  In order to get correct shader clocks, we
+       * need to do a loop where we do
+       *
+       *    CS2R SV_CLOCK_HI
+       *    CS2R SV_CLOCK_LO
+       *    CS2R SV_CLOCK_HI
+       *    CS2R SV_CLOCK_LO
+       *    CS2R SV_CLOCK_HI
+       *    ...
+       *
+       * The moment two high values are the same, we take the low value
+       * between them and that gives us our clock.
+       *
+       * In order to make sure we don't run into any weird races, we also need
+       * to insert a barrier after every load to ensure the one load completes
+       * before we kick off the next load.  This way, even if one load happens
+       * to be faster than the other (they are variable latency, after all),
+       * we're still guaranteed that the loads happen in the order we want.
+       */
+      nir_variable *clock =
+         nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
+
+      nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
+      nir_ssa_bar_nv(b, clock_hi);
+
+      nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
+
+      nir_push_loop(b);
+      {
+         nir_def *last_clock = nir_load_var(b, clock);
+
+         nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
+         nir_ssa_bar_nv(b, clock_lo);
+
+         clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
+         nir_ssa_bar_nv(b, clock_hi);
+
+         nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
+
+         nir_push_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
+         {
+            nir_jump(b, nir_jump_break);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_pop_loop(b, NULL);
+
+      val = nir_load_var(b, clock);
+      if (intrin->def.bit_size == 64)
+         val = nir_pack_64_2x32(b, val);
       break;
+   }
 
    case nir_intrinsic_load_warps_per_sm_nv:
       val = nir_imm_int(b, nak->warps_per_sm);
@@ -549,8 +599,7 @@ static bool
 nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
 {
    return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
-                                     nir_metadata_block_index |
-                                     nir_metadata_dominance,
+                                     nir_metadata_none,
                                      (void *)nak);
 }
 
diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h
index c01b03ec6c1..3c487b89223 100644
--- a/src/nouveau/compiler/nak_private.h
+++ b/src/nouveau/compiler/nak_private.h
@@ -108,7 +108,9 @@ enum ENUM_PACKED nak_sv {
    NAK_SV_LANEMASK_LE = 0x3a,
    NAK_SV_LANEMASK_GT = 0x3b,
    NAK_SV_LANEMASK_GE = 0x3c,
-   NAK_SV_CLOCK = 0x50,
+   NAK_SV_CLOCK_LO = 0x50,
+   NAK_SV_CLOCK_HI = 0x51,
+   NAK_SV_CLOCK = NAK_SV_CLOCK_LO,
 };
 
 bool nak_nir_workgroup_has_one_subgroup(const nir_shader *nir);