mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 16:00:08 +01:00
nir/divergence: add a new mode to cover fused threads on Intel HW
The Intel Gfx12.x generation of GPU has an architecture feature called EU fusion in which 2 subgroups run lock step. A typical case where this happens is a compute shader with 1x1x1 local workgroup size and a dispatch command of 2x1x1. In that case 2 threads will be run in lock step for each of the workgroup. This has been the sources of some troubles in the backend because one subgroup can run with all lanes disabled, requiring care for SEND messages using the NoMask flag (execution regardless of the lane mask). We found out that other things are happening when 2 subgroups run together : - the HW will use the surface/sampler handle from only one subgroup - the HW will use the sampler header from only one subgroup So one of the fused subgroup can access the wrong surface/sampler if the value is different between the 2 subgroups and that can happen even with subgroup uniform values. Fortunately we can flag SEND instructions to disable the fusion behavior (most likely at a performance cost). This change introduce a new divergence mode that tries to compute things divergent between subgroups so that we can flag instructions accordingly. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37394>
This commit is contained in:
parent
79923115e7
commit
ca1533cd03
2 changed files with 24 additions and 5 deletions
|
|
@ -206,7 +206,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
* subgroups, so subgroup ops are always divergent between vertices of
|
* subgroups, so subgroup ops are always divergent between vertices of
|
||||||
* the same primitive.
|
* the same primitive.
|
||||||
*/
|
*/
|
||||||
is_divergent = state->options & nir_divergence_vertex;
|
is_divergent = (state->options & nir_divergence_vertex) ||
|
||||||
|
(state->options & nir_divergence_across_subgroups);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Intrinsics which are always uniform */
|
/* Intrinsics which are always uniform */
|
||||||
|
|
@ -398,6 +399,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
} else {
|
} else {
|
||||||
is_divergent = true;
|
is_divergent = true;
|
||||||
}
|
}
|
||||||
|
if (options & nir_divergence_across_subgroups)
|
||||||
|
is_divergent = true;
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_load_attribute_pan:
|
case nir_intrinsic_load_attribute_pan:
|
||||||
assert(stage == MESA_SHADER_VERTEX);
|
assert(stage == MESA_SHADER_VERTEX);
|
||||||
|
|
@ -414,6 +417,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
|
is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
|
||||||
else
|
else
|
||||||
is_divergent = true;
|
is_divergent = true;
|
||||||
|
if (options & nir_divergence_across_subgroups)
|
||||||
|
is_divergent = true;
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_load_input_vertex:
|
case nir_intrinsic_load_input_vertex:
|
||||||
is_divergent = src_divergent(instr->src[1], state);
|
is_divergent = src_divergent(instr->src[1], state);
|
||||||
|
|
@ -530,7 +535,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
* vertices of the same primitive because they may be in
|
* vertices of the same primitive because they may be in
|
||||||
* different subgroups.
|
* different subgroups.
|
||||||
*/
|
*/
|
||||||
is_divergent = state->options & nir_divergence_vertex;
|
is_divergent = (state->options & nir_divergence_vertex) ||
|
||||||
|
(state->options & nir_divergence_across_subgroups);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
FALLTHROUGH;
|
FALLTHROUGH;
|
||||||
|
|
@ -538,7 +544,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
case nir_intrinsic_inclusive_scan_clusters_ir3: {
|
case nir_intrinsic_inclusive_scan_clusters_ir3: {
|
||||||
nir_op op = nir_intrinsic_reduction_op(instr);
|
nir_op op = nir_intrinsic_reduction_op(instr);
|
||||||
is_divergent = src_divergent(instr->src[0], state) ||
|
is_divergent = src_divergent(instr->src[0], state) ||
|
||||||
state->options & nir_divergence_vertex;
|
(state->options & nir_divergence_vertex) ||
|
||||||
|
(state->options & nir_divergence_across_subgroups);
|
||||||
if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
|
if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
|
||||||
op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
|
op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
|
||||||
op != nir_op_iand && op != nir_op_ior)
|
op != nir_op_iand && op != nir_op_ior)
|
||||||
|
|
@ -550,7 +557,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
/* This reduces the last invocations in all 8-wide clusters. It should
|
/* This reduces the last invocations in all 8-wide clusters. It should
|
||||||
* behave the same as reduce with cluster_size == subgroup_size.
|
* behave the same as reduce with cluster_size == subgroup_size.
|
||||||
*/
|
*/
|
||||||
is_divergent = state->options & nir_divergence_vertex;
|
is_divergent = (state->options & nir_divergence_vertex) ||
|
||||||
|
(state->options & nir_divergence_across_subgroups);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case nir_intrinsic_load_ubo:
|
case nir_intrinsic_load_ubo:
|
||||||
|
|
@ -749,9 +757,13 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
||||||
/* Not having the non_uniform flag with divergent sources is undefined
|
/* Not having the non_uniform flag with divergent sources is undefined
|
||||||
* behavior. The Intel driver defines it to pick the lowest numbered live
|
* behavior. The Intel driver defines it to pick the lowest numbered live
|
||||||
* SIMD lane (via emit_uniformize).
|
* SIMD lane (via emit_uniformize).
|
||||||
|
*
|
||||||
|
* When gathering the divergence across subgroups, we need to propagate the
|
||||||
|
* divergence from the sources.
|
||||||
*/
|
*/
|
||||||
if ((nir_intrinsic_resource_access_intel(instr) &
|
if ((nir_intrinsic_resource_access_intel(instr) &
|
||||||
nir_resource_intel_non_uniform) != 0) {
|
nir_resource_intel_non_uniform) != 0 ||
|
||||||
|
(state->options & nir_divergence_across_subgroups)) {
|
||||||
unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
|
unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
|
||||||
for (unsigned i = 0; i < num_srcs; i++) {
|
for (unsigned i = 0; i < num_srcs; i++) {
|
||||||
if (src_divergent(instr->src[i], state)) {
|
if (src_divergent(instr->src[i], state)) {
|
||||||
|
|
|
||||||
|
|
@ -77,12 +77,19 @@ typedef enum {
|
||||||
nir_divergence_uniform_load_tears = (1 << 7),
|
nir_divergence_uniform_load_tears = (1 << 7),
|
||||||
/* If used, this allows phis for divergent merges with undef and a uniform source to be considered uniform */
|
/* If used, this allows phis for divergent merges with undef and a uniform source to be considered uniform */
|
||||||
nir_divergence_ignore_undef_if_phi_srcs = (1 << 8),
|
nir_divergence_ignore_undef_if_phi_srcs = (1 << 8),
|
||||||
|
|
||||||
/* Whether to compute vertex divergence (meaning between vertices
|
/* Whether to compute vertex divergence (meaning between vertices
|
||||||
* of the same primitive) instead of subgroup invocation divergence
|
* of the same primitive) instead of subgroup invocation divergence
|
||||||
* (between invocations of the same subgroup). For example, patch input
|
* (between invocations of the same subgroup). For example, patch input
|
||||||
* loads are always convergent, while subgroup intrinsics are divergent.
|
* loads are always convergent, while subgroup intrinsics are divergent.
|
||||||
*/
|
*/
|
||||||
nir_divergence_vertex = (1 << 11),
|
nir_divergence_vertex = (1 << 11),
|
||||||
|
|
||||||
|
/* Whether to compute divergence of subgroup operations as if multiple
|
||||||
|
* subgroups ran in lock-step (for example, subgroup operations that are
|
||||||
|
* normally convergent become divergent).
|
||||||
|
*/
|
||||||
|
nir_divergence_across_subgroups = (1 << 12),
|
||||||
} nir_divergence_options;
|
} nir_divergence_options;
|
||||||
|
|
||||||
/** An instruction filtering callback
|
/** An instruction filtering callback
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue