broadcom/compiler: fix lane selection for subgroups in fragment shaders

It seems the hardware behavior for this is as per spec, and we are
supposed to treat entire quads as active. In particular, there
are some derivative tests with dynamic control flow that use
subgroup ballot and require this.

However, we still need to exclude terminated lanes (OpTerminate). For
that, we keep track of the sample mask at the start of a fragment
shader and compare it with the current sample mask.

Fixes: ('broadcom/compiler: support subgroup reduction operations from fragment shaders')
Fixes: dEQP-VK.glsl.derivate.dynamic_loop.*
Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27409>
This commit is contained in:
Iago Toral Quiroga 2024-02-01 10:30:46 +01:00
parent 2143da6d5a
commit cc7934a89b
3 changed files with 24 additions and 71 deletions

View file

@ -15,59 +15,3 @@ dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_usage,Crash
dEQP-VK.wsi.wayland.swapchain.simulate_oom.min_image_count,Crash
dEQP-VK.wsi.wayland.swapchain.simulate_oom.pre_transform,Crash
dEQP-VK.wsi.wayland.swapchain.simulate_oom.present_mode,Crash
# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27409
dEQP-VK.glsl.derivate.dfdx.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.dfdx.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.dfdy.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.dfdy.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.fwidth.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.fwidth.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec4_mediump,Fail
dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.float_highp,Fail
dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.float_mediump,Fail
dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec2_highp,Fail
dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec2_mediump,Fail
dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec4_highp,Fail
dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec4_mediump,Fail

View file

@ -3282,25 +3282,28 @@ emit_load_local_invocation_index(struct v3d_compile *c)
/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
* fragment shaders a lane is considered active if any sample flags are set
* for *any* lane in the same quad. This is not what we want. To fix this we
* can emit MSF to get the active lanes and produce a condition that we
* can then use with these operations to limit execution only to lanes that
* are really active in each quad. Further, we also need to disable lanes
* that may be disabled because of non-uniform control flow.
* for *any* lane in the same quad, however, we still need to ensure that
* terminated lanes (OpTerminate) are not included. Further, we also need to
* disable lanes that may be disabled because of non-uniform control
* flow.
*/
static enum v3d_qpu_cond
setup_subgroup_reduction_condition(struct v3d_compile *c)
setup_subgroup_control_flow_condition(struct v3d_compile *c)
{
assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
c->s->info.stage == MESA_SHADER_COMPUTE);
enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
/* Produce condition for 'lane is active' from current sample flags.
* Only required for fragment shaders.
/* We need to make sure that terminated lanes in fragment shaders are
* not included. We can identify these lanes by comparing the initial
* sample mask with the current. This fixes:
* dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
*/
if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), vir_MSF(c)),
if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
vir_NOT(c, vir_XOR(c, c->start_msf,
vir_MSF(c)))),
V3D_QPU_PF_PUSHZ);
cond = V3D_QPU_COND_IFNA;
}
@ -3883,7 +3886,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_ballot: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
struct qreg res = vir_get_temp(c);
vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
@ -3902,7 +3905,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_read_first_invocation: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
struct qreg res = vir_get_temp(c);
vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
@ -3922,7 +3925,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_vote_ieq: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
struct qreg res = vir_get_temp(c);
vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
vir_ALLEQ_dest(c, res, value) :
@ -3940,7 +3943,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_vote_all: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
struct qreg res = vir_get_temp(c);
vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
@ -3964,7 +3967,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_vote_any: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
struct qreg res = vir_get_temp(c);
vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
@ -4544,6 +4547,7 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
c->start_msf = vir_MSF(c);
if (c->devinfo->ver < 71)
c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
else

View file

@ -793,6 +793,11 @@ struct v3d_compile {
struct qreg cs_shared_offset;
int local_invocation_index_bits;
/* Starting value of the sample mask in a fragment shader. We use
* this to identify lanes that have been terminated/discarded.
*/
struct qreg start_msf;
/* If the shader uses subgroup functionality */
bool has_subgroups;