broadcom/compiler: support subgroup ballot

This adds support in our compiler for the subgroup ballot
feature. To this end we start using the NIR lowering for
subgroups, which can lower some of these intrinsics into
things more amenable to our hardware and takes care of
scalarization.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27211>
This commit is contained in:
Iago Toral Quiroga 2024-01-23 08:13:22 +01:00 committed by Marge Bot
parent 295f906517
commit 29a5e3e615
2 changed files with 81 additions and 0 deletions

View file

@ -3734,6 +3734,59 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
}
/* Subgroup ballot: emits a BALLOT instruction on component 0 of the
* intrinsic's first source and stores the result in component 0 of the
* intrinsic's destination. Asserts devinfo->ver >= 71.
*/
case nir_intrinsic_ballot: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
/* BALLOT writes into a fresh temporary; it is copied to the NIR def
* at the end of the case.
*/
struct qreg res = vir_get_temp(c);
if (vir_in_nonuniform_control_flow(c)) {
/* Ballot uses the MSF mask and the condition mask to
* identify active lanes. Particularly, it uses the
* condition mask to filter out lanes disabled by
* control flow.
*/
/* Update the flags from c->execute (PUSHZ) with a MOV whose
* result is discarded (nop destination), then predicate the
* BALLOT on those flags (IFA). The flag update must come
* right before the BALLOT that consumes it.
*/
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
V3D_QPU_PF_PUSHZ);
vir_set_cond(vir_BALLOT_dest(c, res, value),
V3D_QPU_COND_IFA);
} else {
/* Uniform control flow: no execution-mask predication is
* applied to the BALLOT.
*/
vir_BALLOT_dest(c, res, value);
}
ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
break;
}
/* Read from a given invocation: emits a SHUFFLE taking component 0 of
* the first source (the value) and component 0 of the second source
* (the index), and stores the result in component 0 of the destination.
* Asserts devinfo->ver >= 71.
*/
case nir_intrinsic_read_invocation: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
struct qreg index = ntq_get_src(c, instr->src[1], 0);
/* NOTE(review): unlike the ballot and read_first_invocation cases,
* no execution-mask predication is applied here in non-uniform
* control flow -- presumably SHUFFLE does not need it; confirm.
*/
struct qreg res = vir_SHUFFLE(c, value, index);
ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
break;
}
/* Read from the first invocation: emits a BCASTF instruction on
* component 0 of the intrinsic's first source and stores the result in
* component 0 of the intrinsic's destination. Asserts
* devinfo->ver >= 71.
*/
case nir_intrinsic_read_first_invocation: {
assert(c->devinfo->ver >= 71);
struct qreg value = ntq_get_src(c, instr->src[0], 0);
/* BCASTF writes into a fresh temporary; it is copied to the NIR def
* at the end of the case.
*/
struct qreg res = vir_get_temp(c);
if (vir_in_nonuniform_control_flow(c)) {
/* Bcastf uses the MSF mask and the condition mask to
* identify active lanes. Particularly, it uses the
* condition mask to filter out lanes disabled by
* control flow.
*/
/* Update the flags from c->execute (PUSHZ) with a MOV whose
* result is discarded (nop destination), then predicate the
* BCASTF on those flags (IFA). The flag update must come
* right before the BCASTF that consumes it.
*/
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
V3D_QPU_PF_PUSHZ);
vir_set_cond(vir_BCASTF_dest(c, res, value),
V3D_QPU_COND_IFA);
} else {
/* Uniform control flow: no execution-mask predication is
* applied to the BCASTF.
*/
vir_BCASTF_dest(c, res, value);
}
ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
break;
}
case nir_intrinsic_load_num_subgroups:
unreachable("Should have been lowered");
break;

View file

@ -1568,6 +1568,24 @@ lower_subgroup_intrinsics(struct v3d_compile *c,
case nir_intrinsic_load_subgroup_size:
case nir_intrinsic_load_subgroup_invocation:
case nir_intrinsic_elect:
case nir_intrinsic_ballot:
case nir_intrinsic_inverse_ballot:
case nir_intrinsic_ballot_bitfield_extract:
case nir_intrinsic_ballot_bit_count_reduce:
case nir_intrinsic_ballot_find_lsb:
case nir_intrinsic_ballot_find_msb:
case nir_intrinsic_ballot_bit_count_exclusive:
case nir_intrinsic_ballot_bit_count_inclusive:
case nir_intrinsic_reduce:
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_exclusive_scan:
case nir_intrinsic_read_invocation:
case nir_intrinsic_read_first_invocation:
case nir_intrinsic_load_subgroup_eq_mask:
case nir_intrinsic_load_subgroup_ge_mask:
case nir_intrinsic_load_subgroup_gt_mask:
case nir_intrinsic_load_subgroup_le_mask:
case nir_intrinsic_load_subgroup_lt_mask:
c->has_subgroups = true;
break;
default:
@ -1681,6 +1699,16 @@ v3d_attempt_compile(struct v3d_compile *c)
NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
/* Options for the nir_lower_subgroups pass run right below: the
* subgroup size is V3D_CHANNELS, ballots are a single 32-bit component,
* and scalarization, inverse-ballot lowering and subgroup-mask lowering
* are all requested from the pass.
*/
const nir_lower_subgroups_options subgroup_opts = {
.subgroup_size = V3D_CHANNELS,
.ballot_components = 1,
.ballot_bit_size = 32,
.lower_to_scalar = true,
.lower_inverse_ballot = true,
.lower_subgroup_masks = true,
};
NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
v3d_optimize_nir(c, c->s);
/* Do late algebraic optimization to turn add(a, neg(b)) back into