From 29a5e3e6154fc1b518155e5171897aa7aedc890b Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Tue, 23 Jan 2024 08:13:22 +0100 Subject: [PATCH] broadcom/compiler: support subgroup ballot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support in our compiler for the subgroup ballot feature. To this end we start using the NIR lowering for subgroups which can lower some of these intrinsics into things more amenable to our hardware and takes care of scalarization. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 53 ++++++++++++++++++++++++++++++ src/broadcom/compiler/vir.c | 28 ++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index b21ce0461f1..6b449a2a289 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -3734,6 +3734,59 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; } + case nir_intrinsic_ballot: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg res = vir_get_temp(c); + if (vir_in_nonuniform_control_flow(c)) { + /* Ballot uses the MSF mask and the condition mask to + * identify active lanes. Particularly, it uses the + * condition mask to filter out lanes disabled by + * control flow. 
+ */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + vir_set_cond(vir_BALLOT_dest(c, res, value), + V3D_QPU_COND_IFA); + } else { + vir_BALLOT_dest(c, res, value); + } + + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg index = ntq_get_src(c, instr->src[1], 0); + struct qreg res = vir_SHUFFLE(c, value, index); + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + + case nir_intrinsic_read_first_invocation: { + assert(c->devinfo->ver >= 71); + struct qreg value = ntq_get_src(c, instr->src[0], 0); + struct qreg res = vir_get_temp(c); + if (vir_in_nonuniform_control_flow(c)) { + /* Bcastf uses the MSF mask and the condition mask to + * identify active lanes. Particularly, it uses the + * condition mask to filter out lanes disabled by + * control flow. + */ + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + vir_set_cond(vir_BCASTF_dest(c, res, value), + V3D_QPU_COND_IFA); + } else { + vir_BCASTF_dest(c, res, value); + } + + ntq_store_def(c, &instr->def, 0, vir_MOV(c, res)); + break; + } + case nir_intrinsic_load_num_subgroups: unreachable("Should have been lowered"); break; diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 09190db9b9b..f5794133b6d 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -1568,6 +1568,24 @@ lower_subgroup_intrinsics(struct v3d_compile *c, case nir_intrinsic_load_subgroup_size: case nir_intrinsic_load_subgroup_invocation: case nir_intrinsic_elect: + case nir_intrinsic_ballot: + case nir_intrinsic_inverse_ballot: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_ballot_bit_count_exclusive: + case 
nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: c->has_subgroups = true; break; default: @@ -1681,6 +1699,16 @@ v3d_attempt_compile(struct v3d_compile *c) NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c); + const nir_lower_subgroups_options subgroup_opts = { + .subgroup_size = V3D_CHANNELS, + .ballot_components = 1, + .ballot_bit_size = 32, + .lower_to_scalar = true, + .lower_inverse_ballot = true, + .lower_subgroup_masks = true, + }; + NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts); + v3d_optimize_nir(c, c->s); /* Do late algebraic optimization to turn add(a, neg(b)) back into