nir: make inverse_ballot 1bit only

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37178>
This commit is contained in:
Georg Lehmann 2025-09-03 13:34:07 +02:00 committed by Marge Bot
parent 6c0017be38
commit ef8c364d3d
7 changed files with 21 additions and 21 deletions

View file

@@ -824,12 +824,12 @@ ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool,
nir_def *dont_care = nir_undef(b, 1, num_lds_dwords * 32);
nir_def *packed_counts = NULL;
nir_if *if_use_lds = nir_push_if(b, nir_inverse_ballot(b, 1, nir_imm_intN_t(b, ballot, wave_size)));
nir_if *if_use_lds = nir_push_if(b, nir_inverse_ballot(b, nir_imm_intN_t(b, ballot, wave_size)));
{
nir_def *store_val = surviving_invocations_in_current_wave[0];
if (num_repacks == 2) {
nir_def *lane_id_0 = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 1, wave_size));
nir_def *lane_id_0 = nir_inverse_ballot(b, nir_imm_intN_t(b, 1, wave_size));
nir_def *off = nir_bcsel(b, lane_id_0, nir_imm_int(b, 0), nir_imm_int(b, num_lds_dwords * 4));
lds_addr_base = nir_iadd_nuw(b, lds_addr_base, off);
store_val = nir_bcsel(b, lane_id_0, store_val, surviving_invocations_in_current_wave[1]);

View file

@@ -943,7 +943,7 @@ hs_tess_level_group_vote(nir_builder *b, lower_tess_io_state *st,
nir_if *thread0 = nir_push_if(&top_b,
nir_iand(&top_b, nir_ieq_imm(&top_b, nir_load_subgroup_id(&top_b), 0),
nir_inverse_ballot(&top_b, 1, nir_imm_intN_t(&top_b, 0x1, st->wave_size))));
nir_inverse_ballot(&top_b, nir_imm_intN_t(&top_b, 0x1, st->wave_size))));
{
/* 0x3 is the initial bitmask (tf0 | tf1). Each subgroup will do atomic iand on it for the vote. */
nir_store_shared(&top_b, nir_imm_int(&top_b, 0x3), nir_imm_int(&top_b, 0),
@@ -1070,7 +1070,7 @@ hs_tess_level_group_vote(nir_builder *b, lower_tess_io_state *st,
const unsigned tcs_vertices_out = b->shader->info.tess.tcs_vertices_out;
assert(tcs_vertices_out <= 32);
nir_def *is_first_active_lane =
nir_inverse_ballot(b, 1, nir_imm_intN_t(b, BITFIELD_MASK(tcs_vertices_out), st->wave_size));
nir_inverse_ballot(b, nir_imm_intN_t(b, BITFIELD_MASK(tcs_vertices_out), st->wave_size));
/* Only the first active invocation in each subgroup performs the AND reduction through LDS. */
nir_if *if_first_active_lane = nir_push_if(b, is_first_active_lane);
@@ -1094,7 +1094,7 @@ hs_tess_level_group_vote(nir_builder *b, lower_tess_io_state *st,
/* Read the result from LDS. Only 1 lane should load it to prevent LDS bank conflicts. */
nir_def *lds_result;
nir_if *if_lane0 = nir_push_if(b, nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0x1, st->wave_size)));
nir_if *if_lane0 = nir_push_if(b, nir_inverse_ballot(b, nir_imm_intN_t(b, 0x1, st->wave_size)));
if_lane0->control = nir_selection_control_divergent_always_taken;
{
lds_result = nir_load_shared(b, 1, 32, nir_imm_int(b, 0), .align_mul = 4);

View file

@@ -451,7 +451,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
if (src->bit_size == 32) {
if (params->wave_size == 64) {
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
nir_def *low_lanes = nir_inverse_ballot(b, nir_imm_intN_t(b, UINT32_MAX, 64));
for (int i = 0; i < num_comps; i++) {
nir_def *comp = components[i];
nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
@@ -463,7 +463,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
memcpy(components, tmp, sizeof(components));
}
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
nir_def *low_lanes = nir_inverse_ballot(b, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
for (int i = 0; i < num_comps; i++) {
unsigned swap16 = 0x1f | (0x10 << 10);
nir_def *half_swap = nir_masked_swizzle_amd(b, components[i], .swizzle_mask = swap16, .fetch_inactive = 1);
@@ -485,7 +485,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
nir_def *high_sel = nir_imm_int(b, src->bit_size == 8 ? 0x01050004 : 0x01000504);
if (params->wave_size == 64) {
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
nir_def *low_lanes = nir_inverse_ballot(b, nir_imm_intN_t(b, UINT32_MAX, 64));
nir_def *first_perm = nir_bcsel(b, low_lanes, low_sel, high_sel);
nir_def *second_perm = nir_ior_imm(b, first_perm, 0x02020202);
for (int i = 0; i < num_comps; i++) {
@@ -499,7 +499,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
memcpy(components, tmp, sizeof(components));
}
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
nir_def *low_lanes = nir_inverse_ballot(b, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
nir_def *first_perm = nir_bcsel(b, low_lanes, low_sel, high_sel);
nir_def *second_perm = nir_ior_imm(b, first_perm, 0x02020202);
for (int i = 0; i < num_comps; i++) {
@@ -526,7 +526,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
if (src->bit_size == 32) {
for (unsigned keep32 = 0; keep32 < ((params->wave_size == 64) ? 2 : 1); keep32++) {
nir_def *ballot = nir_imm_intN_t(b, keep32 ? UINT32_MAX : 0xffff0000ffffull, params->wave_size);
nir_def *keep = nir_inverse_ballot(b, 1, ballot);
nir_def *keep = nir_inverse_ballot(b, ballot);
num_comps /= 2;
for (unsigned i = 0; i < num_comps; i++) {
components[i] = nir_bcsel(b, keep, components[i * 2], components[i * 2 + 1]);
@@ -545,7 +545,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
for (unsigned keep32 = 0; keep32 < ((params->wave_size == 64) ? 2 : 1); keep32++) {
nir_def *ballot = nir_imm_intN_t(b, keep32 ? UINT32_MAX : 0xffff0000ffffull, params->wave_size);
nir_def *keep = nir_inverse_ballot(b, 1, ballot);
nir_def *keep = nir_inverse_ballot(b, ballot);
nir_def *perm = nir_bcsel(b, keep, low_sel, high_sel);
num_comps /= 2;
for (unsigned i = 0; i < num_comps; i++) {
@@ -569,8 +569,8 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
mask |= BITFIELD64_MASK(x_mask) << i;
}
nir_def *even = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, mask, params->wave_size));
nir_def *odd = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, mask << x_mask, params->wave_size));
nir_def *even = nir_inverse_ballot(b, nir_imm_intN_t(b, mask, params->wave_size));
nir_def *odd = nir_inverse_ballot(b, nir_imm_intN_t(b, mask << x_mask, params->wave_size));
for (unsigned i = 0; i < num_comps; i += 2 * x_mask) {
for (unsigned j = 0; j < x_mask; j++) {
@@ -594,7 +594,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
if (params->gfx_level >= GFX12) {
if (params->wave_size == 64) {
nir_def *cond = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xf0f0f0f00f0f0f0f, params->wave_size));
nir_def *cond = nir_inverse_ballot(b, nir_imm_intN_t(b, 0xf0f0f0f00f0f0f0f, params->wave_size));
for (unsigned i = 0; i < num_comps; i++) {
nir_def *comp = components[i];
nir_def *compx = nir_rotate(b, comp, nir_imm_int(b, 32));
@@ -603,7 +603,7 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
}
}
nir_def *cond = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xff0000ffff0000ff, params->wave_size));
nir_def *cond = nir_inverse_ballot(b, nir_imm_intN_t(b, 0xff0000ffff0000ff, params->wave_size));
for (unsigned i = 0; i < num_comps; i++) {
nir_def *comp = components[i];
nir_def *compx = nir_masked_swizzle_amd(b, comp, .swizzle_mask = 0x1f | (0x18 << 10), .fetch_inactive = 1);

View file

@@ -529,7 +529,7 @@ opt_fotid_bool(nir_builder *b, nir_alu_instr *instr, const radv_nir_opt_tid_func
}
nir_def *ballot = nir_vec(b, ballot_comp, options->hw_ballot_num_comp);
nir_def *res = nir_inverse_ballot(b, 1, ballot);
nir_def *res = nir_inverse_ballot(b, ballot);
res->parent_instr->pass_flags = 1;
nir_def_replace(&instr->def, res);

View file

@@ -546,7 +546,7 @@ intrinsic("read_getlast_ir3", src_comp=[0], dest_comp=0, bit_sizes=src0, flags=S
intrinsic("elect", dest_comp=1, flags=SUBGROUP_FLAGS)
intrinsic("first_invocation", dest_comp=1, bit_sizes=[32], flags=SUBGROUP_FLAGS)
intrinsic("last_invocation", dest_comp=1, bit_sizes=[32], flags=SUBGROUP_FLAGS)
intrinsic("inverse_ballot", src_comp=[0], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER])
intrinsic("inverse_ballot", src_comp=[0], dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE, CAN_REORDER])
barrier("begin_invocation_interlock")
barrier("end_invocation_interlock")

View file

@@ -545,7 +545,7 @@ lower_boolean_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
nir_def *mask = nir_ishl(b, nir_imm_intN_t(b, 1, ballot->bit_size), index);
return nir_ine_imm(b, nir_iand(b, ballot, mask), 0);
} else {
return nir_inverse_ballot(b, 1, ballot);
return nir_inverse_ballot(b, ballot);
}
}
@@ -689,7 +689,7 @@ lower_boolean_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
val = nir_inot(b, val);
}
return nir_inverse_ballot(b, 1, val);
return nir_inverse_ballot(b, val);
}
static nir_def *
@@ -1138,7 +1138,7 @@ lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
nir_load_subgroup_invocation(b));
} else if (intrin->src[0].ssa->num_components != options->ballot_components ||
intrin->src[0].ssa->bit_size != options->ballot_bit_size) {
return nir_inverse_ballot(b, 1, ballot_type_to_uint(b, intrin->src[0].ssa, options));
return nir_inverse_ballot(b, ballot_type_to_uint(b, intrin->src[0].ssa, options));
}
break;

View file

@@ -103,7 +103,7 @@ vtn_handle_subgroup(struct vtn_builder *b, SpvOp opcode,
}
case SpvOpGroupNonUniformInverseBallot: {
nir_def *dest = nir_inverse_ballot(&b->nb, 1, vtn_get_nir_ssa(b, w[4]));
nir_def *dest = nir_inverse_ballot(&b->nb, vtn_get_nir_ssa(b, w[4]));
vtn_push_nir_ssa(b, w[2], dest);
break;
}