gallivm: let reduce ops use llvm intrinsics

As part of coopmat, I want to make reductions faster as I need them to implement coopmat. The intrinsics can't be used directly as we have to take into account the exec_mask, but it can be done by picking a value to insert into the disabled lanes, then calling the LLVM intrinsic. Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39225>
2026-05-06 11:38:05 +02:00 · 2025-12-17 10:41:41 +10:00 · 2025-12-17 10:41:41 +10:00 · 12bceb228a
commit 12bceb228a
parent 1e59dbf66b
1 changed files with 99 additions and 119 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@ -2315,15 +2315,31 @@ static void emit_elect(struct lp_build_nir_soa_context *bld, LLVMValueRef result
   result[0] = LLVMBuildICmp(builder, LLVMIntNE, result[0], lp_build_const_int_vec(gallivm, bld->int_bld.type, 0), "");
 }

+static LLVMValueRef build_reduction_identity_val(struct gallivm_state *gallivm,
+                                                 struct lp_build_context *int_bld,
+                                                 nir_op reduction_op,
+                                                 unsigned bit_size)
+{ /* Returns the identity value of reduction_op as an integer constant of int_bld's element type. */
+   nir_const_value const_val = nir_alu_binop_identity(reduction_op, bit_size); /* identity for this op at bit_size */
+
+   return lp_build_const_int_vec(gallivm, lp_elem_type(int_bld->type), /* element type: callers broadcast or store the result as a scalar */
+                                 nir_const_value_as_uint(const_val, bit_size)); /* NOTE(review): presumably the raw bits for float ops too - confirm */
+}
+
 static void emit_reduce(struct lp_build_nir_soa_context *bld, LLVMValueRef src,
                        nir_intrinsic_instr *instr, LLVMValueRef result[4])
 {
   struct gallivm_state *gallivm = bld->base.gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   uint32_t bit_size = nir_src_bit_size(instr->src[0]);
-   /* can't use llvm reduction intrinsics because of exec_mask */
   LLVMValueRef exec_mask = group_op_mask_vec(bld);
   nir_op reduction_op = nir_intrinsic_reduction_op(instr);
+   bool is_flt = reduction_op == nir_op_fadd ||
+      reduction_op == nir_op_fmul ||
+      reduction_op == nir_op_fmin ||
+      reduction_op == nir_op_fmax;
+   bool is_unsigned = reduction_op == nir_op_umin ||
+      reduction_op == nir_op_umax;

   uint32_t cluster_size = 0;

@ -2338,137 +2354,101 @@ static void emit_reduce(struct lp_build_nir_soa_context *bld, LLVMValueRef src,
      src = LLVMBuildZExt(builder, src, bld->uint8_bld.vec_type, "");
   }

+   struct lp_build_context *int_bld = get_int_bld(bld, true, bit_size, true);
+   struct lp_build_context *vec_bld = is_flt ? get_flt_bld(bld, bit_size, true) :
+      get_int_bld(bld, is_unsigned, bit_size, true);
+
+   /*
+    * For a reduce operation with the correct cluster size, the llvm
+    * intrinsics can be used as long as the exec_mask is taken into account.
+    * Values are defaulted in disabled lanes depending on the operation.
+    */
+   if (instr->intrinsic == nir_intrinsic_reduce &&
+       cluster_size == bld->int_bld.type.length) {
+      char intrinsic[64];
+      uint32_t length = vec_bld->type.length;
+      uint32_t src_width = bit_size;
+
+      src = LLVMBuildBitCast(builder, src, int_bld->vec_type, "");
+      if (bit_size < 32)
+         exec_mask = LLVMBuildTrunc(builder, exec_mask, int_bld->vec_type, "");
+      if (bit_size > 32)
+         exec_mask = LLVMBuildSExt(builder, exec_mask, int_bld->vec_type, "");
+      LLVMValueRef masked_val = lp_build_and(int_bld, src, exec_mask);
+      const char *opname;
+
+      switch (reduction_op) {
+      case nir_op_iadd: opname = "add"; break;
+      case nir_op_iand: opname = "and"; break;
+      case nir_op_ior: opname = "or"; break;
+      case nir_op_ixor: opname = "xor"; break;
+      case nir_op_imul: opname = "mul"; break;
+      case nir_op_fadd: opname = "fadd"; break;
+      case nir_op_fmul: opname = "fmul"; break;
+      case nir_op_imin: opname = "smin"; break;
+      case nir_op_umin: opname = "umin"; break;
+      case nir_op_fmin: opname = "fmin"; break;
+      case nir_op_imax: opname = "smax"; break;
+      case nir_op_umax: opname = "umax"; break;
+      case nir_op_fmax: opname = "fmax"; break;
+      default:
+         assert(0);
+      };
+      snprintf(intrinsic, sizeof intrinsic, "llvm.vector.reduce.%s.v%u%s%u",
+               opname,
+               length, is_flt ? "f" : "i" , src_width);
+
+      LLVMValueRef init_val = build_reduction_identity_val(gallivm,
+                                                           int_bld,
+                                                           reduction_op,
+                                                           bit_size);
+      if (init_val) {
+         init_val = lp_build_broadcast_scalar(int_bld, init_val);
+         init_val = lp_build_andnot(int_bld, init_val, exec_mask);
+         masked_val = lp_build_or(int_bld, masked_val, init_val);
+      }
+      if (is_flt)
+         masked_val = LLVMBuildBitCast(builder, masked_val, vec_bld->vec_type, "");
+
+      LLVMValueRef args[2];
+      int num_args = 1;
+
+      if (reduction_op == nir_op_fadd ||
+          reduction_op == nir_op_fmul) {
+         if (reduction_op == nir_op_fmul) {
+            args[0] = lp_build_const_elem(gallivm, vec_bld->type, 1);
+         } else {
+            args[0] = lp_build_const_elem(gallivm, vec_bld->type, -0.0);
+         }
+         args[1] = masked_val;
+         num_args++;
+      } else {
+         args[0] = masked_val;
+      }
+
+      LLVMValueRef res = lp_build_intrinsic(builder, intrinsic, vec_bld->elem_type, args, num_args, 0);
+
+      result[0] = lp_build_broadcast(gallivm, vec_bld->vec_type, res);
+
+      if (instr->def.bit_size == 1)
+         result[0] = LLVMBuildICmp(builder, LLVMIntNE, result[0], int_bld->zero, "");
+      return;
+   }
+
   LLVMValueRef res_store = NULL;
   LLVMValueRef scan_store;
-   struct lp_build_context *int_bld = get_int_bld(bld, true, bit_size, true);

   res_store = lp_build_alloca(gallivm, int_bld->vec_type, "");
   scan_store = lp_build_alloca(gallivm, int_bld->elem_type, "");

   struct lp_build_context elem_bld;
-   bool is_flt = reduction_op == nir_op_fadd ||
-      reduction_op == nir_op_fmul ||
-      reduction_op == nir_op_fmin ||
-      reduction_op == nir_op_fmax;
-   bool is_unsigned = reduction_op == nir_op_umin ||
-      reduction_op == nir_op_umax;
-
-   struct lp_build_context *vec_bld = is_flt ? get_flt_bld(bld, bit_size, true) :
-      get_int_bld(bld, is_unsigned, bit_size, true);

   lp_build_context_init(&elem_bld, gallivm, lp_elem_type(vec_bld->type));

-   LLVMValueRef store_val = NULL;
+   LLVMValueRef store_val = build_reduction_identity_val(gallivm, int_bld, reduction_op, bit_size);
   /*
    * Put the identity value for the operation into the storage
    */
-   switch (reduction_op) {
-   case nir_op_fmin: {
-      LLVMValueRef flt_max = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), INFINITY) :
-         (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), INFINITY) : lp_build_const_float(gallivm, INFINITY));
-      store_val = LLVMBuildBitCast(builder, flt_max, int_bld->elem_type, "");
-      break;
-   }
-   case nir_op_fmax: {
-      LLVMValueRef flt_min = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), -INFINITY) :
-         (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), -INFINITY) : lp_build_const_float(gallivm, -INFINITY));
-      store_val = LLVMBuildBitCast(builder, flt_min, int_bld->elem_type, "");
-      break;
-   }
-   case nir_op_fmul: {
-      LLVMValueRef flt_one = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), 1.0) :
-         (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), 1.0) : lp_build_const_float(gallivm, 1.0));
-      store_val = LLVMBuildBitCast(builder, flt_one, int_bld->elem_type, "");
-      break;
-   }
-   case nir_op_umin:
-      switch (bit_size) {
-      case 8:
-         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), UINT8_MAX, 0);
-         break;
-      case 16:
-         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), UINT16_MAX, 0);
-         break;
-      case 32:
-      default:
-         store_val  = lp_build_const_int32(gallivm, UINT_MAX);
-         break;
-      case 64:
-         store_val  = lp_build_const_int64(gallivm, UINT64_MAX);
-         break;
-      }
-      break;
-   case nir_op_imin:
-      switch (bit_size) {
-      case 8:
-         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), INT8_MAX, 0);
-         break;
-      case 16:
-         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), INT16_MAX, 0);
-         break;
-      case 32:
-      default:
-         store_val  = lp_build_const_int32(gallivm, INT_MAX);
-         break;
-      case 64:
-         store_val  = lp_build_const_int64(gallivm, INT64_MAX);
-         break;
-      }
-      break;
-   case nir_op_imax:
-      switch (bit_size) {
-      case 8:
-         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), INT8_MIN, 0);
-         break;
-      case 16:
-         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), INT16_MIN, 0);
-         break;
-      case 32:
-      default:
-         store_val  = lp_build_const_int32(gallivm, INT_MIN);
-         break;
-      case 64:
-         store_val  = lp_build_const_int64(gallivm, INT64_MIN);
-         break;
-      }
-      break;
-   case nir_op_imul:
-      switch (bit_size) {
-      case 8:
-         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 1, 0);
-         break;
-      case 16:
-         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 1, 0);
-         break;
-      case 32:
-      default:
-         store_val  = lp_build_const_int32(gallivm, 1);
-         break;
-      case 64:
-         store_val  = lp_build_const_int64(gallivm, 1);
-         break;
-      }
-      break;
-   case nir_op_iand:
-      switch (bit_size) {
-      case 8:
-         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 0xff, 0);
-         break;
-      case 16:
-         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 0xffff, 0);
-         break;
-      case 32:
-      default:
-         store_val  = lp_build_const_int32(gallivm, 0xffffffff);
-         break;
-      case 64:
-         store_val  = lp_build_const_int64(gallivm, 0xffffffffffffffffLL);
-         break;
-      }
-      break;
-   default:
-      break;
-   }
   if (store_val)
      LLVMBuildStore(builder, store_val, scan_store);