gallivm: add PK2H/UP2H support

Add support for these opcodes, the conversion functions were already there albeit need some new packing stuff. Just like the tgsi version, piglit won't like it for all the same reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing tests, albeit since PK2H won't due those rounding differences I don't know if that one works or not as the piglit test is rather difficult to deal with).
2026-05-05 03:08:05 +02:00 · 2016-01-29 02:49:22 +01:00 · 2016-01-29 02:49:22 +01:00 · 5171ec9ca9
commit 5171ec9ca9
parent dc16086e3b
5 changed files with 119 additions and 2 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
+ * XXX: For GL, would prefer rounding towards nearest(-even).
 */
 LLVMValueRef
 lp_build_float_to_half(struct gallivm_state *gallivm,
@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

+   /*
+    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
+    * directly, without any (x86 or generic) intrinsics.
+    * Albeit the rounding mode cannot be specified (and is undefined,
+    * though in practice on x86 seems to do nearest-even but it may
+    * be dependent on instruction set support), so is essentially
+    * useless.
+    */
+
   if (util_cpu_caps.has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
        LLVMValueRef index = LLVMConstInt(i32t, i, 0);
        LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
 #if 0
-        /* XXX: not really supported by backends */
+        /*
+         * XXX: not really supported by backends.
+         * Even if they would now, rounding mode cannot be specified and
+         * is undefined.
+         */
        LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
 #else
        LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@ -256,6 +256,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
 }


+/**
+ * Un-interleave vector.
+ * This will return a vector consisting of every second element
+ * (depending on lo_hi, beginning at 0 or 1).
+ * The returned vector size (elems and width) will only be half
+ * that of the source vector.
+ */
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+                       unsigned num_elems,
+                       LLVMValueRef a,
+                       unsigned lo_hi)
+{
+   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+   assert(num_elems <= LP_MAX_VECTOR_LENGTH);
+
+   for(i = 0; i < num_elems / 2; ++i)
+      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
+
+   shuffle = LLVMConstVector(elems, num_elems / 2);
+
+   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
+}
+
+
 /**
 * Interleave vector elements.
 *
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
                     LLVMValueRef b,
                     unsigned lo_hi);

+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+                       unsigned num_elems,
+                       LLVMValueRef a,
+                       unsigned lo_hi);

 void
 lp_build_unpack2(struct gallivm_state *gallivm,
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
   /* Ignore deprecated instructions */
   switch (inst->Instruction.Opcode) {

-   case TGSI_OPCODE_UP2H:
   case TGSI_OPCODE_UP2US:
   case TGSI_OPCODE_UP4B:
   case TGSI_OPCODE_UP4UB:
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@ -45,8 +45,10 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_bitarit.h"
 #include "lp_bld_const.h"
+#include "lp_bld_conv.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_pack.h"

 #include "tgsi/tgsi_exec.h"

@ -530,6 +532,75 @@ static struct lp_build_tgsi_action log_action = {
   log_emit	 /* emit */
 };

+/* TGSI_OPCODE_PK2H */
+
+static void
+pk2h_fetch_args(
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   /* src0.x */
+   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                            0, TGSI_CHAN_X);
+   /* src0.y */
+   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                            0, TGSI_CHAN_Y);
+}
+
+static void
+pk2h_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   struct lp_type f16i_t;
+   LLVMValueRef lo, hi, res;
+
+   f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
+   lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
+   hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
+   /* maybe some interleave doubling vector width would be useful... */
+   lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length * 2);
+   hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length * 2);
+   res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
+
+   emit_data->output[emit_data->chan] = res;
+}
+
+static struct lp_build_tgsi_action pk2h_action = {
+   pk2h_fetch_args, /* fetch_args */
+   pk2h_emit        /* emit */
+};
+
+/* TGSI_OPCODE_UP2H */
+
+static void
+up2h_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMContextRef context = gallivm->context;
+   LLVMValueRef lo, hi, res[2], arg;
+   unsigned nr = bld_base->base.type.length;
+   LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context), nr * 2);
+
+   arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
+   lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
+   hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
+   res[0] = lp_build_half_to_float(gallivm, lo);
+   res[1] = lp_build_half_to_float(gallivm, hi);
+
+   emit_data->output[0] = emit_data->output[2] = res[0];
+   emit_data->output[1] = emit_data->output[3] = res[1];
+}
+
+static struct lp_build_tgsi_action up2h_action = {
+   scalar_unary_fetch_args, /* fetch_args */
+   up2h_emit                /* emit */
+};
+
 /* TGSI_OPCODE_LRP */

 static void
@ -1032,10 +1103,12 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
   bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
   bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
   bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
+   bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
   bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
   bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
   bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
   bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
+   bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
   bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;

   bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;