nir: add new opcodes to map new v71 packing/conversion instructions

Since v71, broadcom hw includes specific packing/conversion instructions, so this commit adds opcodes to be able to make use of them, especially for image stores: * pack_2x16_to_unorm_2x8/pack_2x16_to_snorm_2x8 (on backend vftounorm8/vftosnorm8): 2x16-bit floating point to 2x8-bit unorm/snorm * f2unorm_16/f2snorm_16 (on backend ftounorm16/ftosnorm16): floating point to 16-bit unorm/snorm * pack_2x16_to_unorm_2x10/pack_2x16_to_unorm_10_2 (on backend vftounorm10lo/vftounorm10hi): used to convert floating point values to a r10g10b10a2 unorm * pack_32_to_r11g11b10 (on backend v11fpack): packs 2 2x16 FP into R11G11B10. * pack_uint_32_to_r10g10b10a2 (on backend v10pack): packs 2 2x16 integers into R10G10B10A2 * pack_4x16_to_4x8 (on backend v8pack): packs 2 2x16-bit integers into 4x8 bits. * pack_2x32_to_2x16 (on backend vpack): 2x32-bit to 2x16-bit integer pack. The latter can be easily confused with the existing pack_32_2x16_split. But note that pack_32_2x16_split receives two 16-bit integers and packs them into a 32-bit integer, whereas the broadcom opcode takes two 32-bit integers, takes the lower halfword of each, and packs them as 2x16 into a 32-bit integer. Interestingly, broadcom also defines a similar one that packs the higher halfwords. It is not used yet. Note that at this point we use agnostic names, even if we add a _v3d suffix as they are only available for broadcom, in order to follow current NIR conventions. Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25726>
2025-12-21 22:20:14 +01:00 · 2023-11-18 10:56:37 +01:00 · 2023-11-18 10:56:37 +01:00 · c0cfa4f53b
commit c0cfa4f53b
parent fc044928b2
2 changed files with 150 additions and 0 deletions
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@ -62,6 +62,8 @@ template = """\
 #include "util/softfloat.h"
 #include "util/bigmath.h"
 #include "util/format/format_utils.h"
 #include "util/format_r11g11b10f.h"
 #include "util/u_math.h"
 #include "nir_constant_expressions.h"
 /**
@ -277,6 +279,102 @@ unpack_half_1x16(uint16_t u)
   return _mesa_half_to_float(u);
 }
 /* Broadcom v3d specific instructions */
 /**
  * Packs the half floats carried by two 32-bit sources into a single
  * r11g11b10f word:
  *
  * dst[10:0]  = float16_to_float11 (src0[15:0])
  * dst[21:11] = float16_to_float11 (src0[31:16])
  * dst[31:22] = float16_to_float10 (src1[15:0])
  *
  * The upper halfword of src1 is not read.
  */
 static uint32_t pack_32_to_r11g11b10_v3d(const uint32_t src0,
                                          const uint32_t src1)
 {
    /* Split the sources into the three half-float bit patterns. */
    const uint16_t red_bits = src0 & 0xffff;
    const uint16_t green_bits = src0 >> 16;
    const uint16_t blue_bits = src1 & 0xffff;
    float channels[3];
    channels[0] = unpack_half_1x16(red_bits);
    channels[1] = unpack_half_1x16(green_bits);
    channels[2] = unpack_half_1x16(blue_bits);
    return float3_to_r11g11b10f(channels);
 }
 /**
  * The three methods below are basically wrappers over pack_s/unorm_1x8/1x16,
  * as they receive the raw bits of the value (a uint16_t half float or a
  * uint32_t float) instead of a float
  */
 /* Decodes the half-float bits in val and packs the result with pack_snorm_1x8. */
 static inline uint8_t _mesa_half_to_snorm8(uint16_t val)
 {
    const float decoded = _mesa_half_to_float(val);
    return pack_snorm_1x8(decoded);
 }
 /* Reinterprets val as the bits of a 32-bit float and packs it with pack_snorm_1x16. */
 static uint16_t _mesa_float_to_snorm16(uint32_t val)
 {
    return pack_snorm_1x16(uif(val));
 }
 /* Reinterprets val as the bits of a 32-bit float and packs it with pack_unorm_1x16. */
 static uint16_t _mesa_float_to_unorm16(uint32_t val)
 {
    return pack_unorm_1x16(uif(val));
 }
 /* Takes the bits of a 32-bit float and returns the matching half-float bits. */
 static inline uint32_t float_pack16_v3d(uint32_t f32)
 {
    const float value = uif(f32);
    return _mesa_float_to_half(value);
 }
 /* Takes half-float bits and returns the bits of the equivalent 32-bit float. */
 static inline uint32_t float_unpack16_v3d(uint32_t f16)
 {
    const float value = _mesa_half_to_float(f16);
    return fui(value);
 }
 /*
  * Converts two 32-bit float bit patterns to half floats and packs them in
  * one word: a goes to bits [15:0], b goes to bits [31:16].
  */
 static inline uint32_t vfpack_v3d(uint32_t a, uint32_t b)
 {
    const uint32_t low_half = float_pack16_v3d(a);
    const uint32_t high_half = float_pack16_v3d(b);
    return high_half << 16 | low_half;
 }
 /*
  * Applies SATURATE to each of the two half floats packed in a and repacks
  * the results in the same halfword positions.
  */
 static inline uint32_t vfsat_v3d(uint32_t a)
 {
    /* Decode each lane once so SATURATE works on a plain float value. */
    const float low_value = _mesa_half_to_float(a & 0xffff);
    const float high_value = _mesa_half_to_float(a >> 16);
    return vfpack_v3d(fui(SATURATE(low_value)), fui(SATURATE(high_value)));
 }
 /* Multiplies two 32-bit floats given, and returned, as raw bit patterns. */
 static inline uint32_t fmul_v3d(uint32_t a, uint32_t b)
 {
    const float lhs = uif(a);
    const float rhs = uif(b);
    return fui(lhs * rhs);
 }
 /*
  * Lane-wise multiply of two words holding 2x16 half floats: the low
  * halfwords are multiplied together, the high halfwords are multiplied
  * together, and both products are repacked as half floats.
  */
 static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
 {
    const uint32_t a_low = float_unpack16_v3d(a & 0xffff);
    const uint32_t b_low = float_unpack16_v3d(b & 0xffff);
    const uint32_t a_high = float_unpack16_v3d(a >> 16);
    const uint32_t b_high = float_unpack16_v3d(b >> 16);
    return vfpack_v3d(fmul_v3d(a_low, b_low), fmul_v3d(a_high, b_high));
 }
 /*
  * Convert 2x16-bit floating point to 2x10-bit unorm.
  *
  * Both lanes are saturated and then multiplied, as half floats, by the
  * half-float bit pattern 0x03ff.
  * NOTE(review): presumably 0x03ff is chosen so that the bit pattern of the
  * product is the 10-bit unorm value -- confirm against the hardware docs.
  */
 static uint32_t pack_2x16_to_unorm_2x10(uint32_t src0)
 {
    const uint32_t saturated = vfsat_v3d(src0);
    return vfmul_v3d(saturated, 0x03ff03ff);
 }
 /*
  * Convert 2x16-bit floating point to one 2-bit and one
  * 10-bit unorm.
  *
  * Both lanes are saturated first; the low lane is then multiplied by the
  * half-float bit pattern 0x03ff and the high lane by 0x0003.
  */
 static uint32_t pack_2x16_to_unorm_10_2(uint32_t src0)
 {
    const uint32_t saturated = vfsat_v3d(src0);
    return vfmul_v3d(saturated, 0x000303ff);
 }
 /* Some typed vector structures to make things like src0.y work */
 typedef int8_t int1_t;
 typedef uint8_t uint1_t;
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@ -1413,6 +1413,58 @@ for (int i = 0; i < 32; i += 8) {
 }
 """)
 # v3d-specific opcodes
 # v3d-specific (v71) instruction that takes the 2x16 floating point values
 # held in two 32-bit sources and packs them into r11g11b10 bits, rounding to
 # nearest even:
 #  dst[10:0]  = float16_to_float11 (src0[15:0])
 #  dst[21:11] = float16_to_float11 (src0[31:16])
 #  dst[31:22] = float16_to_float10 (src1[15:0])
 # Constant folding calls the C helper that has the same name as the opcode.
 binop_convert(
    "pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
    "pack_32_to_r11g11b10_v3d(src0, src1)",
 )
 # v3d-specific (v71) instruction that packs 2x32 bit integers into 2x16 bit.
 # Unlike pack_32_2x16_split, both sources are 32-bit: the lower halfword of
 # src0 lands in dst[15:0] and the lower halfword of src1 in dst[31:16].
 binop_horiz(
    "pack_2x32_to_2x16_v3d",
    1, tuint32,
    1, tuint32,
    1, tuint32,
    "(src0.x & 0xffff) | (src1.x << 16)",
 )
 # v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
 # r10g10b10a2:
 #   dst[9:0]   = src0[9:0]
 #   dst[19:10] = src0[25:16]
 #   dst[29:20] = src1[9:0]
 #   dst[31:30] = src1[17:16]
 # The alpha field is only 2 bits wide, so it is masked with 0x3 to match the
 # layout above instead of relying on the shift to drop the extra bits.
 binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
               "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | "
               "(src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3) << 30")
 # v3d-specific (v71) instruction that keeps the low byte of every halfword
 # of the two 2x16 bit integer sources and packs them into 4x8 bits:
 #   dst[7:0]   = src0[7:0]
 #   dst[15:8]  = src0[23:16]
 #   dst[23:16] = src1[7:0]
 #   dst[31:24] = src1[23:16]
 opcode(
    "pack_4x16_to_4x8_v3d", 0, tuint32,
    [0, 0], [tuint32, tuint32],
    False, "",
    "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | "
    "(src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8",
 )
 # v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit
 # unorm/snorm. The low half float is converted into dst[7:0] and the high one
 # into dst[23:16].
 unop("pack_2x16_to_unorm_2x8_v3d", tuint32,
      "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
 # _mesa_half_to_snorm returns a signed int (see util/format/format_utils.h).
 # Each lane is therefore masked to 8 bits and made unsigned before combining:
 # otherwise a negative low lane sign-extends over the high lane, and a
 # negative high lane gets left-shifted, which is undefined behaviour in C.
 # NOTE(review): 8-bit lanes are assumed from the opcode name and the unorm
 # variant above; confirm what the hardware leaves in bits [15:8] and [31:24].
 unop("pack_2x16_to_snorm_2x8_v3d", tuint32,
      "(uint32_t)(_mesa_half_to_snorm(src0 & 0xffff, 8) & 0xff) | "
      "((uint32_t)(_mesa_half_to_snorm(src0 >> 16, 8) & 0xff) << 16)")
 # v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit
 # unorm/snorm. src0 carries the raw float bits; the C helpers named in the
 # expressions do the conversion.
 unop("f2unorm_16_v3d", tuint32,
      "_mesa_float_to_unorm16(src0)")
 unop("f2snorm_16_v3d", tuint32,
      "_mesa_float_to_snorm16(src0)")
 # v3d-specific (v71) instruction to convert 2x16 bit floating points to
 # 2x10 bit unorm; constant folding calls pack_2x16_to_unorm_2x10().
 unop("pack_2x16_to_unorm_2x10_v3d", tuint32,
      "pack_2x16_to_unorm_2x10(src0)")
 # v3d-specific (v71) instruction to convert 2x16 bit floating points to one
 # 2-bit and one 10 bit unorm; constant folding calls
 # pack_2x16_to_unorm_10_2().
 unop("pack_2x16_to_unorm_10_2_v3d", tuint32,
      "pack_2x16_to_unorm_10_2(src0)")
 # Mali-specific opcodes
 # fsat_signed_mali bounds src0 to the range [-1.0, 1.0]; fclamp_pos_mali
 # bounds src0 from below at 0.0.
 unop("fsat_signed_mali", tfloat, "fmin(fmax(src0, -1.0), 1.0)")
 unop("fclamp_pos_mali", tfloat, "fmax(src0, 0.0)")