nir: add new opcodes to map new v71 packing/conversion instructions

Since v71, Broadcom hardware includes specific packing/conversion
instructions, so this commit adds opcodes to make use of them,
especially for image stores:

   * pack_2x16_to_unorm_2x8/pack_2x16_to_snorm_2x8 (on backend
     vftounorm8/vftosnorm8): 2x16-bit floating point to 2x8-bit
     unorm/snorm

   * f2unorm_16/f2snorm_16 (on backend ftounorm16/ftosnorm16):
     floating point to 16-bit unorm/snorm

   * pack_2x16_to_unorm_2x10/pack_2x16_to_unorm_10_2 (on backend
     vftounorm10lo/vftounorm10hi): used to convert a floating point to
     a r10g10b10a2 unorm

   * pack_32_to_r11g11b10 (on backend v11fpack): packs 2 2x16 FP into
     R11G11B10.

   * pack_uint_32_to_r10g10b10a2 (on backend v10pack): pack 2 2x16
     integer into R10G10B10A2

   * pack_4x16_to_4x8 (on backend v8pack): packs 2 2x16 bit integer
     into 4x8 bits.

   * pack_2x32_to_2x16 (on backend vpack): 2x32 bit to 2x16 integer
     pack

The latter can easily be confused with the existing
pack_32_2x16_split. Note, however, that pack_32_2x16_split receives two
16-bit integers and packs them into a 32-bit integer, whereas the
Broadcom opcode takes two 32-bit integers, takes the lower halfword of
each, and packs them as 2x16 into a 32-bit integer.

Interestingly broadcom also defines a similar one that packs the
higher halfword. Not used yet.

Note that at this point we use agnostic names, even if we add a _v3d
suffix as they are only available for broadcom, in order to follow
current NIR conventions.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25726>
This commit is contained in:
Alejandro Piñeiro 2023-11-18 10:56:37 +01:00 committed by Marge Bot
parent fc044928b2
commit c0cfa4f53b
2 changed files with 150 additions and 0 deletions

View file

@ -62,6 +62,8 @@ template = """\
#include "util/softfloat.h" #include "util/softfloat.h"
#include "util/bigmath.h" #include "util/bigmath.h"
#include "util/format/format_utils.h" #include "util/format/format_utils.h"
#include "util/format_r11g11b10f.h"
#include "util/u_math.h"
#include "nir_constant_expressions.h" #include "nir_constant_expressions.h"
/** /**
@ -277,6 +279,102 @@ unpack_half_1x16(uint16_t u)
return _mesa_half_to_float(u); return _mesa_half_to_float(u);
} }
/* Broadcom v3d specific instructions */
/**
 * Builds an r11g11b10f word out of three half-floats carried in two 32-bit
 * sources:
 *
 *    dst[10:0]  = float16_to_float11 (src0[15:0])
 *    dst[21:11] = float16_to_float11 (src0[31:16])
 *    dst[31:22] = float16_to_float10 (src1[15:0])
 *
 * The upper halfword of src1 is not read.
 */
static uint32_t pack_32_to_r11g11b10_v3d(const uint32_t src0,
                                         const uint32_t src1)
{
   float channels[3];

   /* Widen each 16-bit lane to a float before handing it to the packer. */
   channels[0] = unpack_half_1x16((src0 & 0xffff));
   channels[1] = unpack_half_1x16((src0 >> 16));
   channels[2] = unpack_half_1x16((src1 & 0xffff));

   return float3_to_r11g11b10f(channels);
}
/**
 * The three functions below are basically wrappers over
 * pack_snorm_1x8/pack_snorm_1x16/pack_unorm_1x16: they receive the raw bit
 * pattern of the value (a uint16_t half-float or a uint32_t float) instead
 * of a float.
 */
/* Half-float flavour of pack_snorm_1x8(): takes the raw 16-bit pattern,
 * widens it with _mesa_half_to_float() and packs the resulting float.
 */
static inline uint8_t _mesa_half_to_snorm8(uint16_t val)
{
   const float widened = _mesa_half_to_float(val);
   return pack_snorm_1x8(widened);
}
static uint16_t _mesa_float_to_snorm16(uint32_t val)
{
union fi aux;
aux.ui = val;
return pack_snorm_1x16(aux.f);
}
static uint16_t _mesa_float_to_unorm16(uint32_t val)
{
union fi aux;
aux.ui = val;
return pack_unorm_1x16(aux.f);
}
/* Narrows a 32-bit float, passed as its bit pattern, to a half-float.
 * The half-float is returned in a uint32_t, as produced by
 * _mesa_float_to_half().
 */
static inline uint32_t float_pack16_v3d(uint32_t f32)
{
   const float value = uif(f32);
   return _mesa_float_to_half(value);
}
/* Widens a half-float to a 32-bit float and returns the float's bit
 * pattern (via fui()).
 */
static inline uint32_t float_unpack16_v3d(uint32_t f16)
{
   const float widened = _mesa_half_to_float(f16);
   return fui(widened);
}
/* Converts two 32-bit float bit patterns to half-floats and packs them
 * into one word: a lands in bits [15:0], b in bits [31:16].
 */
static inline uint32_t vfpack_v3d(uint32_t a, uint32_t b)
{
   const uint32_t lo_half = float_pack16_v3d(a);
   const uint32_t hi_half = float_pack16_v3d(b);

   return (hi_half << 16) | lo_half;
}
/* Applies SATURATE() to each of the two half-floats packed in a and
 * repacks the results as 2x16 half-floats.
 */
static inline uint32_t vfsat_v3d(uint32_t a)
{
   const float lo = _mesa_half_to_float(a & 0xffff);
   const float hi = _mesa_half_to_float(a >> 16);

   return vfpack_v3d(fui(SATURATE(lo)), fui(SATURATE(hi)));
}
/* Multiplies two 32-bit floats that are passed, and returned, as bit
 * patterns (uif() on the way in, fui() on the way out).
 */
static inline uint32_t fmul_v3d(uint32_t a, uint32_t b)
{
   const float product = uif(a) * uif(b);
   return fui(product);
}
/* Lane-wise multiply of two 2x16 half-float vectors: each pair of lanes is
 * widened to 32-bit float, multiplied, and the two products are packed back
 * into 2x16 half-floats.
 */
static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
{
   const uint32_t a_lo = float_unpack16_v3d(a & 0xffff);
   const uint32_t b_lo = float_unpack16_v3d(b & 0xffff);
   const uint32_t a_hi = float_unpack16_v3d(a >> 16);
   const uint32_t b_hi = float_unpack16_v3d(b >> 16);

   return vfpack_v3d(fmul_v3d(a_lo, b_lo), fmul_v3d(a_hi, b_hi));
}
/* Convert 2x16-bit floating point to 2x10-bit unorm.
 *
 * Both lanes are saturated and then multiplied, as half-floats, by the bit
 * pattern 0x03ff. In IEEE 754 binary16 that pattern is the denormal
 * 1023 * 2^-24, so the bit pattern of each product presumably reads directly
 * as the 10-bit unorm value -- NOTE(review): confirm rounding against the
 * hardware.
 */
static uint32_t pack_2x16_to_unorm_2x10(uint32_t src0)
{
   const uint32_t saturated = vfsat_v3d(src0);
   return vfmul_v3d(saturated, 0x03ff03ff);
}
/*
 * Convert 2x16-bit floating point to one 2-bit and one
 * 10-bit unorm.
 *
 * Both lanes are saturated; the low lane is then multiplied by the
 * half-float bit pattern 0x03ff and the high lane by 0x0003, so the low
 * lane carries the 10-bit value and the high lane the 2-bit one.
 */
static uint32_t pack_2x16_to_unorm_10_2(uint32_t src0)
{
   const uint32_t saturated = vfsat_v3d(src0);
   return vfmul_v3d(saturated, 0x000303ff);
}
/* Some typed vector structures to make things like src0.y work */ /* Some typed vector structures to make things like src0.y work */
typedef int8_t int1_t; typedef int8_t int1_t;
typedef uint8_t uint1_t; typedef uint8_t uint1_t;

View file

@ -1413,6 +1413,58 @@ for (int i = 0; i < 32; i += 8) {
} }
""") """)
# v3d-specific opcodes
# v3d-specific (v71) instruction that packs three half-floats, carried in two
# 32-bit sources (2x16 each), into r11g11b10 bits, rounding to nearest even:
#    dst[10:0]  = float16_to_float11 (src0[15:0])
#    dst[21:11] = float16_to_float11 (src0[31:16])
#    dst[31:22] = float16_to_float10 (src1[15:0])
# src1[31:16] does not appear in the layout above. The constant expression
# calls the C helper of the same name.
binop_convert("pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
"pack_32_to_r11g11b10_v3d(src0, src1)")
# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
# difference with pack_32_2x16_split is that the sources are 32-bit too: it
# receives two 32-bit integers and packs the lower halfword of each as 2x16
# into a 32-bit integer:
#    dst[15:0]  = src0[15:0]
#    dst[31:16] = src1[15:0]
# src1 needs no mask because the << 16 moves its lower halfword to the top;
# NOTE(review): this assumes the expression is evaluated in 32 bits.
binop_horiz("pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
"(src0.x & 0xffff) | (src1.x << 16)")
# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
# r10g10b10a2:
#    dst[9:0]   = src0[9:0]
#    dst[19:10] = src0[25:16]
#    dst[29:20] = src1[9:0]
#    dst[31:30] = src1[17:16]
# NOTE(review): the last term masks with 0x3ff rather than 0x3. Only the two
# lowest bits survive the << 30 provided the expression is evaluated as a
# 32-bit unsigned value -- confirm, or narrow the mask to match the layout.
binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
"(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits,
# keeping the low byte of every 16-bit lane:
#    dst[7:0]   = src0[7:0]
#    dst[15:8]  = src0[23:16]
#    dst[23:16] = src1[7:0]
#    dst[31:24] = src1[23:16]
opcode("pack_4x16_to_4x8_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
False, "",
"(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")
# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit
# unorm/snorm. Each half-float lane is converted on its own: the result for
# the low lane is left at bit 0 and the result for the high lane is shifted
# to bit 16.
unop("pack_2x16_to_unorm_2x8_v3d", tuint32,
"_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
# NOTE(review): if _mesa_half_to_snorm() returns a signed int, a negative
# low-lane result would be sign-extended over the high lane by the OR, and the
# << 16 would shift a negative value -- confirm against
# util/format/format_utils.h.
unop("pack_2x16_to_snorm_2x8_v3d", tuint32,
"_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")
# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit
# unorm/snorm. src0 is declared tuint32, so the constant expressions pass the
# float's bit pattern to _mesa_float_to_unorm16()/_mesa_float_to_snorm16().
unop("f2unorm_16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
unop("f2snorm_16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
# v3d-specific (v71) instruction to convert 2x16 bit floating points to 2x10
# bit unorm. The constant expression defers to the pack_2x16_to_unorm_2x10()
# C helper.
unop("pack_2x16_to_unorm_2x10_v3d", tuint32, "pack_2x16_to_unorm_2x10(src0)")
# v3d-specific (v71) instruction to convert 2x16 bit floating points to one
# 2-bit and one 10-bit unorm. The constant expression defers to the
# pack_2x16_to_unorm_10_2() C helper.
unop("pack_2x16_to_unorm_10_2_v3d", tuint32, "pack_2x16_to_unorm_10_2(src0)")
# Mali-specific opcodes # Mali-specific opcodes
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)")) unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)")) unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))