From c92dab8e2b6964b6dbd9ea122d7ff819efd45244 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?=
Date: Fri, 28 May 2021 21:53:06 +0200
Subject: [PATCH] nir: Add nir_op_sad_u8x4 which corresponds to AMD's v_sad_u8.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NIR currently doesn't have any intrinsics for a horizontal packed
add, so this one is modeled after AMD's v_sad_u8.

The unsigned upper bound reported by nir_range_analysis is computed in
64 bits and clamped, so a large bound on the accumulator cannot wrap.

Signed-off-by: Timur Kristóf
Reviewed-by: Tony Wasserka
Part-of:
---
 src/compiler/nir/nir_opcodes.py       | 20 ++++++++++++++++++++
 src/compiler/nir/nir_range_analysis.c |  4 ++++
 2 files changed, 24 insertions(+)

diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index eda8d11cb3d..b39c7b57498 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1097,6 +1097,26 @@ if (bits == 0) {
 }
 """)
 
+# Sum of the absolute differences of the four unsigned bytes packed in the
+# low 32 bits of src0 and src1, added to the accumulator src2.
+triop_horiz("sad_u8x4", 1, 1, 1, 1, """
+uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0;
+uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8;
+uint8_t s0_b2 = (src0.x & 0x00ff0000) >> 16;
+uint8_t s0_b3 = (src0.x & 0xff000000) >> 24;
+
+uint8_t s1_b0 = (src1.x & 0x000000ff) >> 0;
+uint8_t s1_b1 = (src1.x & 0x0000ff00) >> 8;
+uint8_t s1_b2 = (src1.x & 0x00ff0000) >> 16;
+uint8_t s1_b3 = (src1.x & 0xff000000) >> 24;
+
+dst.x = src2.x +
+        (s0_b0 > s1_b0 ? (s0_b0 - s1_b0) : (s1_b0 - s0_b0)) +
+        (s0_b1 > s1_b1 ? (s0_b1 - s1_b1) : (s1_b1 - s0_b1)) +
+        (s0_b2 > s1_b2 ? (s0_b2 - s1_b2) : (s1_b2 - s0_b2)) +
+        (s0_b3 > s1_b3 ? (s0_b3 - s1_b3) : (s1_b3 - s0_b3));
+""")
+
 # Combines the first component of each input to make a 3-component vector.
 
 triop_horiz("vec3", 3, 1, 1, 1, """
diff --git a/src/compiler/nir/nir_range_analysis.c b/src/compiler/nir/nir_range_analysis.c
index 501084f14f5..e18d0446e3e 100644
--- a/src/compiler/nir/nir_range_analysis.c
+++ b/src/compiler/nir/nir_range_analysis.c
@@ -1579,6 +1579,10 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,
       case nir_op_u2u32:
          res = MIN2(src0, max);
          break;
+      case nir_op_sad_u8x4:
+         /* Add in 64 bits and clamp so a large src2 bound cannot wrap. */
+         res = MIN2((uint64_t)src2 + 4 * 255, (uint64_t)max);
+         break;
       default:
          res = max;
          break;