nir: remove sad_u8x4

All uses of sad_u8x4 can be replaced with msad_4x8.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26907>
2026-02-21 10:00:36 +01:00 · 2023-11-20 15:53:39 +00:00 · 2023-11-20 15:53:39 +00:00 · ae54cbeb3f
commit ae54cbeb3f
parent 5fd747a502
6 changed files with 4 additions and 40 deletions
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -284,7 +284,7 @@ summarize_repack(nir_builder *b, nir_def *packed_counts, unsigned num_lds_dwords
    *
    * If the v_dot instruction can't be used, we left-shift the packed bytes.
    * This will shift out the unneeded bytes and shift in zeroes instead,
-    * then we sum them using v_sad_u8.
+    * then we sum them using v_msad_u8.
    */

   nir_def *lane_id = nir_load_subgroup_invocation(b);
@@ -302,7 +302,7 @@ summarize_repack(nir_builder *b, nir_def *packed_counts, unsigned num_lds_dwords
         return nir_udot_4x8_uadd(b, packed, dot_op, nir_imm_int(b, 0));
      } else {
         nir_def *sad_op = nir_ishl(b, nir_ishl(b, packed, shift), shift);
-         return nir_sad_u8x4(b, sad_op, nir_imm_int(b, 0), nir_imm_int(b, 0));
+         return nir_msad_4x8(b, sad_op, nir_imm_int(b, 0), nir_imm_int(b, 0));
      }
   } else if (num_lds_dwords == 2) {
      nir_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int64(b, 0x0101010101010101), shift), shift);
@@ -317,8 +317,8 @@ summarize_repack(nir_builder *b, nir_def *packed_counts, unsigned num_lds_dwords
         return nir_udot_4x8_uadd(b, packed_dw1, nir_unpack_64_2x32_split_y(b, dot_op), sum);
      } else {
         nir_def *sad_op = nir_ishl(b, nir_ishl(b, nir_pack_64_2x32_split(b, packed_dw0, packed_dw1), shift), shift);
-         nir_def *sum = nir_sad_u8x4(b, nir_unpack_64_2x32_split_x(b, sad_op), nir_imm_int(b, 0), nir_imm_int(b, 0));
-         return nir_sad_u8x4(b, nir_unpack_64_2x32_split_y(b, sad_op), nir_imm_int(b, 0), sum);
+         nir_def *sum = nir_msad_4x8(b, nir_unpack_64_2x32_split_x(b, sad_op), nir_imm_int(b, 0), nir_imm_int(b, 0));
+         return nir_msad_4x8(b, nir_unpack_64_2x32_split_y(b, sad_op), nir_imm_int(b, 0), sum);
      }
   } else {
      unreachable("Unimplemented NGG wave count");
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3420,11 +3420,6 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
      }
      break;
   }
-   case nir_op_sad_u8x4: {
-      assert(dst.regClass() == v1);
-      emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
-      break;
-   }
   case nir_op_msad_4x8: {
      assert(dst.regClass() == v1);
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -392,7 +392,6 @@ init_context(isel_context* ctx, nir_shader* shader)
               case nir_op_frexp_sig:
               case nir_op_frexp_exp:
               case nir_op_cube_amd:
-               case nir_op_sad_u8x4:
               case nir_op_msad_4x8:
               case nir_op_udot_4x8_uadd:
               case nir_op_sdot_4x8_iadd:
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -1253,11 +1253,6 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
      break;
   }

-   case nir_op_sad_u8x4:
-      result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
-                                  (LLVMValueRef[]){src[0], src[1], src[2]}, 3, 0);
-      break;
-
   case nir_op_msad_4x8:
      result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.msad.u8", ctx->ac.i32,
                                  (LLVMValueRef[]){src[1], src[0], src[2]}, 3, 0);
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1126,30 +1126,6 @@ if (bits == 0) {
 }
 """)

-triop_horiz("sad_u8x4", 1, 1, 1, 1, """
-uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0;
-uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8;
-uint8_t s0_b2 = (src0.x & 0x00ff0000) >> 16;
-uint8_t s0_b3 = (src0.x & 0xff000000) >> 24;
-
-uint8_t s1_b0 = (src1.x & 0x000000ff) >> 0;
-uint8_t s1_b1 = (src1.x & 0x0000ff00) >> 8;
-uint8_t s1_b2 = (src1.x & 0x00ff0000) >> 16;
-uint8_t s1_b3 = (src1.x & 0xff000000) >> 24;
-
-dst.x = src2.x +
-        (s0_b0 > s1_b0 ? (s0_b0 - s1_b0) : (s1_b0 - s0_b0)) +
-        (s0_b1 > s1_b1 ? (s0_b1 - s1_b1) : (s1_b1 - s0_b1)) +
-        (s0_b2 > s1_b2 ? (s0_b2 - s1_b2) : (s1_b2 - s0_b2)) +
-        (s0_b3 > s1_b3 ? (s0_b3 - s1_b3) : (s1_b3 - s0_b3));
-""", description = """
-Sum of absolute differences with accumulation. Equivalent to AMD's v_sad_u8 instruction.
-
-The first two sources contain packed 8-bit unsigned integers, the instruction will
-calculate the absolute difference of these, and then add them together. There is also a
-third source which is a 32-bit unsigned integer and added to the result.
-""")
-
 triop("msad_4x8", tuint32, "", """
 dst = msad(src0, src1, src2);
 """, description = """
--- a/src/compiler/nir/nir_range_analysis.c
+++ b/src/compiler/nir/nir_range_analysis.c
@@ -1864,7 +1864,6 @@ get_alu_uub(struct analysis_state *state, struct uub_query q, uint32_t *result,
   case nir_op_b2i32:
      *result = 1;
      break;
-   case nir_op_sad_u8x4:
   case nir_op_msad_4x8:
      *result = MIN2((uint64_t)src[2] + 4 * 255, UINT32_MAX);
      break;