intel/brw: Add SHADER_OPCODE_QUAD_SWAP

For the horizontal, vertical and diagonal variants.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31053>
2025-12-27 08:20:12 +01:00 · 2024-09-05 17:37:25 -07:00 · 2024-09-05 17:37:25 -07:00 · 8474dc853d
commit 8474dc853d
parent 73fc29b25c
7 changed files with 106 additions and 52 deletions
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@ -451,6 +451,13 @@ enum opcode {
    */
   SHADER_OPCODE_SEL_EXEC,

+   /* Swap values inside a quad based on the direction.
+    *
+    * Source 0: Value.
+    * Source 1: Immediate with brw_swap_direction.
+    */
+   SHADER_OPCODE_QUAD_SWAP,
+
   /* This turns into an align16 mov from src0 to dst with a swizzle
    * provided as an immediate in src1.
    */
@ -723,6 +730,12 @@ enum brw_reduce_op {
   BRW_REDUCE_OP_XOR,
 };

+/* Direction operand of SHADER_OPCODE_QUAD_SWAP (its immediate source 1).
+ *
+ * The XOR noted on each entry is the relation between the index of an
+ * invocation and the index of the invocation it trades values with, as
+ * implemented by brw_lower_quad_swap().
+ */
+enum brw_swap_direction {
+   BRW_SWAP_HORIZONTAL = 0, /* Partner is invocation ^ 1. */
+   BRW_SWAP_VERTICAL   = 1, /* Partner is invocation ^ 2. */
+   BRW_SWAP_DIAGONAL   = 2, /* Partner is invocation ^ 3. */
+};
+
 enum ENUM_PACKED brw_predicate {
   BRW_PREDICATE_NONE                =  0,
   BRW_PREDICATE_NORMAL              =  1,
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -249,6 +249,9 @@ fs_inst::is_control_source(unsigned arg) const
             arg != MEMORY_LOGICAL_DATA0 &&
             arg != MEMORY_LOGICAL_DATA1;

+   case SHADER_OPCODE_QUAD_SWAP:
+      return arg == 1;
+
   default:
      return false;
   }
@ -325,6 +328,7 @@ fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_VOTE_EQUAL:
   case SHADER_OPCODE_BALLOT:
+   case SHADER_OPCODE_QUAD_SWAP:
      return false;
   default:
      return true;
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@ -649,6 +649,7 @@ instruction_requires_packed_data(fs_inst *inst)
   case FS_OPCODE_DDY_FINE:
   case FS_OPCODE_DDY_COARSE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
+   case SHADER_OPCODE_QUAD_SWAP:
      return true;
   default:
      return false;
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@ -6699,61 +6699,21 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
      break;
   }

-   case nir_intrinsic_quad_swap_horizontal: {
-      const brw_reg value = get_nir_src(ntb, instr->src[0]);
-      const brw_reg tmp = bld.vgrf(value.type);
-
-      const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
-
-      const brw_reg src_left = horiz_stride(value, 2);
-      const brw_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
-      const brw_reg tmp_left = horiz_stride(tmp, 2);
-      const brw_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
-
-      ubld.MOV(tmp_left, src_right);
-      ubld.MOV(tmp_right, src_left);
-
-      bld.MOV(retype(dest, value.type), tmp);
-      break;
-   }
-
-   case nir_intrinsic_quad_swap_vertical: {
-      const brw_reg value = get_nir_src(ntb, instr->src[0]);
-      if (nir_src_bit_size(instr->src[0]) == 32) {
-         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
-         const brw_reg tmp = bld.vgrf(value.type);
-         const fs_builder ubld = bld.exec_all();
-         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
-                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
-         bld.MOV(retype(dest, value.type), tmp);
-      } else {
-         /* For larger data types, we have to either emit dispatch_width many
-          * MOVs or else fall back to doing indirects.
-          */
-         brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x2));
-         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
-      }
-      break;
-   }
-
+   case nir_intrinsic_quad_swap_horizontal:
+   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal: {
      const brw_reg value = get_nir_src(ntb, instr->src[0]);
-      if (nir_src_bit_size(instr->src[0]) == 32) {
-         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
-         const brw_reg tmp = bld.vgrf(value.type);
-         const fs_builder ubld = bld.exec_all();
-         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
-                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
-         bld.MOV(retype(dest, value.type), tmp);
-      } else {
-         /* For larger data types, we have to either emit dispatch_width many
-          * MOVs or else fall back to doing indirects.
-          */
-         brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x3));
-         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+
+      enum brw_swap_direction dir;
+      switch (instr->intrinsic) {
+      case nir_intrinsic_quad_swap_horizontal: dir = BRW_SWAP_HORIZONTAL; break;
+      case nir_intrinsic_quad_swap_vertical:   dir = BRW_SWAP_VERTICAL;   break;
+      case nir_intrinsic_quad_swap_diagonal:   dir = BRW_SWAP_DIAGONAL;   break;
+      default: unreachable("invalid quad swap");
      }
+
+      bld.emit(SHADER_OPCODE_QUAD_SWAP, retype(dest, value.type),
+               value, brw_imm_ud(dir));
      break;
   }

--- a/src/intel/compiler/brw_fs_validate.cpp
+++ b/src/intel/compiler/brw_fs_validate.cpp
@ -238,6 +238,7 @@ brw_validate_instruction_phase(const fs_visitor &s, fs_inst *inst)
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_VOTE_EQUAL:
   case SHADER_OPCODE_BALLOT:
+   case SHADER_OPCODE_QUAD_SWAP:
      invalid_from = BRW_SHADER_PHASE_AFTER_EARLY_LOWERING;
      break;

--- a/src/intel/compiler/brw_lower_subgroup_ops.cpp
+++ b/src/intel/compiler/brw_lower_subgroup_ops.cpp
@ -539,6 +539,62 @@ brw_lower_ballot(fs_visitor &s, bblock_t *block, fs_inst *inst)
   return true;
 }

+/* Expand a SHADER_OPCODE_QUAD_SWAP into simpler instructions.
+ *
+ * Source 0 of the instruction is the value and source 1 is an immediate
+ * holding a brw_swap_direction.  Depending on the direction and on the bit
+ * size of the value, the swap becomes a pair of strided MOVs, a
+ * QUAD_SWIZZLE, or a SHUFFLE driven by an XORed invocation index.  The
+ * original instruction is removed afterwards.
+ *
+ * Always returns true, i.e. progress was made.
+ */
+static bool
+brw_lower_quad_swap(fs_visitor &s, bblock_t *block, fs_inst *inst)
+{
+   const fs_builder bld(&s, block, inst);
+
+   /* The result is written using the type of the value being swapped. */
+   assert(inst->dst.type == inst->src[0].type);
+   brw_reg dest = inst->dst;
+   brw_reg src = inst->src[0];
+
+   assert(inst->src[1].file == IMM);
+   const enum brw_swap_direction dir =
+      static_cast<enum brw_swap_direction>(inst->src[1].ud);
+
+   if (dir == BRW_SWAP_HORIZONTAL) {
+      const brw_reg swapped = bld.vgrf(src.type);
+
+      /* Each MOV below runs at half the dispatch width with all channels
+       * enabled and uses stride-2 views: one view starts at element 0, the
+       * other at element 1.
+       */
+      const fs_builder half = bld.exec_all().group(s.dispatch_width / 2, 0);
+
+      const brw_reg src_first = horiz_stride(src, 2);
+      const brw_reg src_second = horiz_stride(horiz_offset(src, 1), 2);
+
+      half.MOV(horiz_stride(swapped, 2), src_second);
+      half.MOV(horiz_stride(horiz_offset(swapped, 1), 2), src_first);
+
+      bld.MOV(retype(dest, src.type), swapped);
+   } else if (dir == BRW_SWAP_VERTICAL || dir == BRW_SWAP_DIAGONAL) {
+      const bool vertical = dir == BRW_SWAP_VERTICAL;
+
+      if (brw_type_size_bits(src.type) == 32) {
+         /* A 32-bit value can be permuted with one SIMD4x2 swizzle. */
+         unsigned swizzle;
+         if (vertical)
+            swizzle = BRW_SWIZZLE4(2,3,0,1);
+         else
+            swizzle = BRW_SWIZZLE4(3,2,1,0);
+
+         const brw_reg swapped = bld.vgrf(src.type);
+         bld.exec_all().emit(SHADER_OPCODE_QUAD_SWIZZLE, swapped, src,
+                             brw_imm_ud(swizzle));
+         bld.MOV(dest, swapped);
+      } else {
+         /* Any other size would need dispatch_width many MOVs, so use an
+          * indirect read instead: every invocation fetches the value of
+          * the invocation whose index is its own XORed with the mask.
+          */
+         const unsigned lane_xor = vertical ? 0x2 : 0x3;
+
+         brw_reg partner = bld.vgrf(BRW_TYPE_W);
+         const brw_reg self = bld.LOAD_SUBGROUP_INVOCATION();
+         bld.XOR(partner, self, brw_imm_w(lane_xor));
+         bld.emit(SHADER_OPCODE_SHUFFLE, dest, src, partner);
+      }
+   }
+
+   inst->remove(block);
+   return true;
+}
+
 bool
 brw_fs_lower_subgroup_ops(fs_visitor &s)
 {
@ -565,6 +621,10 @@ brw_fs_lower_subgroup_ops(fs_visitor &s)
         progress |= brw_lower_ballot(s, block, inst);
         break;

+      case SHADER_OPCODE_QUAD_SWAP:
+         progress |= brw_lower_quad_swap(s, block, inst);
+         break;
+
      default:
         /* Nothing to do. */
         break;
--- a/src/intel/compiler/brw_print.cpp
+++ b/src/intel/compiler/brw_print.cpp
@ -305,6 +305,8 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
      return "vote_equal";
   case SHADER_OPCODE_BALLOT:
      return "ballot";
+   case SHADER_OPCODE_QUAD_SWAP:
+      return "quad_swap";
   }

   unreachable("not reached");
@ -611,6 +613,19 @@ brw_print_instruction_to_file(const fs_visitor &s, const fs_inst *inst, FILE *fi

         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
      }
+
+      if (inst->opcode == SHADER_OPCODE_QUAD_SWAP && i == 1) {
+         assert(inst->src[i].file == IMM);
+         const char *name = NULL;
+         switch (inst->src[i].ud) {
+         case BRW_SWAP_HORIZONTAL: name = "horizontal"; break;
+         case BRW_SWAP_VERTICAL:   name = "vertical";   break;
+         case BRW_SWAP_DIAGONAL:   name = "diagonal";   break;
+         default:
+            unreachable("invalid brw_swap_direction");
+         }
+         fprintf(file, " (%s)", name);
+      }
   }

   fprintf(file, " ");