intel/vec4: Don't spill fp64 registers more than once

The way we handle spilling for fp64 in vec4 is to emit a series of MOVs which swizzles the data around and then a pair of 32-bit spills. This works great except that the next time we go to pick a spill reg, the compiler isn't smart enough to figure out that the register has already been spilled. Normally we do this by looking at the sources of spill instructions (or destinations of fills) but, because it's separated from the actual value by a MOV, we can't see it. This commit adds a new opcode VEC4_OPCODE_MOV_FOR_SCRATCH which is identical to MOV in semantics except that it lets RA know not to spill again. Fixes: 82c69426a5 "i965/vec4: support basic spilling of 64-bit registers" Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10571> (cherry picked from commit 2db8867943)
2026-05-06 09:28:07 +02:00 · 2021-05-04 15:30:27 -05:00 · 2021-05-04 15:30:27 -05:00 · acb53b268f
commit acb53b268f
parent 29c18f3c3b
9 changed files with 20 additions and 10 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@ -175,7 +175,7 @@
        "description": "intel/vec4: Don't spill fp64 registers more than once",
        "nominated": true,
        "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
        "master_sha": null,
        "because_sha": "82c69426a5a32f9189c8b01059f831c84e9b83a3"
    },
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@ -572,6 +572,7 @@ enum opcode {
   VEC4_OPCODE_PICK_HIGH_32BIT,
   VEC4_OPCODE_SET_LOW_32BIT,
   VEC4_OPCODE_SET_HIGH_32BIT,
+   VEC4_OPCODE_MOV_FOR_SCRATCH,

   FS_OPCODE_DDX_COARSE,
   FS_OPCODE_DDX_FINE,
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@ -379,6 +379,7 @@ namespace {
      case BRW_OPCODE_ADD:
      case BRW_OPCODE_MUL:
      case SHADER_OPCODE_MOV_RELOC_IMM:
+      case VEC4_OPCODE_MOV_FOR_SCRATCH:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
                                  0, 10, 6, 14, 0, 0);
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@ -421,6 +421,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
      return "set_low_32bit";
   case VEC4_OPCODE_SET_HIGH_32BIT:
      return "set_high_32bit";
+   case VEC4_OPCODE_MOV_FOR_SCRATCH:
+      return "mov_for_scratch";

   case FS_OPCODE_DDX_COARSE:
      return "ddx_coarse";
--- a/src/intel/compiler/brw_vec4.h
+++ b/src/intel/compiler/brw_vec4.h
@ -320,6 +320,7 @@ public:

   vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
                                        bool for_write,
+                                        bool for_scratch = false,
                                        bblock_t *block = NULL,
                                        vec4_instruction *ref = NULL);

--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@ -1538,6 +1538,7 @@ generate_code(struct brw_codegen *p,
      switch (inst->opcode) {
      case VEC4_OPCODE_UNPACK_UNIFORM:
      case BRW_OPCODE_MOV:
+      case VEC4_OPCODE_MOV_FOR_SCRATCH:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@ -2138,6 +2138,7 @@ vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
 */
 vec4_instruction *
 vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
+                                 bool for_scratch,
                                 bblock_t *block, vec4_instruction *ref)
 {
   assert(type_sz(src.type) == 8);
@ -2145,6 +2146,8 @@ vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
   assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
   assert(!ref == !block);

+   opcode mov_op = for_scratch ? VEC4_OPCODE_MOV_FOR_SCRATCH : BRW_OPCODE_MOV;
+
   const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
                                   vec4_builder(this).at(block, ref->next);

@ -2156,22 +2159,22 @@ vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
   }

   /* dst+0.XY = src+0.XY */
-   bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src);
+   bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src);

   /* dst+0.ZW = src+1.XY */
   bld.group(4, for_write ? 1 : 0)
-             .MOV(writemask(dst, WRITEMASK_ZW),
+            .emit(mov_op, writemask(dst, WRITEMASK_ZW),
                  swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY));

   /* dst+1.XY = src+0.ZW */
   bld.group(4, for_write ? 0 : 1)
-            .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
-                 swizzle(src, BRW_SWIZZLE_ZWZW));
+            .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
+                  swizzle(src, BRW_SWIZZLE_ZWZW));

   /* dst+1.ZW = src+1.ZW */
   return bld.group(4, 1)
-             .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
-                 byte_offset(src, REG_SIZE));
+            .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
+                  byte_offset(src, REG_SIZE));
 }

 }
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@ -468,6 +468,7 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)

      case SHADER_OPCODE_GFX4_SCRATCH_READ:
      case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
+      case VEC4_OPCODE_MOV_FOR_SCRATCH:
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == VGRF)
               no_spill[inst->src[i].nr] = true;
--- a/src/intel/compiler/brw_vec4_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@ -1400,7 +1400,7 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
-      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
+      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
   }
 }

@ -1445,7 +1445,7 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
-         shuffle_64bit_data(shuffled, temp, true, block, inst);
+         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      uint8_t mask = 0;
@ -1652,7 +1652,7 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,

   if (is_64bit) {
      temp = retype(temp, BRW_REGISTER_TYPE_DF);
-      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
+      shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
   }
 }