From acb53b268f79375220f0d723f152a69251f3126d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Tue, 4 May 2021 15:30:27 -0500
Subject: [PATCH] intel/vec4: Don't spill fp64 registers more than once

The way we handle spilling for fp64 in vec4 is to emit a series of MOVs
which swizzles the data around and then a pair of 32-bit spills.  This
works great except that the next time we go to pick a spill reg, the
compiler isn't smart enough to figure out that the register has already
been spilled.  Normally we do this by looking at the sources of spill
instructions (or destinations of fills) but, because it's separated from
the actual value by a MOV, we can't see it.  This commit adds a new
opcode VEC4_OPCODE_MOV_FOR_SCRATCH which is identical to MOV in
semantics except that it lets RA know not to spill again.

Fixes: 82c69426a5a3 "i965/vec4: support basic spilling of 64-bit registers"
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10571>
(cherry picked from commit 2db88679432bd34a2c4ed761baec747192fa3e60)
---
 .pick_status.json                            |  2 +-
 src/intel/compiler/brw_eu_defines.h          |  1 +
 src/intel/compiler/brw_ir_performance.cpp    |  1 +
 src/intel/compiler/brw_shader.cpp            |  2 ++
 src/intel/compiler/brw_vec4.h                |  1 +
 src/intel/compiler/brw_vec4_generator.cpp    |  1 +
 src/intel/compiler/brw_vec4_nir.cpp          | 15 +++++++++------
 src/intel/compiler/brw_vec4_reg_allocate.cpp |  1 +
 src/intel/compiler/brw_vec4_visitor.cpp      |  6 +++---
 9 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 06af7362e4b..5ae48edbabe 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -175,7 +175,7 @@
         "description": "intel/vec4: Don't spill fp64 registers more than once",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": "82c69426a5a32f9189c8b01059f831c84e9b83a3"
     },
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index e9b3827599e..a5ebaaa46d2 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -572,6 +572,7 @@ enum opcode {
    VEC4_OPCODE_PICK_HIGH_32BIT,
    VEC4_OPCODE_SET_LOW_32BIT,
    VEC4_OPCODE_SET_HIGH_32BIT,
+   VEC4_OPCODE_MOV_FOR_SCRATCH,
 
    FS_OPCODE_DDX_COARSE,
    FS_OPCODE_DDX_FINE,
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp
index 6198eb81a85..5e57f0d708f 100644
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -379,6 +379,7 @@ namespace {
       case BRW_OPCODE_ADD:
       case BRW_OPCODE_MUL:
       case SHADER_OPCODE_MOV_RELOC_IMM:
+      case VEC4_OPCODE_MOV_FOR_SCRATCH:
          if (devinfo->ver >= 11) {
             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
                                   0, 10, 6, 14, 0, 0);
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index efd9b8bcb02..1ea61205b09 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -421,6 +421,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
       return "set_low_32bit";
    case VEC4_OPCODE_SET_HIGH_32BIT:
       return "set_high_32bit";
+   case VEC4_OPCODE_MOV_FOR_SCRATCH:
+      return "mov_for_scratch";
 
    case FS_OPCODE_DDX_COARSE:
       return "ddx_coarse";
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
index 9cfb48596a2..b928239f1ca 100644
--- a/src/intel/compiler/brw_vec4.h
+++ b/src/intel/compiler/brw_vec4.h
@@ -320,6 +320,7 @@ public:
 
    vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
                                         bool for_write,
+                                        bool for_scratch = false,
                                         bblock_t *block = NULL,
                                         vec4_instruction *ref = NULL);
 
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 1266095eb8b..bd7d83f3ce6 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1538,6 +1538,7 @@ generate_code(struct brw_codegen *p,
       switch (inst->opcode) {
       case VEC4_OPCODE_UNPACK_UNIFORM:
       case BRW_OPCODE_MOV:
+      case VEC4_OPCODE_MOV_FOR_SCRATCH:
          brw_MOV(p, dst, src[0]);
          break;
       case BRW_OPCODE_ADD:
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 966bc223f84..188175adc41 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -2138,6 +2138,7 @@ vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
  */
 vec4_instruction *
 vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
+                                 bool for_scratch,
                                  bblock_t *block, vec4_instruction *ref)
 {
    assert(type_sz(src.type) == 8);
@@ -2145,6 +2146,8 @@ vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
    assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
    assert(!ref == !block);
 
+   opcode mov_op = for_scratch ? VEC4_OPCODE_MOV_FOR_SCRATCH : BRW_OPCODE_MOV;
+
    const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
                                    vec4_builder(this).at(block, ref->next);
 
@@ -2156,22 +2159,22 @@ vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
    }
 
    /* dst+0.XY = src+0.XY */
-   bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src);
+   bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src);
 
    /* dst+0.ZW = src+1.XY */
    bld.group(4, for_write ? 1 : 0)
-             .MOV(writemask(dst, WRITEMASK_ZW),
+            .emit(mov_op, writemask(dst, WRITEMASK_ZW),
                   swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY));
 
    /* dst+1.XY = src+0.ZW */
    bld.group(4, for_write ? 0 : 1)
-            .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
-                 swizzle(src, BRW_SWIZZLE_ZWZW));
+            .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
+                  swizzle(src, BRW_SWIZZLE_ZWZW));
 
    /* dst+1.ZW = src+1.ZW */
    return bld.group(4, 1)
-             .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
-                 byte_offset(src, REG_SIZE));
+            .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
+                  byte_offset(src, REG_SIZE));
 }
 
 }
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index 35c47fc37ca..19917124d79 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -468,6 +468,7 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
 
       case SHADER_OPCODE_GFX4_SCRATCH_READ:
       case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
+      case VEC4_OPCODE_MOV_FOR_SCRATCH:
          for (int i = 0; i < 3; i++) {
             if (inst->src[i].file == VGRF)
                no_spill[inst->src[i].nr] = true;
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
index cfad712e01a..c20d620a85a 100644
--- a/src/intel/compiler/brw_vec4_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -1400,7 +1400,7 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
       vec4_instruction *last_read =
          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
       emit_before(block, inst, last_read);
-      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
+      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
    }
 }
 
@@ -1445,7 +1445,7 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
    } else {
       dst_reg shuffled = dst_reg(this, alloc_type);
       vec4_instruction *last =
-         shuffle_64bit_data(shuffled, temp, true, block, inst);
+         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
 
       uint8_t mask = 0;
@@ -1652,7 +1652,7 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
 
    if (is_64bit) {
       temp = retype(temp, BRW_REGISTER_TYPE_DF);
-      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
+      shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
    }
 }