From 5e17a39b15e3a11d5eda30afc03549dbbb9ea702 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Mon, 19 Feb 2024 17:00:19 +0000
Subject: [PATCH] aco: allow p_start_linear_vgpr to use multiple operands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Merging the p_create_vector into the p_start_linear_vgpr is useful since
we stopped attempting to place the p_start_linear_vgpr definition in the
same registers as the operand.

fossil-db (navi31):
Totals from 927 (1.17% of 79242) affected shaders:
MaxWaves: 26412 -> 26442 (+0.11%)
Instrs: 938328 -> 938181 (-0.02%); split: -0.14%, +0.13%
CodeSize: 4891448 -> 4890820 (-0.01%); split: -0.11%, +0.10%
VGPRs: 47016 -> 47004 (-0.03%); split: -0.13%, +0.10%
SpillSGPRs: 222 -> 226 (+1.80%)
Latency: 5076065 -> 5075191 (-0.02%); split: -0.12%, +0.10%
InvThroughput: 712316 -> 712421 (+0.01%); split: -0.09%, +0.10%
SClause: 27992 -> 27972 (-0.07%); split: -0.09%, +0.02%
Copies: 38042 -> 38104 (+0.16%); split: -1.95%, +2.12%
PreVGPRs: 39448 -> 39369 (-0.20%)
VALU: 570157 -> 570224 (+0.01%); split: -0.13%, +0.14%
SALU: 51672 -> 51678 (+0.01%); split: -0.01%, +0.02%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27697>
---
 .../compiler/aco_instruction_selection.cpp    |  7 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 28 ++-----
 src/amd/compiler/aco_optimizer.cpp            |  1 +
 src/amd/compiler/aco_validate.cpp             | 22 +++---
 src/amd/compiler/tests/test_d3d11_derivs.cpp  | 75 +++++++++----------
 5 files changed, 55 insertions(+), 78 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 4e38dbd22c2..9a56ca2c68c 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -9271,7 +9271,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
    case nir_intrinsic_strict_wqm_coord_amd: {
       Temp dst = get_ssa_temp(ctx, &instr->def);
       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
       unsigned begin_size = nir_intrinsic_base(instr);
 
       unsigned num_src = 1;
@@ -9280,7 +9279,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
          num_src = src.bytes() / it->second[0].bytes();
 
       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
-         aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
+         aco_opcode::p_start_linear_vgpr, Format::PSEUDO, num_src + !!begin_size, 1)};
 
       if (begin_size)
          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
@@ -9289,10 +9288,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
          vec->operands[i + !!begin_size] = Operand(comp);
       }
 
-      vec->definitions[0] = Definition(tmp);
+      vec->definitions[0] = Definition(dst);
       ctx->block->instructions.emplace_back(std::move(vec));
-
-      bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
       break;
    }
    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 03282ea2857..9c42495f552 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2357,14 +2357,18 @@ lower_to_hw_instr(Program* program)
                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
                break;
             }
-            case aco_opcode::p_create_vector: {
+            case aco_opcode::p_create_vector:
+            case aco_opcode::p_start_linear_vgpr: {
+               if (instr->operands.empty())
+                  break;
+
                std::map<PhysReg, copy_operation> copy_operations;
                PhysReg reg = instr->definitions[0].physReg();
 
                for (const Operand& op : instr->operands) {
+                  RegClass rc = RegClass::get(instr->definitions[0].regClass().type(), op.bytes());
                   if (op.isConstant()) {
-                     const Definition def = Definition(
-                        reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
+                     const Definition def = Definition(reg, rc);
                      copy_operations[reg] = {op, def, op.bytes()};
                      reg.reg_b += op.bytes();
                      continue;
@@ -2375,10 +2379,7 @@ lower_to_hw_instr(Program* program)
                      continue;
                   }
 
-                  RegClass rc_def =
-                     op.regClass().is_subdword()
-                        ? op.regClass()
-                        : instr->definitions[0].getTemp().regClass().resize(op.bytes());
+                  RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : rc;
                   const Definition def = Definition(reg, rc_def);
                   copy_operations[def.physReg()] = {op, def, op.bytes()};
                   reg.reg_b += op.bytes();
@@ -2411,19 +2412,6 @@ lower_to_hw_instr(Program* program)
                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
                break;
             }
-            case aco_opcode::p_start_linear_vgpr: {
-               if (instr->operands.empty())
-                  break;
-
-               Definition def(instr->definitions[0].physReg(),
-                              RegClass::get(RegType::vgpr, instr->definitions[0].bytes()));
-
-               std::map<PhysReg, copy_operation> copy_operations;
-               copy_operations[def.physReg()] = {instr->operands[0], def,
-                                                 instr->operands[0].bytes()};
-               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
-               break;
-            }
             case aco_opcode::p_exit_early_if: {
                /* don't bother with an early exit near the end of the program */
                if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 73530876c4b..dc7e003b6ab 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -580,6 +580,7 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsi
    case aco_opcode::p_linear_phi:
    case aco_opcode::p_parallelcopy:
    case aco_opcode::p_create_vector:
+   case aco_opcode::p_start_linear_vgpr:
       if (temp.bytes() != instr->operands[index].bytes())
          return false;
       break;
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index b571e8fd09f..5d59b0af565 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -366,6 +366,7 @@ validate_ir(Program* program)
                bool flat = instr->isFlatLike();
                bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
                                    instr->opcode == aco_opcode::p_create_vector ||
+                                   instr->opcode == aco_opcode::p_start_linear_vgpr ||
                                    instr->opcode == aco_opcode::p_jump_to_epilog ||
                                    instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
                                    instr->opcode == aco_opcode::p_end_with_regs ||
@@ -527,20 +528,26 @@ validate_ir(Program* program)
 
          switch (instr->format) {
          case Format::PSEUDO: {
-            if (instr->opcode == aco_opcode::p_create_vector) {
+            if (instr->opcode == aco_opcode::p_create_vector ||
+                instr->opcode == aco_opcode::p_start_linear_vgpr) {
                unsigned size = 0;
                for (const Operand& op : instr->operands) {
                   check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
                   size += op.bytes();
                }
-               check(size == instr->definitions[0].bytes(),
-                     "Definition size does not match operand sizes", instr.get());
+               if (!instr->operands.empty() || instr->opcode == aco_opcode::p_create_vector) {
+                  check(size == instr->definitions[0].bytes(),
+                        "Definition size does not match operand sizes", instr.get());
+               }
                if (instr->definitions[0].regClass().type() == RegType::sgpr) {
                   for (const Operand& op : instr->operands) {
                      check(op.isConstant() || op.regClass().type() == RegType::sgpr,
                            "Wrong Operand type for scalar vector", instr.get());
                   }
                }
+               if (instr->opcode == aco_opcode::p_start_linear_vgpr)
+                  check(instr->definitions[0].regClass().is_linear_vgpr(),
+                        "Definition must be linear VGPR", instr.get());
             } else if (instr->opcode == aco_opcode::p_extract_vector) {
                check(!instr->operands[0].isConstant() && instr->operands[1].isConstant(),
                      "Wrong Operand types", instr.get());
@@ -680,15 +687,6 @@ validate_ir(Program* program)
                      instr->operands[i].isOfType(RegType::vgpr) || instr->operands[i].isUndefined(),
                      "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
                }
-            } else if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
-               check(instr->definitions.size() == 1, "Must have one definition", instr.get());
-               check(instr->operands.size() <= 1, "Must have one or zero operands", instr.get());
-               if (!instr->definitions.empty())
-                  check(instr->definitions[0].regClass().is_linear_vgpr(),
-                        "Definition must be linear VGPR", instr.get());
-               if (!instr->definitions.empty() && !instr->operands.empty())
-                  check(instr->definitions[0].bytes() == instr->operands[0].bytes(),
-                        "Operand size must match definition", instr.get());
             }
             break;
          }
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index 17370714e2e..f180de4a157 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -52,8 +52,7 @@ BEGIN_TEST(d3d11_derivs.simple)
 
    //>> v1: %x = v_interp_p2_f32 %_, %_:m0, (kill)%_ attr0.x
    //>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
-   //>> v2: %vec = p_create_vector (kill)%x, (kill)%y
-   //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv2: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
@@ -63,8 +62,8 @@ BEGIN_TEST(d3d11_derivs.simple)
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                                         ; $_
    //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y                                         ; $_
-   //>> v_mov_b32_e32 v#ry_tmp2, v#ry_tmp                                                  ; $_
-   //>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp2]                                   ; $_ $_
+   //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                       ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                                       ; $_
    //>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
 END_TEST
@@ -94,8 +93,7 @@ BEGIN_TEST(d3d11_derivs.constant)
    pbld.add_vsfs(vs, fs);
 
    //>> v1: %x = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.x
-   //>> v2: %vec = p_create_vector (kill)%x, -0.5
-   //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv2: %wqm = p_start_linear_vgpr (kill)%x, -0.5
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
@@ -134,7 +132,7 @@ BEGIN_TEST(d3d11_derivs.discard)
    pbld.add_vsfs(vs, fs);
 
    /* The interpolation must be done before the discard_if. */
-   //>> lv2: %wqm = p_start_linear_vgpr (kill)%_
+   //>> lv2: %wqm = p_start_linear_vgpr (kill)%_, (kill)%_
    //>> s2: %_:exec, s1: (kill)%_:scc = s_andn2_b64 %_:exec, %_
    //>> s2: %_, s1: %_:scc = s_andn2_b64 (kill)%_, (kill)%_
    //>> p_exit_early_if (kill)%_:scc
@@ -167,8 +165,7 @@ BEGIN_TEST(d3d11_derivs.bias)
    pbld.add_vsfs(vs, fs);
 
    //>> s2: %_:s[0-1], s1: %_:s[2], s1: %_:s[3], s1: %_:s[4], v2: %_:v[0-1], v1: %bias:v[2] = p_startpgm
-   //>> v3: %vec = p_create_vector v1: undef, (kill)%_, (kill)%_
-   //>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv3: %wqm = p_start_linear_vgpr v1: undef, (kill)%_, (kill)%_
    //>> BB1
    //>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, %wqm, (kill)%bias 2d
    //>> BB2
@@ -176,12 +173,12 @@ BEGIN_TEST(d3d11_derivs.bias)
    //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
-   //>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x                                                       ; $_
-   //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y                                                   ; $_
-   //>> v_mov_b32_e32 v#rb, v2                                                                       ; $_
-   //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                                 ; $_
+   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                                                 ; $_
+   //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y                                                 ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                                               ; $_
+   //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                               ; $_
    //>> BB1:
-   //>> image_sample_b v[#_:#_], [v#rb, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
+   //>> image_sample_b v[#_:#_], [v2, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
 END_TEST
 
@@ -210,8 +207,7 @@ BEGIN_TEST(d3d11_derivs.offset)
    PipelineBuilder pbld(get_vk_device(GFX9));
    pbld.add_vsfs(vs, fs);
 
-   //>> v3: %vec = p_create_vector v1: undef, (kill)%_, (kill)%_
-   //>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv3: %wqm = p_start_linear_vgpr v1: undef, (kill)%_, (kill)%_
    //>> BB1
    //>> v1: %offset = p_parallelcopy 0x201
    //>> v4: %_ = image_sample_o (kill)%_, (kill)%_, v1: undef, %wqm, (kill)%offset 2d
@@ -220,8 +216,9 @@ BEGIN_TEST(d3d11_derivs.offset)
    //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
-   //>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x                            ; $_
+   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                        ; $_
    //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y                        ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                      ; $_
    //>> v_mov_b32_e32 v#ry, v#ry_tmp                                      ; $_
    //>> BB1:
    //>> v_mov_b32_e32 v#ro_tmp, 0x201                                     ; $_ $_
@@ -256,8 +253,7 @@ BEGIN_TEST(d3d11_derivs.array)
    pbld.add_vsfs(vs, fs);
 
    //>> v1: %layer = v_rndne_f32 (kill)%_
-   //>> v3: %vec = p_create_vector (kill)%_, (kill)%_, (kill)%layer
-   //>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv3: %wqm = p_start_linear_vgpr (kill)%_, (kill)%_, (kill)%layer
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2darray da
    //>> BB2
@@ -266,9 +262,11 @@ BEGIN_TEST(d3d11_derivs.array)
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z                                               ; $_
-   //>> v_rndne_f32_e32 v#rl, v#rl_tmp                                                           ; $_
-   //>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x                                                   ; $_
+   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                                               ; $_
    //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y                                               ; $_
+   //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp                                                       ; $_
+   //>> v_mov_b32_e32 v#rl, v#rl_tmp                                                             ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                                             ; $_
    //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                             ; $_
    //>> BB1:
    //; success = rx+1 == ry and rx+2 == rl
@@ -302,8 +300,7 @@ BEGIN_TEST(d3d11_derivs.bias_array)
 
    //>> s2: %_:s[0-1], s1: %_:s[2], s1: %_:s[3], s1: %_:s[4], v2: %_:v[0-1], v1: %bias:v[2] = p_startpgm
    //>> v1: %layer = v_rndne_f32 (kill)%_
-   //>> v4: %vec = p_create_vector v1: undef, (kill)%_, (kill)%_, (kill)%layer
-   //>> lv4: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv4: %wqm = p_start_linear_vgpr v1: undef, (kill)%_, (kill)%_, (kill)%layer
    //>> BB1
    //>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, %wqm, (kill)%bias 2darray da
    //>> BB2
@@ -312,11 +309,12 @@ BEGIN_TEST(d3d11_derivs.bias_array)
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z                                                             ; $_
-   //>> v_rndne_f32_e32 v#rl, v#rl_tmp                                                                         ; $_
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                                                             ; $_
    //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y                                                             ; $_
+   //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp                                                                     ; $_
    //>> v_mov_b32_e32 v#rx, v#rx_tmp                                                                           ; $_
    //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                                           ; $_
+   //>> v_mov_b32_e32 v#rl, v#rl_tmp                                                                           ; $_
    //>> BB1:
    //>> image_sample_b v[#_:#_], [v2, v#rx, v#ry, v#rl], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; $_ $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -347,8 +345,7 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
    pbld.add_vsfs(vs, fs);
 
    //>> v1: %x = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.x
-   //>> v2: %vec = p_create_vector (kill)%x, 0.5
-   //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv2: %wqm = p_start_linear_vgpr (kill)%x, 0.5
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
@@ -389,8 +386,7 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
 
    //>> v1: %layer = v_rndne_f32 (kill)%_
    //>> v1: %x = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.x
-   //>> v3: %vec = p_create_vector (kill)%x, 0.5, (kill)%layer
-   //>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv3: %wqm = p_start_linear_vgpr (kill)%x, 0.5, (kill)%layer
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2darray da
    //>> BB2
@@ -400,8 +396,9 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y                   ; $_
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                   ; $_
-   //>> v_rndne_f32_e32 v#rl, v#rl_tmp                               ; $_
+   //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp                           ; $_
    //>> v_mov_b32_e32 v#ry, 0.5                                      ; $_
+   //>> v_mov_b32_e32 v#rl, v#rl_tmp                                 ; $_
    //>> v_mov_b32_e32 v#rx, v#rx_tmp                                 ; $_
    //>> BB1:
    //; success = rx+1 == ry and rx+2 == rl
@@ -436,8 +433,7 @@ BEGIN_TEST(d3d11_derivs.cube)
    //>> v1: %face = v_cubeid_f32 (kill)%_, (kill)%_, (kill)%_
    //>> v1: %x = v_fmaak_f32 (kill)%_, %_, 0x3fc00000
    //>> v1: %y = v_fmaak_f32 (kill)%_, (kill)%_, 0x3fc00000
-   //>> v3: %vec = p_create_vector (kill)%x, (kill)%y, (kill)%face
-   //>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv3: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y, (kill)%face
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm cube da
    //>> BB2
@@ -446,10 +442,10 @@ BEGIN_TEST(d3d11_derivs.cube)
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_cubeid_f32 v#rf_tmp, v#_, v#_, v#_                                                 ; $_ $_
-   //>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000                                           ; $_ $_
+   //>> v_fmaak_f32 v#rx, v#_, v#_, 0x3fc00000                                               ; $_ $_
    //>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000                                           ; $_ $_
    //>> v_mov_b32_e32 v#rf, v#rf_tmp                                                         ; $_
-   //>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp]                                      ; $_ $_
+   //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                         ; $_
    //; success = rx+1 == ry and rx+2 == rf
    //>> image_sample v[#_:#_], v[#rx:#rf], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -484,8 +480,7 @@ BEGIN_TEST(d3d11_derivs.cube_array)
    //>> v1: %x = v_fmaak_f32 (kill)%_, %_, 0x3fc00000
    //>> v1: %y = v_fmaak_f32 (kill)%_, (kill)%_, 0x3fc00000
    //>> v1: %face_layer = v_fmamk_f32 (kill)%layer, (kill)%face, 0x41000000
-   //>> v3: %vec = p_create_vector (kill)%x, (kill)%y, (kill)%face_layer
-   //>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv3: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y, (kill)%face_layer
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm cube da
    //>> BB2
@@ -495,12 +490,12 @@ BEGIN_TEST(d3d11_derivs.cube_array)
 
    //>> v_rndne_f32_e32 v#rl, v#_                                                             ; $_
    //>> v_cubeid_f32 v#rf, v#_, v#_, v#_                                                      ; $_ $_
+   //>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf                                         ; $_ $_
    //>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000                                            ; $_ $_
    //>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000                                            ; $_ $_
-   //>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf                                         ; $_ $_
-   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                                          ; $_
    //>> v_mov_b32_e32 v#ry, v#ry_tmp                                                          ; $_
    //>> v_mov_b32_e32 v#rlf, v#rlf_tmp                                                        ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                                          ; $_
    //>> BB1:
    //; success = rx+1 == ry and rx+2 == rlf
    //>> image_sample v[#_:#_], v[#rx:#rlf], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; $_ $_
@@ -566,8 +561,7 @@ BEGIN_TEST(d3d11_derivs.bc_optimize)
    //>> v1: %y_coord2 = v_cndmask_b32 (kill)%_, %_, (kill)%_
    //>> v1: %x = v_interp_p2_f32 (kill)%_, %_:m0, (kill)%_ attr0.x
    //>> v1: %y = v_interp_p2_f32 (kill)%y_coord2, (kill)%_:m0, (kill)%_ attr0.y
-   //>> v2: %vec = p_create_vector (kill)%x, (kill)%y
-   //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv2: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y
    //>> BB1
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
@@ -602,8 +596,7 @@ BEGIN_TEST(d3d11_derivs.get_lod)
 
    //>> v1: %x = v_interp_p2_f32 %_, %_:m0, (kill)%_ attr0.x
    //>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
-   //>> v2: %vec = p_create_vector %x, %y
-   //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
+   //>> lv2: %wqm = p_start_linear_vgpr %x, %y
    //>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi
    //>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
    //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi