aco: allow p_start_linear_vgpr to use multiple operands

Merging the p_create_vector into the p_start_linear_vgpr is useful since
we stopped attempting to place the p_start_linear_vgpr definition in the
same registers as the operand.

fossil-db (navi31):
Totals from 927 (1.17% of 79242) affected shaders:
MaxWaves: 26412 -> 26442 (+0.11%)
Instrs: 938328 -> 938181 (-0.02%); split: -0.14%, +0.13%
CodeSize: 4891448 -> 4890820 (-0.01%); split: -0.11%, +0.10%
VGPRs: 47016 -> 47004 (-0.03%); split: -0.13%, +0.10%
SpillSGPRs: 222 -> 226 (+1.80%)
Latency: 5076065 -> 5075191 (-0.02%); split: -0.12%, +0.10%
InvThroughput: 712316 -> 712421 (+0.01%); split: -0.09%, +0.10%
SClause: 27992 -> 27972 (-0.07%); split: -0.09%, +0.02%
Copies: 38042 -> 38104 (+0.16%); split: -1.95%, +2.12%
PreVGPRs: 39448 -> 39369 (-0.20%)
VALU: 570157 -> 570224 (+0.01%); split: -0.13%, +0.14%
SALU: 51672 -> 51678 (+0.01%); split: -0.01%, +0.02%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27697>
This commit is contained in:
Rhys Perry 2024-02-19 17:00:19 +00:00 committed by Marge Bot
parent f764f6848a
commit 5e17a39b15
5 changed files with 55 additions and 78 deletions

View file

@@ -9271,7 +9271,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_strict_wqm_coord_amd: {
Temp dst = get_ssa_temp(ctx, &instr->def);
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
unsigned begin_size = nir_intrinsic_base(instr);
unsigned num_src = 1;
@@ -9280,7 +9279,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
num_src = src.bytes() / it->second[0].bytes();
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, num_src + !!begin_size, 1)};
if (begin_size)
vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
@@ -9289,10 +9288,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
vec->operands[i + !!begin_size] = Operand(comp);
}
vec->definitions[0] = Definition(tmp);
vec->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec));
bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
break;
}
case nir_intrinsic_load_lds_ngg_scratch_base_amd: {

View file

@@ -2357,14 +2357,18 @@ lower_to_hw_instr(Program* program)
handle_operands(copy_operations, &ctx, program->gfx_level, pi);
break;
}
case aco_opcode::p_create_vector: {
case aco_opcode::p_create_vector:
case aco_opcode::p_start_linear_vgpr: {
if (instr->operands.empty())
break;
std::map<PhysReg, copy_operation> copy_operations;
PhysReg reg = instr->definitions[0].physReg();
for (const Operand& op : instr->operands) {
RegClass rc = RegClass::get(instr->definitions[0].regClass().type(), op.bytes());
if (op.isConstant()) {
const Definition def = Definition(
reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
const Definition def = Definition(reg, rc);
copy_operations[reg] = {op, def, op.bytes()};
reg.reg_b += op.bytes();
continue;
@@ -2375,10 +2379,7 @@ lower_to_hw_instr(Program* program)
continue;
}
RegClass rc_def =
op.regClass().is_subdword()
? op.regClass()
: instr->definitions[0].getTemp().regClass().resize(op.bytes());
RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : rc;
const Definition def = Definition(reg, rc_def);
copy_operations[def.physReg()] = {op, def, op.bytes()};
reg.reg_b += op.bytes();
@@ -2411,19 +2412,6 @@ lower_to_hw_instr(Program* program)
handle_operands(copy_operations, &ctx, program->gfx_level, pi);
break;
}
case aco_opcode::p_start_linear_vgpr: {
if (instr->operands.empty())
break;
Definition def(instr->definitions[0].physReg(),
RegClass::get(RegType::vgpr, instr->definitions[0].bytes()));
std::map<PhysReg, copy_operation> copy_operations;
copy_operations[def.physReg()] = {instr->operands[0], def,
instr->operands[0].bytes()};
handle_operands(copy_operations, &ctx, program->gfx_level, pi);
break;
}
case aco_opcode::p_exit_early_if: {
/* don't bother with an early exit near the end of the program */
if ((block->instructions.size() - 1 - instr_idx) <= 4 &&

View file

@@ -580,6 +580,7 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsi
case aco_opcode::p_linear_phi:
case aco_opcode::p_parallelcopy:
case aco_opcode::p_create_vector:
case aco_opcode::p_start_linear_vgpr:
if (temp.bytes() != instr->operands[index].bytes())
return false;
break;

View file

@@ -366,6 +366,7 @@ validate_ir(Program* program)
bool flat = instr->isFlatLike();
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
instr->opcode == aco_opcode::p_create_vector ||
instr->opcode == aco_opcode::p_start_linear_vgpr ||
instr->opcode == aco_opcode::p_jump_to_epilog ||
instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
instr->opcode == aco_opcode::p_end_with_regs ||
@@ -527,20 +528,26 @@ validate_ir(Program* program)
switch (instr->format) {
case Format::PSEUDO: {
if (instr->opcode == aco_opcode::p_create_vector) {
if (instr->opcode == aco_opcode::p_create_vector ||
instr->opcode == aco_opcode::p_start_linear_vgpr) {
unsigned size = 0;
for (const Operand& op : instr->operands) {
check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
size += op.bytes();
}
check(size == instr->definitions[0].bytes(),
"Definition size does not match operand sizes", instr.get());
if (!instr->operands.empty() || instr->opcode == aco_opcode::p_create_vector) {
check(size == instr->definitions[0].bytes(),
"Definition size does not match operand sizes", instr.get());
}
if (instr->definitions[0].regClass().type() == RegType::sgpr) {
for (const Operand& op : instr->operands) {
check(op.isConstant() || op.regClass().type() == RegType::sgpr,
"Wrong Operand type for scalar vector", instr.get());
}
}
if (instr->opcode == aco_opcode::p_start_linear_vgpr)
check(instr->definitions[0].regClass().is_linear_vgpr(),
"Definition must be linear VGPR", instr.get());
} else if (instr->opcode == aco_opcode::p_extract_vector) {
check(!instr->operands[0].isConstant() && instr->operands[1].isConstant(),
"Wrong Operand types", instr.get());
@@ -680,15 +687,6 @@ validate_ir(Program* program)
instr->operands[i].isOfType(RegType::vgpr) || instr->operands[i].isUndefined(),
"Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
}
} else if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
check(instr->definitions.size() == 1, "Must have one definition", instr.get());
check(instr->operands.size() <= 1, "Must have one or zero operands", instr.get());
if (!instr->definitions.empty())
check(instr->definitions[0].regClass().is_linear_vgpr(),
"Definition must be linear VGPR", instr.get());
if (!instr->definitions.empty() && !instr->operands.empty())
check(instr->definitions[0].bytes() == instr->operands[0].bytes(),
"Operand size must match definition", instr.get());
}
break;
}

View file

@@ -52,8 +52,7 @@ BEGIN_TEST(d3d11_derivs.simple)
//>> v1: %x = v_interp_p2_f32 %_, %_:m0, (kill)%_ attr0.x
//>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
//>> v2: %vec = p_create_vector (kill)%x, (kill)%y
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv2: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
//>> BB2
@@ -63,8 +62,8 @@ BEGIN_TEST(d3d11_derivs.simple)
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#ry_tmp2, v#ry_tmp ; $_
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp2] ; $_ $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
END_TEST
@@ -94,8 +93,7 @@ BEGIN_TEST(d3d11_derivs.constant)
pbld.add_vsfs(vs, fs);
//>> v1: %x = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.x
//>> v2: %vec = p_create_vector (kill)%x, -0.5
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv2: %wqm = p_start_linear_vgpr (kill)%x, -0.5
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
//>> BB2
@@ -134,7 +132,7 @@ BEGIN_TEST(d3d11_derivs.discard)
pbld.add_vsfs(vs, fs);
/* The interpolation must be done before the discard_if. */
//>> lv2: %wqm = p_start_linear_vgpr (kill)%_
//>> lv2: %wqm = p_start_linear_vgpr (kill)%_, (kill)%_
//>> s2: %_:exec, s1: (kill)%_:scc = s_andn2_b64 %_:exec, %_
//>> s2: %_, s1: %_:scc = s_andn2_b64 (kill)%_, (kill)%_
//>> p_exit_early_if (kill)%_:scc
@@ -167,8 +165,7 @@ BEGIN_TEST(d3d11_derivs.bias)
pbld.add_vsfs(vs, fs);
//>> s2: %_:s[0-1], s1: %_:s[2], s1: %_:s[3], s1: %_:s[4], v2: %_:v[0-1], v1: %bias:v[2] = p_startpgm
//>> v3: %vec = p_create_vector v1: undef, (kill)%_, (kill)%_
//>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv3: %wqm = p_start_linear_vgpr v1: undef, (kill)%_, (kill)%_
//>> BB1
//>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, %wqm, (kill)%bias 2d
//>> BB2
@@ -176,12 +173,12 @@ BEGIN_TEST(d3d11_derivs.bias)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#rb, v2 ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> BB1:
//>> image_sample_b v[#_:#_], [v#rb, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
//>> image_sample_b v[#_:#_], [v2, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
END_TEST
@@ -210,8 +207,7 @@ BEGIN_TEST(d3d11_derivs.offset)
PipelineBuilder pbld(get_vk_device(GFX9));
pbld.add_vsfs(vs, fs);
//>> v3: %vec = p_create_vector v1: undef, (kill)%_, (kill)%_
//>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv3: %wqm = p_start_linear_vgpr v1: undef, (kill)%_, (kill)%_
//>> BB1
//>> v1: %offset = p_parallelcopy 0x201
//>> v4: %_ = image_sample_o (kill)%_, (kill)%_, v1: undef, %wqm, (kill)%offset 2d
@@ -220,8 +216,9 @@ BEGIN_TEST(d3d11_derivs.offset)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> BB1:
//>> v_mov_b32_e32 v#ro_tmp, 0x201 ; $_ $_
@@ -256,8 +253,7 @@ BEGIN_TEST(d3d11_derivs.array)
pbld.add_vsfs(vs, fs);
//>> v1: %layer = v_rndne_f32 (kill)%_
//>> v3: %vec = p_create_vector (kill)%_, (kill)%_, (kill)%layer
//>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv3: %wqm = p_start_linear_vgpr (kill)%_, (kill)%_, (kill)%layer
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2darray da
//>> BB2
@@ -266,9 +262,11 @@ BEGIN_TEST(d3d11_derivs.array)
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z ; $_
//>> v_rndne_f32_e32 v#rl, v#rl_tmp ; $_
//>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rl
@@ -302,8 +300,7 @@ BEGIN_TEST(d3d11_derivs.bias_array)
//>> s2: %_:s[0-1], s1: %_:s[2], s1: %_:s[3], s1: %_:s[4], v2: %_:v[0-1], v1: %bias:v[2] = p_startpgm
//>> v1: %layer = v_rndne_f32 (kill)%_
//>> v4: %vec = p_create_vector v1: undef, (kill)%_, (kill)%_, (kill)%layer
//>> lv4: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv4: %wqm = p_start_linear_vgpr v1: undef, (kill)%_, (kill)%_, (kill)%layer
//>> BB1
//>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, %wqm, (kill)%bias 2darray da
//>> BB2
@@ -312,11 +309,12 @@ BEGIN_TEST(d3d11_derivs.bias_array)
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z ; $_
//>> v_rndne_f32_e32 v#rl, v#rl_tmp ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> BB1:
//>> image_sample_b v[#_:#_], [v2, v#rx, v#ry, v#rl], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; $_ $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -347,8 +345,7 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
pbld.add_vsfs(vs, fs);
//>> v1: %x = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.x
//>> v2: %vec = p_create_vector (kill)%x, 0.5
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv2: %wqm = p_start_linear_vgpr (kill)%x, 0.5
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
//>> BB2
@@ -389,8 +386,7 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
//>> v1: %layer = v_rndne_f32 (kill)%_
//>> v1: %x = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.x
//>> v3: %vec = p_create_vector (kill)%x, 0.5, (kill)%layer
//>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv3: %wqm = p_start_linear_vgpr (kill)%x, 0.5, (kill)%layer
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2darray da
//>> BB2
@@ -400,8 +396,9 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_rndne_f32_e32 v#rl, v#rl_tmp ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rl
@@ -436,8 +433,7 @@ BEGIN_TEST(d3d11_derivs.cube)
//>> v1: %face = v_cubeid_f32 (kill)%_, (kill)%_, (kill)%_
//>> v1: %x = v_fmaak_f32 (kill)%_, %_, 0x3fc00000
//>> v1: %y = v_fmaak_f32 (kill)%_, (kill)%_, 0x3fc00000
//>> v3: %vec = p_create_vector (kill)%x, (kill)%y, (kill)%face
//>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv3: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y, (kill)%face
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm cube da
//>> BB2
@@ -446,10 +442,10 @@ BEGIN_TEST(d3d11_derivs.cube)
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_cubeid_f32 v#rf_tmp, v#_, v#_, v#_ ; $_ $_
//>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#rx, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_mov_b32_e32 v#rf, v#rf_tmp ; $_
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//; success = rx+1 == ry and rx+2 == rf
//>> image_sample v[#_:#_], v[#rx:#rf], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -484,8 +480,7 @@ BEGIN_TEST(d3d11_derivs.cube_array)
//>> v1: %x = v_fmaak_f32 (kill)%_, %_, 0x3fc00000
//>> v1: %y = v_fmaak_f32 (kill)%_, (kill)%_, 0x3fc00000
//>> v1: %face_layer = v_fmamk_f32 (kill)%layer, (kill)%face, 0x41000000
//>> v3: %vec = p_create_vector (kill)%x, (kill)%y, (kill)%face_layer
//>> lv3: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv3: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y, (kill)%face_layer
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm cube da
//>> BB2
@@ -495,12 +490,12 @@ BEGIN_TEST(d3d11_derivs.cube_array)
//>> v_rndne_f32_e32 v#rl, v#_ ; $_
//>> v_cubeid_f32 v#rf, v#_, v#_, v#_ ; $_ $_
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
//>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_mov_b32_e32 v#rlf, v#rlf_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rlf
//>> image_sample v[#_:#_], v[#rx:#rlf], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; $_ $_
@@ -566,8 +561,7 @@ BEGIN_TEST(d3d11_derivs.bc_optimize)
//>> v1: %y_coord2 = v_cndmask_b32 (kill)%_, %_, (kill)%_
//>> v1: %x = v_interp_p2_f32 (kill)%_, %_:m0, (kill)%_ attr0.x
//>> v1: %y = v_interp_p2_f32 (kill)%y_coord2, (kill)%_:m0, (kill)%_ attr0.y
//>> v2: %vec = p_create_vector (kill)%x, (kill)%y
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv2: %wqm = p_start_linear_vgpr (kill)%x, (kill)%y
//>> BB1
//>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, %wqm 2d
//>> BB2
@@ -602,8 +596,7 @@ BEGIN_TEST(d3d11_derivs.get_lod)
//>> v1: %x = v_interp_p2_f32 %_, %_:m0, (kill)%_ attr0.x
//>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
//>> v2: %vec = p_create_vector %x, %y
//>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
//>> lv2: %wqm = p_start_linear_vgpr %x, %y
//>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi
//>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
//>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi