radv,aco: lower color exports in NIR

fossils-db (Sienna Cichlid):
Totals from 27108 (20.09% of 134913) affected shaders:
VGPRs: 1260608 -> 1261424 (+0.06%); split: -0.00%, +0.07%
CodeSize: 112795868 -> 112785892 (-0.01%); split: -0.05%, +0.04%
MaxWaves: 628608 -> 628448 (-0.03%); split: +0.00%, -0.03%
Instrs: 20750003 -> 20749314 (-0.00%); split: -0.01%, +0.00%
Latency: 288088081 -> 288015865 (-0.03%); split: -0.06%, +0.04%
InvThroughput: 53944847 -> 53961693 (+0.03%); split: -0.01%, +0.04%
VClause: 396463 -> 396467 (+0.00%); split: -0.02%, +0.02%
SClause: 842088 -> 842150 (+0.01%); split: -0.03%, +0.04%
Copies: 1244982 -> 1259026 (+1.13%); split: -0.01%, +1.14%
PreSGPRs: 1251949 -> 1251909 (-0.00%)
PreVGPRs: 1099647 -> 1100879 (+0.11%); split: -0.03%, +0.14%

fossils-db (Polaris10):
Totals from 23928 (17.60% of 135960) affected shaders:
SGPRs: 1751792 -> 1751024 (-0.04%); split: -0.05%, +0.01%
VGPRs: 1098964 -> 1098556 (-0.04%); split: -0.13%, +0.09%
CodeSize: 99893472 -> 99837940 (-0.06%); split: -0.06%, +0.00%
MaxWaves: 138322 -> 138306 (-0.01%); split: +0.03%, -0.04%
Instrs: 19213995 -> 19211980 (-0.01%); split: -0.02%, +0.01%
Latency: 273026926 -> 273109402 (+0.03%); split: -0.01%, +0.04%
InvThroughput: 111160907 -> 111195187 (+0.03%); split: -0.04%, +0.07%
VClause: 343058 -> 343097 (+0.01%); split: -0.02%, +0.03%
SClause: 802756 -> 802884 (+0.02%); split: -0.04%, +0.06%
Copies: 1729387 -> 1739208 (+0.57%); split: -0.04%, +0.61%
PreSGPRs: 1090264 -> 1090303 (+0.00%); split: -0.00%, +0.01%
PreVGPRs: 959490 -> 960600 (+0.12%); split: -0.04%, +0.15%

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15263>
2026-05-09 04:38:03 +02:00 · 2022-02-22 16:39:29 +01:00 · 2022-02-22 16:39:29 +01:00 · 8c51874af4
commit 8c51874af4
parent 9e31991c6e
2 changed files with 172 additions and 132 deletions
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -10615,33 +10615,12 @@ export_fs_mrt_color(isel_context* ctx, int slot)

   unsigned target, col_format;
   unsigned enabled_channels = 0;
-   aco_opcode compr_op = (aco_opcode)0;
   bool compr = false;

   slot -= FRAG_RESULT_DATA0;
   target = V_008DFC_SQ_EXP_MRT + slot;
   col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf;

-   bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1;
-   bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1;
-   bool is_16bit = values[0].regClass() == v2b;
-
-   /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
-   if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
-       (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
-        col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
-        col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
-      for (int i = 0; i < 4; i++) {
-         if (!(write_mask & (1 << i)))
-            continue;
-
-         Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
-                               values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
-         values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
-                              bld.copy(bld.def(v1), Operand::zero()), isnan);
-      }
-   }
-
   switch (col_format) {
   case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;

@@ -10659,103 +10638,12 @@ export_fs_mrt_color(isel_context* ctx, int slot)
      break;

   case V_028714_SPI_SHADER_FP16_ABGR:
-      for (int i = 0; i < 2; i++) {
-         bool enabled = (write_mask >> (i * 2)) & 0x3;
-         if (enabled) {
-            enabled_channels |= 0x3 << (i * 2);
-            if (is_16bit) {
-               values[i] =
-                  bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
-                             values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
-                             values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
-            } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
-               values[i] =
-                  bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
-                           values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
-                           values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
-            } else {
-               values[i] =
-                  bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
-                           values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
-                           values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
-            }
-         } else {
-            values[i] = Operand(v1);
-         }
-      }
-      values[2] = Operand(v1);
-      values[3] = Operand(v1);
-      compr = true;
-      break;
-
   case V_028714_SPI_SHADER_UNORM16_ABGR:
-      if (is_16bit && ctx->options->chip_class >= GFX9) {
-         compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
-      } else {
-         compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
-      }
-      break;
-
   case V_028714_SPI_SHADER_SNORM16_ABGR:
-      if (is_16bit && ctx->options->chip_class >= GFX9) {
-         compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
-      } else {
-         compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
-      }
-      break;
-
-   case V_028714_SPI_SHADER_UINT16_ABGR: {
-      compr_op = aco_opcode::v_cvt_pk_u16_u32;
-      if (is_int8 || is_int10) {
-         /* clamp */
-         uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
-         Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
-
-         for (unsigned i = 0; i < 4; i++) {
-            if ((write_mask >> i) & 1) {
-               values[i] =
-                  bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
-                           i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
-            }
-         }
-      } else if (is_16bit) {
-         for (unsigned i = 0; i < 4; i++) {
-            if ((write_mask >> i) & 1) {
-               Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
-               values[i] = Operand(tmp);
-            }
-         }
-      }
-      break;
-   }
-
+   case V_028714_SPI_SHADER_UINT16_ABGR:
   case V_028714_SPI_SHADER_SINT16_ABGR:
-      compr_op = aco_opcode::v_cvt_pk_i16_i32;
-      if (is_int8 || is_int10) {
-         /* clamp */
-         uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
-         uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
-         Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
-         Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
-
-         for (unsigned i = 0; i < 4; i++) {
-            if ((write_mask >> i) & 1) {
-               values[i] =
-                  bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
-                           i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
-               values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
-                                    i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
-                                    values[i]);
-            }
-         }
-      } else if (is_16bit) {
-         for (unsigned i = 0; i < 4; i++) {
-            if ((write_mask >> i) & 1) {
-               Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
-               values[i] = Operand(tmp);
-            }
-         }
-      }
+      enabled_channels = util_widen_mask(write_mask, 2);
+      compr = true;
      break;

   case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
@@ -10764,23 +10652,7 @@ export_fs_mrt_color(isel_context* ctx, int slot)
   default: return false;
   }

-   if ((bool)compr_op) {
-      for (int i = 0; i < 2; i++) {
-         /* check if at least one of the values to be compressed is enabled */
-         bool enabled = (write_mask >> (i * 2)) & 0x3;
-         if (enabled) {
-            enabled_channels |= 0x3 << (i * 2);
-            values[i] = bld.vop3(
-               compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
-               values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
-         } else {
-            values[i] = Operand(v1);
-         }
-      }
-      values[2] = Operand(v1);
-      values[3] = Operand(v1);
-      compr = true;
-   } else if (!compr) {
+   if (!compr) {
      for (int i = 0; i < 4; i++)
         values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
   }
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3872,6 +3872,169 @@ radv_lower_vs_input(nir_shader *nir, const struct radv_pipeline_key *pipeline_ke
   return progress;
 }

+static bool /* Rewrites the source of each FS color store_output to match the slot's ps.col_format; returns true if any store was rewritten. */
+radv_lower_fs_output(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   bool progress = false;
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   /* Visit every store_output in the entrypoint; all other instructions are left untouched. */
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         if (intrin->intrinsic != nir_intrinsic_store_output)
+            continue;
+         /* Only outputs whose base is at or above FRAG_RESULT_DATA0 are handled here. */
+         int slot = nir_intrinsic_base(intrin) - FRAG_RESULT_DATA0;
+         if (slot < 0)
+            continue;
+         /* col_format holds 4 bits per slot; is_int8/is_int10 hold one bit per slot. */
+         unsigned write_mask = nir_intrinsic_write_mask(intrin);
+         unsigned col_format = (pipeline_key->ps.col_format >> (4 * slot)) & 0xf;
+         bool is_int8 = (pipeline_key->ps.is_int8 >> slot) & 1;
+         bool is_int10 = (pipeline_key->ps.is_int10 >> slot) & 1;
+         bool is_16bit = intrin->src[0].ssa->bit_size == 16;
+         /* Stores to a slot whose format is SPI_SHADER_ZERO are left as they are. */
+         if (col_format == V_028714_SPI_SHADER_ZERO)
+            continue;
+         /* New instructions are emitted right before the store they feed. */
+         b.cursor = nir_before_instr(instr);
+         nir_ssa_def *values[4];
+
+         /* Extract the export values; channels outside write_mask become 32-bit undefs. */
+         for (unsigned i = 0; i < 4; i++) {
+            if (write_mask & (1 << i)) {
+               values[i] = nir_channel(&b, intrin->src[0].ssa, i);
+            } else {
+               values[i] = nir_ssa_undef(&b, 1, 32);
+            }
+         }
+         /* NOTE(review): the undefs are always 32-bit; with a 16-bit source and a non-packed format the final vec would mix bit sizes - confirm that case cannot occur. */
+         /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
+         if (pipeline_key->ps.enable_mrt_output_nan_fixup && !nir->info.internal && !is_16bit &&
+             (col_format == V_028714_SPI_SHADER_32_R ||
+              col_format == V_028714_SPI_SHADER_32_GR ||
+              col_format == V_028714_SPI_SHADER_32_AR ||
+              col_format == V_028714_SPI_SHADER_32_ABGR ||
+              col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+            u_foreach_bit(i, write_mask) {
+               const bool save_exact = b.exact;
+               /* x != x only holds for NaN; the compare is built with b.exact set (presumably so it is not folded away - confirm). */
+               b.exact = true;
+               nir_ssa_def *isnan = nir_fneu(&b, values[i], values[i]);
+               b.exact = save_exact;
+
+               values[i] = nir_bcsel(&b, isnan, nir_imm_zero(&b, 1, 32), values[i]);
+            }
+         }
+         /* The five 16-bit ABGR formats pack two channels into each 32-bit export component. */
+         if (col_format == V_028714_SPI_SHADER_FP16_ABGR ||
+             col_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
+             col_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
+             col_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+             col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+            /* Convert and/or clamp the export values. */
+            switch (col_format) {
+            case V_028714_SPI_SHADER_UINT16_ABGR: {
+               unsigned max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; /* the 0 fallback is never used: the clamp only runs when is_int8 || is_int10 */
+               u_foreach_bit(i, write_mask) {
+                  if (is_int8 || is_int10) { /* unsigned clamp to max_rgb; channel 3 of an int10 slot is clamped to 3 instead. NOTE(review): immediates are 32-bit even if the source is 16-bit - confirm */
+                     values[i] = nir_umin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 3u)
+                                                                            : nir_imm_int(&b, max_rgb));
+                  } else if (is_16bit) {
+                     values[i] = nir_u2u32(&b, values[i]);
+                  }
+               }
+               break;
+            }
+            case V_028714_SPI_SHADER_SINT16_ABGR: {
+               unsigned max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
+               unsigned min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
+               u_foreach_bit(i, write_mask) {
+                  if (is_int8 || is_int10) { /* signed clamp to [min_rgb, max_rgb]; channel 3 of an int10 slot is clamped to [-2, 1] instead */
+                     values[i] = nir_imin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 1u)
+                                                                            : nir_imm_int(&b, max_rgb));
+                     values[i] = nir_imax(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, -2u)
+                                                                            : nir_imm_int(&b, min_rgb));
+                  } else if (is_16bit) {
+                     values[i] = nir_i2i32(&b, values[i]);
+                  }
+               }
+               break;
+            }
+            case V_028714_SPI_SHADER_UNORM16_ABGR:
+            case V_028714_SPI_SHADER_SNORM16_ABGR:
+               u_foreach_bit(i, write_mask) {
+                  if (is_16bit) { /* 16-bit floats are widened to 32-bit before the norm packing below */
+                     values[i] = nir_f2f32(&b, values[i]);
+                  }
+               }
+               break;
+            default:
+               break;
+            }
+
+            /* Only nir_pack_32_2x16_split needs 16-bit inputs. */
+            bool input_16_bit = col_format == V_028714_SPI_SHADER_FP16_ABGR && is_16bit;
+            unsigned new_write_mask = 0;
+
+            /* Pack the export values: channels (0,1) go to values[0], channels (2,3) to values[1]. */
+            for (unsigned i = 0; i < 2; i++) {
+               bool enabled = (write_mask >> (i * 2)) & 0x3;
+               /* A pair with neither channel written yields an undef packed component. */
+               if (!enabled) {
+                  values[i] = nir_ssa_undef(&b, 1, 32);
+                  continue;
+               }
+
+               nir_ssa_def *src0 = values[i * 2];
+               nir_ssa_def *src1 = values[i * 2 + 1];
+               /* A channel missing from an enabled pair is packed as zero. */
+               if (!(write_mask & (1 << (i * 2))))
+                  src0 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
+               if (!(write_mask & (1 << (i * 2 + 1))))
+                  src1 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
+
+               if (col_format == V_028714_SPI_SHADER_FP16_ABGR) {
+                  if (is_16bit) {
+                     values[i] = nir_pack_32_2x16_split(&b, src0, src1);
+                  } else {
+                     values[i] = nir_pack_half_2x16_split(&b, src0, src1);
+                  }
+               } else if (col_format == V_028714_SPI_SHADER_UNORM16_ABGR) {
+                  values[i] = nir_pack_unorm_2x16(&b, nir_vec2(&b, src0, src1));
+               } else if (col_format == V_028714_SPI_SHADER_SNORM16_ABGR) {
+                  values[i] = nir_pack_snorm_2x16(&b, nir_vec2(&b, src0, src1));
+               } else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR) {
+                  values[i] = nir_pack_uint_2x16(&b, nir_vec2(&b, src0, src1));
+               } else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+                  values[i] = nir_pack_sint_2x16(&b, nir_vec2(&b, src0, src1));
+               }
+               /* Bit i of the new mask refers to packed component i. */
+               new_write_mask |= 1 << i;
+            }
+
+            /* Update the write mask for compressed outputs. */
+            nir_intrinsic_set_write_mask(intrin, new_write_mask);
+            intrin->num_components = util_last_bit(new_write_mask);
+         }
+         /* Rebuild the source vector; non-packed formats keep the store's original component count. */
+         nir_ssa_def *new_src = nir_vec(&b, values, intrin->num_components);
+
+         nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(new_src));
+
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
 VkResult
 radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
                    struct radv_device *device, struct radv_pipeline_cache *cache,
@@ -4010,6 +4173,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
      NIR_PASS_V(nir[MESA_SHADER_VERTEX], radv_lower_vs_input, pipeline_key);
   }

+   if (nir[MESA_SHADER_FRAGMENT] && !radv_use_llvm_for_stage(device, MESA_SHADER_FRAGMENT)) {
+      /* TODO: Convert the LLVM backend. */
+      NIR_PASS_V(nir[MESA_SHADER_FRAGMENT], radv_lower_fs_output, pipeline_key);
+   }
+
   radv_fill_shader_info(pipeline, pipeline_layout, pStages, pipeline_key, infos, nir);

   bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) ||