radv,aco: lower color exports in NIR

fossils-db (Sienna Cichlid):
Totals from 27108 (20.09% of 134913) affected shaders:
VGPRs: 1260608 -> 1261424 (+0.06%); split: -0.00%, +0.07%
CodeSize: 112795868 -> 112785892 (-0.01%); split: -0.05%, +0.04%
MaxWaves: 628608 -> 628448 (-0.03%); split: +0.00%, -0.03%
Instrs: 20750003 -> 20749314 (-0.00%); split: -0.01%, +0.00%
Latency: 288088081 -> 288015865 (-0.03%); split: -0.06%, +0.04%
InvThroughput: 53944847 -> 53961693 (+0.03%); split: -0.01%, +0.04%
VClause: 396463 -> 396467 (+0.00%); split: -0.02%, +0.02%
SClause: 842088 -> 842150 (+0.01%); split: -0.03%, +0.04%
Copies: 1244982 -> 1259026 (+1.13%); split: -0.01%, +1.14%
PreSGPRs: 1251949 -> 1251909 (-0.00%)
PreVGPRs: 1099647 -> 1100879 (+0.11%); split: -0.03%, +0.14%

fossils-db (Polaris10):
Totals from 23928 (17.60% of 135960) affected shaders:
SGPRs: 1751792 -> 1751024 (-0.04%); split: -0.05%, +0.01%
VGPRs: 1098964 -> 1098556 (-0.04%); split: -0.13%, +0.09%
CodeSize: 99893472 -> 99837940 (-0.06%); split: -0.06%, +0.00%
MaxWaves: 138322 -> 138306 (-0.01%); split: +0.03%, -0.04%
Instrs: 19213995 -> 19211980 (-0.01%); split: -0.02%, +0.01%
Latency: 273026926 -> 273109402 (+0.03%); split: -0.01%, +0.04%
InvThroughput: 111160907 -> 111195187 (+0.03%); split: -0.04%, +0.07%
VClause: 343058 -> 343097 (+0.01%); split: -0.02%, +0.03%
SClause: 802756 -> 802884 (+0.02%); split: -0.04%, +0.06%
Copies: 1729387 -> 1739208 (+0.57%); split: -0.04%, +0.61%
PreSGPRs: 1090264 -> 1090303 (+0.00%); split: -0.00%, +0.01%
PreVGPRs: 959490 -> 960600 (+0.12%); split: -0.04%, +0.15%

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15263>
This commit is contained in:
Samuel Pitoiset 2022-02-22 16:39:29 +01:00 committed by Marge Bot
parent 9e31991c6e
commit 8c51874af4
2 changed files with 172 additions and 132 deletions

View file

@ -10615,33 +10615,12 @@ export_fs_mrt_color(isel_context* ctx, int slot)
unsigned target, col_format;
unsigned enabled_channels = 0;
aco_opcode compr_op = (aco_opcode)0;
bool compr = false;
slot -= FRAG_RESULT_DATA0;
target = V_008DFC_SQ_EXP_MRT + slot;
col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf;
bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1;
bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1;
bool is_16bit = values[0].regClass() == v2b;
/* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
(col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
for (int i = 0; i < 4; i++) {
if (!(write_mask & (1 << i)))
continue;
Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
bld.copy(bld.def(v1), Operand::zero()), isnan);
}
}
switch (col_format) {
case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
@ -10659,103 +10638,12 @@ export_fs_mrt_color(isel_context* ctx, int slot)
break;
case V_028714_SPI_SHADER_FP16_ABGR:
for (int i = 0; i < 2; i++) {
bool enabled = (write_mask >> (i * 2)) & 0x3;
if (enabled) {
enabled_channels |= 0x3 << (i * 2);
if (is_16bit) {
values[i] =
bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
} else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
values[i] =
bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
} else {
values[i] =
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
}
} else {
values[i] = Operand(v1);
}
}
values[2] = Operand(v1);
values[3] = Operand(v1);
compr = true;
break;
case V_028714_SPI_SHADER_UNORM16_ABGR:
if (is_16bit && ctx->options->chip_class >= GFX9) {
compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
} else {
compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
}
break;
case V_028714_SPI_SHADER_SNORM16_ABGR:
if (is_16bit && ctx->options->chip_class >= GFX9) {
compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
} else {
compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
}
break;
case V_028714_SPI_SHADER_UINT16_ABGR: {
compr_op = aco_opcode::v_cvt_pk_u16_u32;
if (is_int8 || is_int10) {
/* clamp */
uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
for (unsigned i = 0; i < 4; i++) {
if ((write_mask >> i) & 1) {
values[i] =
bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
}
}
} else if (is_16bit) {
for (unsigned i = 0; i < 4; i++) {
if ((write_mask >> i) & 1) {
Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
values[i] = Operand(tmp);
}
}
}
break;
}
case V_028714_SPI_SHADER_UINT16_ABGR:
case V_028714_SPI_SHADER_SINT16_ABGR:
compr_op = aco_opcode::v_cvt_pk_i16_i32;
if (is_int8 || is_int10) {
/* clamp */
uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
for (unsigned i = 0; i < 4; i++) {
if ((write_mask >> i) & 1) {
values[i] =
bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
values[i]);
}
}
} else if (is_16bit) {
for (unsigned i = 0; i < 4; i++) {
if ((write_mask >> i) & 1) {
Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
values[i] = Operand(tmp);
}
}
}
enabled_channels = util_widen_mask(write_mask, 2);
compr = true;
break;
case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
@ -10764,23 +10652,7 @@ export_fs_mrt_color(isel_context* ctx, int slot)
default: return false;
}
if ((bool)compr_op) {
for (int i = 0; i < 2; i++) {
/* check if at least one of the values to be compressed is enabled */
bool enabled = (write_mask >> (i * 2)) & 0x3;
if (enabled) {
enabled_channels |= 0x3 << (i * 2);
values[i] = bld.vop3(
compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
} else {
values[i] = Operand(v1);
}
}
values[2] = Operand(v1);
values[3] = Operand(v1);
compr = true;
} else if (!compr) {
if (!compr) {
for (int i = 0; i < 4; i++)
values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
}

View file

@ -3872,6 +3872,169 @@ radv_lower_vs_input(nir_shader *nir, const struct radv_pipeline_key *pipeline_ke
return progress;
}
/**
 * Lower fragment shader color exports in NIR.
 *
 * Visits every store_output intrinsic of the shader entrypoint whose base is
 * at or above FRAG_RESULT_DATA0 and rewrites its source according to the
 * color export format stored for that slot in the pipeline key:
 *
 *  - optionally replaces NaN channels by zero (32-bit sources only),
 *  - converts and/or clamps the channels for the packed 16-bit formats,
 *  - packs channel pairs into 32-bit values and shrinks the write mask and
 *    the component count of the store accordingly.
 *
 * Stores whose export format is V_028714_SPI_SHADER_ZERO are left untouched.
 *
 * \param nir           Fragment shader to lower.
 * \param pipeline_key  Pipeline key; ps.col_format, ps.is_int8, ps.is_int10
 *                      and ps.enable_mrt_output_nan_fixup are read.
 * \return true if at least one store_output was rewritten.
 */
static bool
radv_lower_fs_output(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_output)
            continue;

         /* Only outputs at or above FRAG_RESULT_DATA0 are handled here. */
         int slot = nir_intrinsic_base(intrin) - FRAG_RESULT_DATA0;
         if (slot < 0)
            continue;

         /* Per-slot state: 4 bits of export format, 1 bit for each int flag. */
         unsigned write_mask = nir_intrinsic_write_mask(intrin);
         unsigned col_format = (pipeline_key->ps.col_format >> (4 * slot)) & 0xf;
         bool is_int8 = (pipeline_key->ps.is_int8 >> slot) & 1;
         bool is_int10 = (pipeline_key->ps.is_int10 >> slot) & 1;
         const unsigned src_bit_size = intrin->src[0].ssa->bit_size;
         bool is_16bit = src_bit_size == 16;

         if (col_format == V_028714_SPI_SHADER_ZERO)
            continue;

         b.cursor = nir_before_instr(instr);
         nir_ssa_def *values[4];

         /* Extract the export values.
          *
          * Channels that are not written get an undef of the same bit size as
          * the source, so that the nir_vec built at the end for the unpacked
          * formats never mixes 16-bit and 32-bit channels. The packed path
          * below never reads these placeholders: it substitutes zero for
          * unwritten channels and re-creates 32-bit undefs for disabled pairs.
          */
         for (unsigned i = 0; i < 4; i++) {
            if (write_mask & (1 << i)) {
               values[i] = nir_channel(&b, intrin->src[0].ssa, i);
            } else {
               values[i] = nir_ssa_undef(&b, 1, src_bit_size);
            }
         }

         /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
         if (pipeline_key->ps.enable_mrt_output_nan_fixup && !nir->info.internal && !is_16bit &&
             (col_format == V_028714_SPI_SHADER_32_R ||
              col_format == V_028714_SPI_SHADER_32_GR ||
              col_format == V_028714_SPI_SHADER_32_AR ||
              col_format == V_028714_SPI_SHADER_32_ABGR ||
              col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
            u_foreach_bit(i, write_mask) {
               /* Build the x != x test with the builder's exact flag set, then
                * restore the previous setting (presumably so the NaN test is
                * not folded away by later optimizations).
                */
               const bool save_exact = b.exact;

               b.exact = true;
               nir_ssa_def *isnan = nir_fneu(&b, values[i], values[i]);
               b.exact = save_exact;

               values[i] = nir_bcsel(&b, isnan, nir_imm_zero(&b, 1, 32), values[i]);
            }
         }

         if (col_format == V_028714_SPI_SHADER_FP16_ABGR ||
             col_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
             col_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
             col_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            /* Convert and/or clamp the export values. */
            switch (col_format) {
            case V_028714_SPI_SHADER_UINT16_ABGR: {
               int max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;

               u_foreach_bit(i, write_mask) {
                  if (is_int8 || is_int10) {
                     /* Channel 3 of int10 targets is clamped to 3 (presumably
                      * the 2-bit alpha of 10_10_10_2 formats).
                      */
                     const bool narrow_alpha = i == 3 && is_int10;

                     values[i] =
                        nir_umin(&b, values[i], nir_imm_int(&b, narrow_alpha ? 3 : max_rgb));
                  } else if (is_16bit) {
                     values[i] = nir_u2u32(&b, values[i]);
                  }
               }
               break;
            }
            case V_028714_SPI_SHADER_SINT16_ABGR: {
               /* Signed bounds are kept in signed variables so that passing
                * them to nir_imm_int() involves no unsigned-to-signed
                * conversion.
                */
               int max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
               int min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;

               u_foreach_bit(i, write_mask) {
                  if (is_int8 || is_int10) {
                     /* Channel 3 of int10 targets is clamped to [-2, 1]. */
                     const bool narrow_alpha = i == 3 && is_int10;

                     values[i] =
                        nir_imin(&b, values[i], nir_imm_int(&b, narrow_alpha ? 1 : max_rgb));
                     values[i] =
                        nir_imax(&b, values[i], nir_imm_int(&b, narrow_alpha ? -2 : min_rgb));
                  } else if (is_16bit) {
                     values[i] = nir_i2i32(&b, values[i]);
                  }
               }
               break;
            }
            case V_028714_SPI_SHADER_UNORM16_ABGR:
            case V_028714_SPI_SHADER_SNORM16_ABGR:
               /* The normalized pack opcodes used below are fed 32-bit floats. */
               u_foreach_bit(i, write_mask) {
                  if (is_16bit) {
                     values[i] = nir_f2f32(&b, values[i]);
                  }
               }
               break;
            default:
               /* FP16_ABGR needs no conversion before packing. */
               break;
            }

            /* Only nir_pack_32_2x16_split needs 16-bit inputs. */
            bool input_16_bit = col_format == V_028714_SPI_SHADER_FP16_ABGR && is_16bit;
            unsigned new_write_mask = 0;

            /* Pack the export values: channels (0,1) go to values[0] and
             * channels (2,3) go to values[1].
             */
            for (unsigned i = 0; i < 2; i++) {
               /* A pair is enabled if at least one of its two channels is written. */
               bool enabled = (write_mask >> (i * 2)) & 0x3;

               if (!enabled) {
                  values[i] = nir_ssa_undef(&b, 1, 32);
                  continue;
               }

               nir_ssa_def *src0 = values[i * 2];
               nir_ssa_def *src1 = values[i * 2 + 1];

               /* The unwritten half of an enabled pair is packed as zero. */
               if (!(write_mask & (1 << (i * 2))))
                  src0 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
               if (!(write_mask & (1 << (i * 2 + 1))))
                  src1 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);

               if (col_format == V_028714_SPI_SHADER_FP16_ABGR) {
                  if (is_16bit) {
                     values[i] = nir_pack_32_2x16_split(&b, src0, src1);
                  } else {
                     values[i] = nir_pack_half_2x16_split(&b, src0, src1);
                  }
               } else if (col_format == V_028714_SPI_SHADER_UNORM16_ABGR) {
                  values[i] = nir_pack_unorm_2x16(&b, nir_vec2(&b, src0, src1));
               } else if (col_format == V_028714_SPI_SHADER_SNORM16_ABGR) {
                  values[i] = nir_pack_snorm_2x16(&b, nir_vec2(&b, src0, src1));
               } else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR) {
                  values[i] = nir_pack_uint_2x16(&b, nir_vec2(&b, src0, src1));
               } else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
                  values[i] = nir_pack_sint_2x16(&b, nir_vec2(&b, src0, src1));
               }

               new_write_mask |= 1 << i;
            }

            /* Update the write mask for compressed outputs. */
            nir_intrinsic_set_write_mask(intrin, new_write_mask);
            intrin->num_components = util_last_bit(new_write_mask);
         }

         /* Rebuild the store source from the (possibly packed) channels. */
         nir_ssa_def *new_src = nir_vec(&b, values, intrin->num_components);
         nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(new_src));

         progress = true;
      }
   }

   return progress;
}
VkResult
radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
struct radv_device *device, struct radv_pipeline_cache *cache,
@ -4010,6 +4173,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
NIR_PASS_V(nir[MESA_SHADER_VERTEX], radv_lower_vs_input, pipeline_key);
}
if (nir[MESA_SHADER_FRAGMENT] && !radv_use_llvm_for_stage(device, MESA_SHADER_FRAGMENT)) {
/* TODO: Convert the LLVM backend. */
NIR_PASS_V(nir[MESA_SHADER_FRAGMENT], radv_lower_fs_output, pipeline_key);
}
radv_fill_shader_info(pipeline, pipeline_layout, pStages, pipeline_key, infos, nir);
bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) ||