ac,aco,radeonsi: replace SampleMaskIn with 1 << SampleID if full sample shading

With full sample shading, the sample mask is always 1 << sample_id, so
compute that value instead of loading sample_mask_in, and force the result
to 0 for helper invocations. This removes the sample mask input VGPR.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33024>
This commit is contained in:
Marek Olšák 2025-01-02 12:39:03 -05:00
parent b1fc34f290
commit d7d4d56f5b
5 changed files with 67 additions and 24 deletions

View file

@ -254,12 +254,19 @@ lower_ps_load_sample_mask_in(nir_builder *b, nir_intrinsic_instr *intrin, lower_
* The samplemask loaded by hardware is always the coverage of the
* entire pixel/fragment, so mask bits out based on the sample ID.
*/
uint32_t ps_iter_mask = ac_get_ps_iter_mask(s->options->ps_iter_samples);
nir_def *sampleid = nir_load_sample_id(b);
nir_def *submask = nir_ishl(b, nir_imm_int(b, ps_iter_mask), sampleid);
nir_def *replacement;
nir_def *sample_mask = nir_load_sample_mask_in(b);
nir_def *replacement = nir_iand(b, sample_mask, submask);
/* Set ps_iter_samples=8 if full sample shading is enabled even for 2x and 4x MSAA
* to get this fast path that fully replaces sample_mask_in with sample_id.
*/
if (s->options->ps_iter_samples == 8) {
replacement = nir_bcsel(b, nir_load_helper_invocation(b, 1), nir_imm_int(b, 0),
nir_ishl(b, nir_imm_int(b, 1), nir_load_sample_id(b)));
} else {
uint32_t ps_iter_mask = ac_get_ps_iter_mask(s->options->ps_iter_samples);
nir_def *submask = nir_ishl(b, nir_imm_int(b, ps_iter_mask), nir_load_sample_id(b));
replacement = nir_iand(b, nir_load_sample_mask_in(b), submask);
}
nir_def_replace(&intrin->def, replacement);
return true;

View file

@ -11632,13 +11632,29 @@ overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* fin
Temp ancillary = get_arg(ctx, ctx->args->ancillary);
Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
Operand::c32(4u));
Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
Temp samplemask;
uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
if (finfo->samplemask_log_ps_iter == 3) {
Temp is_helper_invoc =
bld.pseudo(aco_opcode::p_is_helper, bld.def(bld.lm), Operand(exec, bld.lm));
ctx->program->needs_exact = true;
Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
/* samplemask = is_helper ? 0 : (1 << sample_id); */
samplemask =
bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, Operand::c32(1u));
samplemask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), samplemask,
Operand::c32(0u), is_helper_invoc);
} else {
/* samplemask &= ps_iter_mask << sample_id; */
uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
Builder::Op mask = ctx->options->gfx_level >= GFX11
? Operand::c32(ps_iter_mask)
: bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
samplemask = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, mask);
samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
get_arg(ctx, ctx->args->sample_coverage), samplemask);
}
ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
}

View file

@ -2895,9 +2895,13 @@ static void si_fixup_spi_ps_input_config(struct si_shader *shader)
if (!(shader->config.spi_ps_input_ena & 0x7f))
shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
/* Samplemask fixup requires the sample ID. */
/* The sample mask fixup requires the sample ID. */
if (key->ps.part.prolog.samplemask_log_ps_iter)
shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
/* With samplemask_log_ps_iter == 3, the sample mask fixup computes the mask as 1 << sample ID instead of reading the sample coverage input, so don't enable that input. */
if (key->ps.part.prolog.samplemask_log_ps_iter == 3)
shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
}
static void

View file

@ -631,20 +631,29 @@ void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part
* entire pixel/fragment, so mask bits out based on the sample ID.
*/
if (key->ps_prolog.states.samplemask_log_ps_iter) {
uint32_t ps_iter_mask =
ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter);
LLVMValueRef sampleid = si_unpack_param(ctx, args->ac.ancillary, 8, 4);
LLVMValueRef samplemask = ac_get_arg(&ctx->ac, args->ac.sample_coverage);
LLVMValueRef sample_id = si_unpack_param(ctx, args->ac.ancillary, 8, 4);
LLVMValueRef sample_mask_in;
samplemask = ac_to_integer(&ctx->ac, samplemask);
samplemask =
LLVMBuildAnd(ctx->ac.builder, samplemask,
LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
sampleid, ""),
"");
samplemask = ac_to_float(&ctx->ac, samplemask);
/* Set samplemask_log_ps_iter=3 if full sample shading is enabled even for 2x and 4x MSAA
* to get this fast path that fully replaces sample_mask_in with sample_id.
*/
if (key->ps_prolog.states.samplemask_log_ps_iter == 3) {
sample_mask_in =
LLVMBuildSelect(ctx->ac.builder, ac_build_load_helper_invocation(&ctx->ac),
ctx->ac.i32_0,
LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, sample_id, ""), "");
} else {
uint32_t ps_iter_mask =
ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter);
sample_mask_in =
LLVMBuildAnd(ctx->ac.builder,
ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, args->ac.sample_coverage)),
LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
sample_id, ""), "");
}
ret = insert_ret_of_arg(ctx, ret, samplemask, args->ac.sample_coverage.arg_index);
sample_mask_in = ac_to_float(&ctx->ac, sample_mask_in);
ret = insert_ret_of_arg(ctx, ret, sample_mask_in, args->ac.sample_coverage.arg_index);
}
/* Tell LLVM to insert WQM instruction sequence when needed. */

View file

@ -2816,9 +2816,16 @@ void si_ps_key_update_sample_shading(struct si_context *sctx)
union si_shader_key *key = &sctx->shader.ps.key;
unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
assert(ps_iter_samples <= MAX2(1, sctx->framebuffer.nr_color_samples));
if (ps_iter_samples > 1 && sel->info.reads_samplemask) {
key->ps.part.prolog.samplemask_log_ps_iter = util_logbase2(ps_iter_samples);
/* Set samplemask_log_ps_iter=3 if full sample shading is enabled even for 2x and 4x MSAA
* to get the fast path that fully replaces sample_mask_in with sample_id.
*/
if (ps_iter_samples == sctx->framebuffer.nr_color_samples)
key->ps.part.prolog.samplemask_log_ps_iter = 3;
else
key->ps.part.prolog.samplemask_log_ps_iter = util_logbase2(ps_iter_samples);
} else {
key->ps.part.prolog.samplemask_log_ps_iter = 0;
}