diff --git a/src/amd/common/nir/ac_nir_lower_ps_early.c b/src/amd/common/nir/ac_nir_lower_ps_early.c index f4f0369f786..ce61485b98a 100644 --- a/src/amd/common/nir/ac_nir_lower_ps_early.c +++ b/src/amd/common/nir/ac_nir_lower_ps_early.c @@ -254,12 +254,19 @@ lower_ps_load_sample_mask_in(nir_builder *b, nir_intrinsic_instr *intrin, lower_ * The samplemask loaded by hardware is always the coverage of the * entire pixel/fragment, so mask bits out based on the sample ID. */ - uint32_t ps_iter_mask = ac_get_ps_iter_mask(s->options->ps_iter_samples); - nir_def *sampleid = nir_load_sample_id(b); - nir_def *submask = nir_ishl(b, nir_imm_int(b, ps_iter_mask), sampleid); + nir_def *replacement; - nir_def *sample_mask = nir_load_sample_mask_in(b); - nir_def *replacement = nir_iand(b, sample_mask, submask); + /* Set ps_iter_samples=8 if full sample shading is enabled even for 2x and 4x MSAA + * to get this fast path that fully replaces sample_mask_in with sample_id. + */ + if (s->options->ps_iter_samples == 8) { + replacement = nir_bcsel(b, nir_load_helper_invocation(b, 1), nir_imm_int(b, 0), + nir_ishl(b, nir_imm_int(b, 1), nir_load_sample_id(b))); + } else { + uint32_t ps_iter_mask = ac_get_ps_iter_mask(s->options->ps_iter_samples); + nir_def *submask = nir_ishl(b, nir_imm_int(b, ps_iter_mask), nir_load_sample_id(b)); + replacement = nir_iand(b, nir_load_sample_mask_in(b), submask); + } nir_def_replace(&intrin->def, replacement); return true; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 07e1465d13f..dc1fc21af74 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11632,13 +11632,29 @@ overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* fin Temp ancillary = get_arg(ctx, ctx->args->ancillary); Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u), Operand::c32(4u)); - Temp samplemask = get_arg(ctx, ctx->args->sample_coverage); + Temp samplemask; - uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter); - Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask)); + if (finfo->samplemask_log_ps_iter == 3) { + Temp is_helper_invoc = + bld.pseudo(aco_opcode::p_is_helper, bld.def(bld.lm), Operand(exec, bld.lm)); + ctx->program->needs_exact = true; - Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask); - samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask); + /* samplemask = is_helper ? 0 : (1 << sample_id); */ + samplemask = + bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, Operand::c32(1u)); + samplemask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), samplemask, + Operand::c32(0u), is_helper_invoc); + } else { + /* samplemask &= ps_iter_mask << sample_id; */ + uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter); + Builder::Op mask = ctx->options->gfx_level >= GFX11 + ? Operand::c32(ps_iter_mask) + : bld.copy(bld.def(v1), Operand::c32(ps_iter_mask)); + + samplemask = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, mask); + samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), + get_arg(ctx, ctx->args->sample_coverage), samplemask); + } ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask; } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 1bd425ee0c7..35cc931e513 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2895,9 +2895,13 @@ static void si_fixup_spi_ps_input_config(struct si_shader *shader) if (!(shader->config.spi_ps_input_ena & 0x7f)) shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - /* Samplemask fixup requires the sample ID. */ + /* The sample mask fixup requires the sample ID. */ if (key->ps.part.prolog.samplemask_log_ps_iter) shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); + + /* The sample mask fixup has an optimization that replaces the sample mask with the sample ID. */ + if (key->ps.part.prolog.samplemask_log_ps_iter == 3) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; } static void diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index 7a585a45f3f..18a3037106b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -631,20 +631,29 @@ void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part * entire pixel/fragment, so mask bits out based on the sample ID. */ if (key->ps_prolog.states.samplemask_log_ps_iter) { - uint32_t ps_iter_mask = - ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter); - LLVMValueRef sampleid = si_unpack_param(ctx, args->ac.ancillary, 8, 4); - LLVMValueRef samplemask = ac_get_arg(&ctx->ac, args->ac.sample_coverage); + LLVMValueRef sample_id = si_unpack_param(ctx, args->ac.ancillary, 8, 4); + LLVMValueRef sample_mask_in; - samplemask = ac_to_integer(&ctx->ac, samplemask); - samplemask = - LLVMBuildAnd(ctx->ac.builder, samplemask, - LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), - sampleid, ""), - ""); - samplemask = ac_to_float(&ctx->ac, samplemask); + /* Set samplemask_log_ps_iter=3 if full sample shading is enabled even for 2x and 4x MSAA + * to get this fast path that fully replaces sample_mask_in with sample_id. + */ + if (key->ps_prolog.states.samplemask_log_ps_iter == 3) { + sample_mask_in = + LLVMBuildSelect(ctx->ac.builder, ac_build_load_helper_invocation(&ctx->ac), + ctx->ac.i32_0, + LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, sample_id, ""), ""); + } else { + uint32_t ps_iter_mask = + ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter); + sample_mask_in = + LLVMBuildAnd(ctx->ac.builder, + ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, args->ac.sample_coverage)), + LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), + sample_id, ""), ""); + } - ret = insert_ret_of_arg(ctx, ret, samplemask, args->ac.sample_coverage.arg_index); + sample_mask_in = ac_to_float(&ctx->ac, sample_mask_in); + ret = insert_ret_of_arg(ctx, ret, sample_mask_in, args->ac.sample_coverage.arg_index); } /* Tell LLVM to insert WQM instruction sequence when needed. */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index a2f023ce783..e4377bc91f5 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2816,9 +2816,16 @@ void si_ps_key_update_sample_shading(struct si_context *sctx) union si_shader_key *key = &sctx->shader.ps.key; unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); + assert(ps_iter_samples <= MAX2(1, sctx->framebuffer.nr_color_samples)); if (ps_iter_samples > 1 && sel->info.reads_samplemask) { - key->ps.part.prolog.samplemask_log_ps_iter = util_logbase2(ps_iter_samples); + /* Set samplemask_log_ps_iter=3 if full sample shading is enabled even for 2x and 4x MSAA + * to get the fast path that fully replaces sample_mask_in with sample_id. + */ + if (ps_iter_samples == sctx->framebuffer.nr_color_samples) + key->ps.part.prolog.samplemask_log_ps_iter = 3; + else + key->ps.part.prolog.samplemask_log_ps_iter = util_logbase2(ps_iter_samples); } else { key->ps.part.prolog.samplemask_log_ps_iter = 0; }