diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 9fe2f9ae8d5..982885e3669 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -51,29 +51,44 @@ emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo); } +bool +can_use_shared_vgprs(isel_context* ctx) +{ + /* Shared VGPRs are only considered on GFX10 in wave64 mode. Avoid using them + * for shuffle when the shader consists of multiple binaries, because the VGPR + * use is not known when choosing which registers to use for the shared VGPRs. + */ + return ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 && + ctx->program->wave_size == 64 && !ctx->program->info.ps.has_epilog && + !ctx->program->info.merged_shader_compiled_separately && + !ctx->program->info.vs.has_prolog && ctx->stage != raytracing_cs; +} + +void +enable_shared_vgprs(isel_context* ctx) +{ + assert(can_use_shared_vgprs(ctx)); + if (ctx->program->config->num_shared_vgprs) + return; + + /* We need one pair of shared VGPRs. + * Note that these have twice the allocation granularity of normal VGPRs. + */ + ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; +} + Temp emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) { if (index.regClass() == s1) return bld.readlane(bld.def(s1), data, index); - /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists - * of multiple binaries, because the VGPR use is not known when choosing - * which registers to use for the shared VGPRs. 
- */ - const bool avoid_shared_vgprs = - ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 && - ctx->program->wave_size == 64 && - (ctx->program->info.ps.has_epilog || ctx->program->info.merged_shader_compiled_separately || - ctx->program->info.vs.has_prolog || ctx->stage == raytracing_cs); - - if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) { - /* GFX6-7: there is no bpermute instruction */ - return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm), - bld.def(bld.lm, vcc), index, data); - } else if (ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level <= GFX11_5 && - ctx->program->wave_size == 64) { - + if ((ctx->options->gfx_level >= GFX8 && ctx->options->gfx_level < GFX10) || + ctx->options->gfx_level >= GFX12 || ctx->program->wave_size == 32) { + /* wave32 or GFX8-9, GFX12+: bpermute works normally */ + Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); + return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); + } else if (ctx->options->gfx_level >= GFX11 || can_use_shared_vgprs(ctx)) { /* GFX10-11.5 wave64 mode: emulate full-wave bpermute */ Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index); @@ -86,10 +101,7 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); if (ctx->options->gfx_level <= GFX10_3) { - /* We need one pair of shared VGPRs: - * Note, that these have twice the allocation granularity of normal VGPRs - */ - ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; + enable_shared_vgprs(ctx); return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, data, same_half); @@ -98,9 +110,9 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) bld.def(s1, scc), Operand(v1.as_linear()), 
index_x4, data, same_half); } } else { - /* wave32 or GFX8-9, GFX12+: bpermute works normally */ - Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); - return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); + /* GFX6-7 (no bpermute instruction) or GFX10 wave64 without shared VGPRs */ + return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm), + bld.def(bld.lm, vcc), index, data); } }