From 86c227c10cb729aef05bc8a26b56957e3fa7683b Mon Sep 17 00:00:00 2001
From: Tony Wasserka
Date: Mon, 5 Oct 2020 17:50:37 +0200
Subject: [PATCH] aco: Use strong typing to model SW<->HW stage mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Timur Kristóf
Acked-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_assembler.cpp            |   9 +-
 src/amd/compiler/aco_insert_exec_mask.cpp     |   2 +-
 .../compiler/aco_instruction_selection.cpp    |  41 +++---
 src/amd/compiler/aco_instruction_selection.h  |   2 +-
 .../aco_instruction_selection_setup.cpp       | 103 +++++++------
 src/amd/compiler/aco_ir.h                     | 127 ++++++++++++------
 src/amd/compiler/aco_lower_to_hw_instr.cpp    |   2 +-
 src/amd/compiler/aco_scheduler.cpp            |   3 +-
 8 files changed, 174 insertions(+), 115 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 755bed09e1a..24cb39e656c 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -732,7 +732,7 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
       {
          if ((*it)->format == Format::EXP) {
             Export_instruction* exp = static_cast<Export_instruction*>((*it).get());
-            if (program->stage & (hw_vs | hw_ngg_gs)) {
+            if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) {
                if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) {
                   exp->done = true;
                   exported = true;
@@ -752,7 +752,8 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)

    if (!exported) {
       /* Abort in order to avoid a GPU hang. */
-      aco_err(program, "Missing export in %s shader:", (program->stage & (hw_vs | hw_ngg_gs)) ? "vertex or NGG" : "fragment");
+      bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS);
+      aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
       aco_print_program(program, stderr);
       abort();
    }
@@ -917,7 +918,9 @@ unsigned emit_program(Program* program,
 {
    asm_context ctx(program);

-   if (program->stage & (hw_vs | hw_fs | hw_ngg_gs))
+   if (program->stage.hw == HWStage::VS ||
+       program->stage.hw == HWStage::FS ||
+       program->stage.hw == HWStage::NGG_GS)
       fix_exports(ctx, code, program);

    for (Block& block : program->blocks) {
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index ebd44ade4cd..c6070e7531e 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -379,7 +379,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       bld.insert(std::move(startpgm));

       /* exec seems to need to be manually initialized with combined shaders */
-      if (util_bitcount(ctx.program->stage & sw_mask) > 1 || (ctx.program->stage & hw_ngg_gs)) {
+      if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG_GS) {
          bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
         instructions[0]->definitions.pop_back();
       }
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index c93032b495c..babd769aa42 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4252,7 +4252,7 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
    if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs ||
        ctx->stage == ngg_vertex_geometry_gs || ctx->stage == ngg_tess_eval_geometry_gs) {
       /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
-      unsigned itemsize = (ctx->stage & sw_vs)
+      unsigned itemsize = ctx->stage.has(SWStage::VS)
                           ? ctx->program->info->vs.es_info.esgs_itemsize
                           : ctx->program->info->tes.es_info.esgs_itemsize;
       Temp vertex_idx = thread_id_in_threadgroup(ctx);
@@ -4363,9 +4363,9 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
          abort();
       }
-   } else if ((ctx->stage & (hw_ls | hw_es)) ||
+   } else if ((ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::ES) ||
               (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
-              ((ctx->stage & sw_gs) && ctx->shader->info.stage != MESA_SHADER_GEOMETRY)) {
+              (ctx->stage.has(SWStage::GS) && ctx->shader->info.stage != MESA_SHADER_GEOMETRY)) {
      visit_store_ls_or_es_output(ctx, instr);
    } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
      visit_store_tcs_output(ctx, instr, false);
@@ -7588,7 +7588,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_load_view_index: {
-      if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
+      if (ctx->stage.has(SWStage::VS) ||
+          ctx->stage.has(SWStage::GS) ||
+          ctx->stage.has(SWStage::TCS) ||
+          ctx->stage.has(SWStage::TES)) {
          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
          break;
@@ -8348,21 +8351,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_emit_vertex_with_counter: {
-      if (ctx->stage & hw_ngg_gs)
+      if (ctx->stage.hw == HWStage::NGG_GS)
         ngg_visit_emit_vertex_with_counter(ctx, instr);
      else
         visit_emit_vertex_with_counter(ctx, instr);
      break;
    }
    case nir_intrinsic_end_primitive_with_counter: {
-      if ((ctx->stage & hw_ngg_gs) == 0) {
+      if (ctx->stage.hw != HWStage::NGG_GS) {
         unsigned stream = nir_intrinsic_stream_id(instr);
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
      }
      break;
    }
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (ctx->stage & hw_ngg_gs)
+      if (ctx->stage.hw == HWStage::NGG_GS)
         ngg_visit_set_vertex_and_primitive_count(ctx, instr);
      /* unused in the legacy pipeline, the HW keeps track of this for us */
      break;
@@ -10079,9 +10082,9 @@ static bool visit_cf_list(isel_context *ctx,

 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
 {
-   assert(ctx->stage & (hw_vs | hw_ngg_gs));
+   assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG_GS);

-   int offset = ((ctx->stage & sw_tes) && !(ctx->stage & sw_gs))
+   int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
                ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
                : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
    uint64_t mask = ctx->outputs.mask[slot];
@@ -10176,15 +10179,15 @@ static void create_export_phis(isel_context *ctx)

 static void create_vs_exports(isel_context *ctx)
 {
-   assert(ctx->stage & (hw_vs | hw_ngg_gs));
+   assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG_GS);

-   radv_vs_output_info *outinfo = ((ctx->stage & sw_tes) && !(ctx->stage & sw_gs))
+   radv_vs_output_info *outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
                                   ? &ctx->program->info->tes.outinfo
                                   : &ctx->program->info->vs.outinfo;

-   if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
+   if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG_GS) {
       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
-      if (ctx->stage & sw_tes)
+      if (ctx->stage.has(SWStage::TES))
          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.tes_patch_id);
       else
          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
@@ -10646,7 +10649,7 @@ static void emit_stream_output(isel_context *ctx,
    Temp out[4];
    bool all_undef = true;
-   assert(ctx->stage & hw_vs);
+   assert(ctx->stage.hw == HWStage::VS);
    for (unsigned i = 0; i < num_comps; i++) {
       out[i] = ctx->outputs.temps[loc * 4 + start + i];
       all_undef = all_undef && !out[i].id();
    }
@@ -11055,7 +11058,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Temp prm_cnt = Temp())

    /* VS/TES: we infer the vertex and primitive count from arguments
     * GS: the caller needs to supply them */
-   assert((ctx->stage & sw_gs)
+   assert(ctx->stage.has(SWStage::GS)
           ? (vtx_cnt.id() && prm_cnt.id())
           : (!vtx_cnt.id() && !prm_cnt.id()));

@@ -11117,7 +11120,7 @@ void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive
    Builder bld(ctx->program, ctx->block);
    Temp prim_exp_arg;

-   if (!(ctx->stage & sw_gs) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
+   if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
       prim_exp_arg = get_arg(ctx, ctx->args->gs_vtx_offset[0]);
    else
       prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null);
@@ -11144,7 +11147,7 @@ void ngg_nogs_export_primitives(isel_context *ctx)
    constexpr unsigned max_vertices_per_primitive = 3;
    unsigned num_vertices_per_primitive = max_vertices_per_primitive;

-   assert(!(ctx->stage & sw_gs));
+   assert(!ctx->stage.has(SWStage::GS));

    if (ctx->stage == ngg_vertex_gs) {
       /* TODO: optimize for points & lines */
@@ -11711,10 +11714,10 @@ void select_program(Program *program,

       visit_cf_list(&ctx, &func->body);

-      if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
+      if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
          emit_streamout(&ctx, 0);

-      if (ctx.stage & hw_vs) {
+      if (ctx.stage.hw == HWStage::VS) {
          create_vs_exports(&ctx);
          ctx.block->kind |= block_kind_export_end;
       } else if (ngg_no_gs && ctx.ngg_nogs_early_prim_export) {
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 5aa7f0a3599..107d116d9fd 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -61,7 +61,7 @@ struct isel_context {
    Block *block;
    std::unique_ptr<Temp[]> allocated;
    std::unordered_map<unsigned, std::array<Temp, NIR_MAX_VEC_COMPONENTS>> allocated_vec;
-   Stage stage; /* Stage */
+   Stage stage;
    bool has_gfx10_wave64_bpermute = false;
    struct {
       bool has_branch;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index cc5b78f0bb0..c119c6eaeeb 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -435,7 +435,7 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
                             ctx->options->key.vs_common_out.export_clip_dists, outinfo);

    /* TODO: NGG streamout */
-   if (ctx->stage & hw_ngg_gs)
+   if (ctx->stage.hw == HWStage::NGG_GS)
       assert(!ctx->args->shader_info->so.num_outputs);

    /* TODO: check if the shader writes edge flags (not in Vulkan) */
@@ -481,9 +481,9 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
       ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
    }

-   if (ctx->stage & sw_vs)
+   if (ctx->stage.has(SWStage::VS))
       ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
-   else if (ctx->stage & sw_tes)
+   else if (ctx->stage.has(SWStage::TES))
       ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
 }

@@ -550,7 +550,7 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir)
                             ctx->options->key.vs_common_out.export_clip_dists, outinfo);

    /* TODO: NGG streamout */
-   if (ctx->stage & hw_ngg_gs)
+   if (ctx->stage.hw == HWStage::NGG_GS)
       assert(!ctx->args->shader_info->so.num_outputs);

    /* Tess eval shaders can't write edge flags, so this can be always true. */
@@ -644,7 +644,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
    /* we'll need this for isel */
    nir_metadata_require(impl, nir_metadata_block_index);

-   if (!(ctx->stage & sw_gs_copy) && ctx->options->dump_preoptir) {
+   if (!ctx->stage.has(SWStage::GSCopy) && ctx->options->dump_preoptir) {
       fprintf(stderr, "NIR shader before instruction selection:\n");
       nir_print_shader(shader, stderr);
    }
@@ -1022,26 +1022,26 @@ setup_isel_context(Program* program,
                    struct radv_shader_args *args,
                    bool is_gs_copy_shader)
 {
-   Stage stage = 0;
+   SWStage sw_stage = SWStage::None;
    for (unsigned i = 0; i < shader_count; i++) {
       switch (shaders[i]->info.stage) {
       case MESA_SHADER_VERTEX:
-         stage |= sw_vs;
+         sw_stage = sw_stage | SWStage::VS;
          break;
       case MESA_SHADER_TESS_CTRL:
-         stage |= sw_tcs;
+         sw_stage = sw_stage | SWStage::TCS;
          break;
       case MESA_SHADER_TESS_EVAL:
-         stage |= sw_tes;
+         sw_stage = sw_stage | SWStage::TES;
          break;
       case MESA_SHADER_GEOMETRY:
-         stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
+         sw_stage = sw_stage | (is_gs_copy_shader ? SWStage::GSCopy : SWStage::GS);
          break;
       case MESA_SHADER_FRAGMENT:
-         stage |= sw_fs;
+         sw_stage = sw_stage | SWStage::FS;
          break;
       case MESA_SHADER_COMPUTE:
-         stage |= sw_cs;
+         sw_stage = sw_stage | SWStage::CS;
          break;
       default:
          unreachable("Shader stage not implemented");
@@ -1049,44 +1049,45 @@ setup_isel_context(Program* program,
       }
    }
    bool gfx9_plus = args->options->chip_class >= GFX9;
    bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
-   if (stage == sw_vs && args->shader_info->vs.as_es && !ngg)
-      stage |= hw_es;
-   else if (stage == sw_vs && !args->shader_info->vs.as_ls && !ngg)
-      stage |= hw_vs;
-   else if (stage == sw_vs && ngg)
-      stage |= hw_ngg_gs; /* GFX10/NGG: VS without GS uses the HW GS stage */
-   else if (stage == sw_gs)
-      stage |= hw_gs;
-   else if (stage == sw_fs)
-      stage |= hw_fs;
-   else if (stage == sw_cs)
-      stage |= hw_cs;
-   else if (stage == sw_gs_copy)
-      stage |= hw_vs;
-   else if (stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
-      stage |= hw_gs; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
-   else if (stage == (sw_vs | sw_gs) && ngg)
-      stage |= hw_ngg_gs; /* GFX10+: VS+GS merged into an NGG GS */
-   else if (stage == sw_vs && args->shader_info->vs.as_ls)
-      stage |= hw_ls; /* GFX6-8: VS is a Local Shader, when tessellation is used */
-   else if (stage == sw_tcs)
-      stage |= hw_hs; /* GFX6-8: TCS is a Hull Shader */
-   else if (stage == (sw_vs | sw_tcs))
-      stage |= hw_hs; /* GFX9-10: VS+TCS merged into a Hull Shader */
-   else if (stage == sw_tes && !args->shader_info->tes.as_es && !ngg)
-      stage |= hw_vs; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
-   else if (stage == sw_tes && !args->shader_info->tes.as_es && ngg)
-      stage |= hw_ngg_gs; /* GFX10/NGG: TES without GS uses the HW GS stage */
-   else if (stage == sw_tes && args->shader_info->tes.as_es && !ngg)
-      stage |= hw_es; /* GFX6-8: TES is an Export Shader */
-   else if (stage == (sw_tes | sw_gs) && gfx9_plus && !ngg)
-      stage |= hw_gs; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
-   else if (stage == (sw_tes | sw_gs) && ngg)
-      stage |= hw_ngg_gs; /* GFX10+: TES+GS merged into an NGG GS */
+   HWStage hw_stage { };
+   if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg)
+      hw_stage = HWStage::ES;
+   else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg)
+      hw_stage = HWStage::VS;
+   else if (sw_stage == SWStage::VS && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10/NGG: VS without GS uses the HW GS stage */
+   else if (sw_stage == SWStage::GS)
+      hw_stage = HWStage::GS;
+   else if (sw_stage == SWStage::FS)
+      hw_stage = HWStage::FS;
+   else if (sw_stage == SWStage::CS)
+      hw_stage = HWStage::CS;
+   else if (sw_stage == SWStage::GSCopy)
+      hw_stage = HWStage::VS;
+   else if (sw_stage == SWStage::VS_GS && gfx9_plus && !ngg)
+      hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
+   else if (sw_stage == SWStage::VS_GS && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10+: VS+GS merged into an NGG GS */
+   else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls)
+      hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */
+   else if (sw_stage == SWStage::TCS)
+      hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */
+   else if (sw_stage == SWStage::VS_TCS)
+      hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */
+   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg)
+      hw_stage = HWStage::VS; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
+   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10/NGG: TES without GS uses the HW GS stage */
+   else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg)
+      hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */
+   else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg)
+      hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
+   else if (sw_stage == SWStage::TES_GS && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10+: TES+GS merged into an NGG GS */
    else
       unreachable("Shader stage not implemented");

-   init_program(program, stage, args->shader_info,
+   init_program(program, Stage { hw_stage, sw_stage }, args->shader_info,
                 args->options->chip_class, args->options->family, config);

    isel_context ctx = {};
@@ -1096,7 +1097,7 @@ setup_isel_context(Program* program,
    ctx.stage = program->stage;

    /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
-   if (program->stage & (hw_vs | hw_fs)) {
+   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) {
       /* PS and legacy VS have separate waves, no workgroups */
       program->workgroup_size = program->wave_size;
    } else if (program->stage == compute_cs) {
@@ -1104,10 +1105,10 @@ setup_isel_context(Program* program,
       program->workgroup_size = shaders[0]->info.cs.local_size[0] *
                                 shaders[0]->info.cs.local_size[1] *
                                 shaders[0]->info.cs.local_size[2];
-   } else if ((program->stage & hw_es) || program->stage == geometry_gs) {
+   } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) {
       /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */
       program->workgroup_size = program->wave_size;
-   } else if (program->stage & hw_gs) {
+   } else if (program->stage.hw == HWStage::GS) {
       /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
       assert(program->chip_class >= GFX9);
       uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
@@ -1125,7 +1126,7 @@ setup_isel_context(Program* program,
       /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */
       setup_tcs_info(&ctx, shaders[1], shaders[0]);
       program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
-   } else if (program->stage & hw_ngg_gs) {
+   } else if (program->stage.hw == HWStage::NGG_GS) {
       gfx10_ngg_info &ngg_info = args->shader_info->ngg_info;

       /* Max ES (SW VS) threads */
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index d0df37bc6a5..d17a4fb99b5 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1500,50 +1500,101 @@ struct Block {
    Block() : index(0) {}
 };

-using Stage = uint16_t;
-
-/* software stages */
-static constexpr Stage sw_vs = 1 << 0;
-static constexpr Stage sw_gs = 1 << 1;
-static constexpr Stage sw_tcs = 1 << 2;
-static constexpr Stage sw_tes = 1 << 3;
-static constexpr Stage sw_fs = 1 << 4;
-static constexpr Stage sw_cs = 1 << 5;
-static constexpr Stage sw_gs_copy = 1 << 6;
-static constexpr Stage sw_mask = 0x7f;
-
-/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
-static constexpr Stage hw_vs = 1 << 7;
-static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
-static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
-static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
-static constexpr Stage hw_fs = 1 << 13;
-static constexpr Stage hw_cs = 1 << 14;
-static constexpr Stage hw_mask = 0xff << 7;
+/*
+ * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
+ */
+enum class SWStage : uint8_t {
+   None = 0,
+   VS = 1 << 0,     /* Vertex Shader */
+   GS = 1 << 1,     /* Geometry Shader */
+   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
+   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
+   FS = 1 << 4,     /* Fragment aka Pixel Shader */
+   CS = 1 << 5,     /* Compute Shader */
+   GSCopy = 1 << 6, /* GS Copy Shader (internal) */
+
+   /* Stage combinations merged to run on a single HWStage */
+   VS_GS = VS | GS,
+   VS_TCS = VS | TCS,
+   TES_GS = TES | GS,
+};
+
+constexpr SWStage operator|(SWStage a, SWStage b) {
+   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
+}
+
+/*
+ * Shader stages as running on the AMD GPU.
+ *
+ * The relation between HWStages and SWStages is not a one-to-one mapping:
+ * Some SWStages are merged by ACO to run on a single HWStage.
+ * See README.md for details.
+ */
+enum class HWStage : uint8_t {
+   VS,
+   ES,     /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
+   GS,     /* Geometry shader on GFX10/legacy and GFX6-9. */
+   NGG_GS, /* Geometry shader on GFX10/NGG. */
+   LS,     /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
+   HS,     /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
+   FS,
+   CS,
+};
+
+/*
+ * Set of SWStages to be merged into a single shader paired with the
+ * HWStage it will run on.
+ */
+struct Stage {
+   constexpr Stage() = default;
+
+   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { }
+
+   /* Check if the given SWStage is included */
+   constexpr bool has(SWStage stage) const {
+      return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
+   }
+
+   unsigned num_sw_stages() const {
+      return util_bitcount(static_cast<uint8_t>(sw));
+   }
+
+   constexpr bool operator==(const Stage& other) const {
+      return sw == other.sw && hw == other.hw;
+   }
+
+   constexpr bool operator!=(const Stage& other) const {
+      return sw != other.sw || hw != other.hw;
+   }
+
+   /* Mask of merged software stages */
+   SWStage sw = SWStage::None;
+
+   /* Active hardware stage */
+   HWStage hw {};
+};

 /* possible settings of Program::stage */
-static constexpr Stage vertex_vs = sw_vs | hw_vs;
-static constexpr Stage fragment_fs = sw_fs | hw_fs;
-static constexpr Stage compute_cs = sw_cs | hw_cs;
-static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
-static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
+static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
+static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
+static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
+static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
+static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
 /* GFX10/NGG */
-static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
-static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
-static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
-static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
+static constexpr Stage ngg_vertex_gs(HWStage::NGG_GS, SWStage::VS);
+static constexpr Stage ngg_vertex_geometry_gs(HWStage::NGG_GS, SWStage::VS_GS);
+static constexpr Stage ngg_tess_eval_gs(HWStage::NGG_GS, SWStage::TES);
+static constexpr Stage ngg_tess_eval_geometry_gs(HWStage::NGG_GS, SWStage::TES_GS);
 /* GFX9 (and GFX10 if NGG isn't used) */
-static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
-static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
-static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
+static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
+static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
+static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
 /* pre-GFX9 */
-static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
-static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
-static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
-static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
-static constexpr Stage geometry_gs = sw_gs | hw_gs;
+static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
+static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
+static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
+static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tesselation evaluation before geometry */
+static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);

 enum statistic {
    statistic_hash,
@@ -1574,7 +1625,7 @@ public:
    enum radeon_family family;
    unsigned wave_size;
    RegClass lane_mask;
-   Stage stage; /* Stage */
+   Stage stage;
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
    bool wb_smem_l1_on_end = false;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 8fab8e6da48..ca8865b96ea 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1726,7 +1726,7 @@ void lower_to_hw_instr(Program* program)
             /* don't bother with an early exit near the end of the program */
             if ((block->instructions.size() - 1 - j) <= 4 &&
                 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
-               unsigned null_exp_dest = (ctx.program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
+               unsigned null_exp_dest = (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
               bool ignore_early_exit = true;

               for (unsigned k = j + 1; k < block->instructions.size(); ++k) {
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 5e0ab735802..83e5dcf0b80 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -862,7 +862,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
       }
    }

-   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
+   if ((program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) &&
+       (block->kind & block_kind_export_end)) {
       /* Try to move position exports as far up as possible, to reduce register
        * usage and because ISA reference guides say so. */
       for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
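
For reference, a minimal sketch of how call sites read with the new types.
It relies only on the definitions added to aco_ir.h above; the example
function itself is hypothetical and not part of the patch:

   #include "aco_ir.h"

   void stage_query_examples(aco::Program *program)
   {
      using namespace aco;

      /* A program runs on exactly one hardware stage, so HW-stage checks
       * become equality comparisons instead of bitmask tests: */
      if (program->stage.hw == HWStage::VS) {
         /* executes on the hardware vertex stage */
      }

      /* The software stages remain a mask (several of them can be merged
       * into one shader), queried through Stage::has(): */
      if (program->stage.has(SWStage::TES) && !program->stage.has(SWStage::GS)) {
         /* the merged shader contains TES but no GS */
      }

      /* Comparisons against the predefined Stage constants still work via
       * operator==, e.g. the GFX9 merged VS+GS: */
      if (program->stage == vertex_geometry_gs) {
         /* SWStage::VS_GS running on HWStage::GS */
      }
   }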