From 86c227c10cb729aef05bc8a26b56957e3fa7683b Mon Sep 17 00:00:00 2001
From: Tony Wasserka
Date: Mon, 5 Oct 2020 17:50:37 +0200
Subject: [PATCH] aco: Use strong typing to model SW<->HW stage mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Timur Kristóf
Acked-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_assembler.cpp            |   9 +-
 src/amd/compiler/aco_insert_exec_mask.cpp     |   2 +-
 .../compiler/aco_instruction_selection.cpp    |  41 +++---
 src/amd/compiler/aco_instruction_selection.h  |   2 +-
 .../aco_instruction_selection_setup.cpp       | 103 +++++++------
 src/amd/compiler/aco_ir.h                     | 127 ++++++++++++------
 src/amd/compiler/aco_lower_to_hw_instr.cpp    |   2 +-
 src/amd/compiler/aco_scheduler.cpp            |   3 +-
 8 files changed, 174 insertions(+), 115 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 755bed09e1a..24cb39e656c 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -732,7 +732,7 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
       {
          if ((*it)->format == Format::EXP) {
             Export_instruction* exp = static_cast<Export_instruction*>((*it).get());
-            if (program->stage & (hw_vs | hw_ngg_gs)) {
+            if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) {
                if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) {
                   exp->done = true;
                   exported = true;
@@ -752,7 +752,8 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)

    if (!exported) {
       /* Abort in order to avoid a GPU hang. */
-      aco_err(program, "Missing export in %s shader:", (program->stage & (hw_vs | hw_ngg_gs)) ? "vertex or NGG" : "fragment");
+      bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS);
+      aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
       aco_print_program(program, stderr);
       abort();
    }
@@ -917,7 +918,9 @@ unsigned emit_program(Program* program,
 {
    asm_context ctx(program);

-   if (program->stage & (hw_vs | hw_fs | hw_ngg_gs))
+   if (program->stage.hw == HWStage::VS ||
+       program->stage.hw == HWStage::FS ||
+       program->stage.hw == HWStage::NGG_GS)
       fix_exports(ctx, code, program);

    for (Block& block : program->blocks) {
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index ebd44ade4cd..c6070e7531e 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -379,7 +379,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       bld.insert(std::move(startpgm));

       /* exec seems to need to be manually initialized with combined shaders */
-      if (util_bitcount(ctx.program->stage & sw_mask) > 1 || (ctx.program->stage & hw_ngg_gs)) {
+      if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG_GS) {
          bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
         instructions[0]->definitions.pop_back();
       }
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index c93032b495c..babd769aa42 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4252,7 +4252,7 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
    if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs ||
        ctx->stage == ngg_vertex_geometry_gs || ctx->stage == ngg_tess_eval_geometry_gs) {
       /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
-      unsigned itemsize = (ctx->stage & sw_vs)
+      unsigned itemsize = ctx->stage.has(SWStage::VS)
                           ? ctx->program->info->vs.es_info.esgs_itemsize
                           : ctx->program->info->tes.es_info.esgs_itemsize;
       Temp vertex_idx = thread_id_in_threadgroup(ctx);
@@ -4363,9 +4363,9 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
          abort();
       }
-   } else if ((ctx->stage & (hw_ls | hw_es)) ||
+   } else if ((ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::ES) ||
               (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
-              ((ctx->stage & sw_gs) && ctx->shader->info.stage != MESA_SHADER_GEOMETRY)) {
+              (ctx->stage.has(SWStage::GS) && ctx->shader->info.stage != MESA_SHADER_GEOMETRY)) {
      visit_store_ls_or_es_output(ctx, instr);
    } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
      visit_store_tcs_output(ctx, instr, false);
@@ -7588,7 +7588,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_load_view_index: {
-      if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
+      if (ctx->stage.has(SWStage::VS) ||
+          ctx->stage.has(SWStage::GS) ||
+          ctx->stage.has(SWStage::TCS) ||
+          ctx->stage.has(SWStage::TES)) {
          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
          break;
@@ -8348,21 +8351,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_emit_vertex_with_counter: {
-      if (ctx->stage & hw_ngg_gs)
+      if (ctx->stage.hw == HWStage::NGG_GS)
         ngg_visit_emit_vertex_with_counter(ctx, instr);
      else
         visit_emit_vertex_with_counter(ctx, instr);
      break;
    }
    case nir_intrinsic_end_primitive_with_counter: {
-      if ((ctx->stage & hw_ngg_gs) == 0) {
+      if (ctx->stage.hw != HWStage::NGG_GS) {
         unsigned stream = nir_intrinsic_stream_id(instr);
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
      }
      break;
    }
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (ctx->stage & hw_ngg_gs)
+      if (ctx->stage.hw == HWStage::NGG_GS)
         ngg_visit_set_vertex_and_primitive_count(ctx, instr);
      /* unused in the legacy pipeline, the HW keeps track of this for us */
      break;
@@ -10079,9 +10082,9 @@ static bool visit_cf_list(isel_context *ctx,

 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
 {
-   assert(ctx->stage & (hw_vs | hw_ngg_gs));
+   assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG_GS);

-   int offset = ((ctx->stage & sw_tes) && !(ctx->stage & sw_gs))
+   int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
                ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
                : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
    uint64_t mask = ctx->outputs.mask[slot];
@@ -10176,15 +10179,15 @@ static void create_export_phis(isel_context *ctx)

 static void create_vs_exports(isel_context *ctx)
 {
-   assert(ctx->stage & (hw_vs | hw_ngg_gs));
+   assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG_GS);

-   radv_vs_output_info *outinfo = ((ctx->stage & sw_tes) && !(ctx->stage & sw_gs))
+   radv_vs_output_info *outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
                                   ? &ctx->program->info->tes.outinfo
                                   : &ctx->program->info->vs.outinfo;

-   if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
+   if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG_GS) {
       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
-      if (ctx->stage & sw_tes)
+      if (ctx->stage.has(SWStage::TES))
          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.tes_patch_id);
       else
          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
@@ -10646,7 +10649,7 @@ static void emit_stream_output(isel_context *ctx,
    Temp out[4];
    bool all_undef = true;
-   assert(ctx->stage & hw_vs);
+   assert(ctx->stage.hw == HWStage::VS);
    for (unsigned i = 0; i < num_comps; i++) {
       out[i] = ctx->outputs.temps[loc * 4 + start + i];
       all_undef = all_undef && !out[i].id();
    }
@@ -11055,7 +11058,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Temp prm_cnt = Temp())

    /* VS/TES: we infer the vertex and primitive count from arguments
     * GS: the caller needs to supply them */
-   assert((ctx->stage & sw_gs)
+   assert(ctx->stage.has(SWStage::GS)
           ? (vtx_cnt.id() && prm_cnt.id())
           : (!vtx_cnt.id() && !prm_cnt.id()));

@@ -11117,7 +11120,7 @@ void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive
    Builder bld(ctx->program, ctx->block);
    Temp prim_exp_arg;

-   if (!(ctx->stage & sw_gs) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
+   if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
       prim_exp_arg = get_arg(ctx, ctx->args->gs_vtx_offset[0]);
    else
       prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null);
@@ -11144,7 +11147,7 @@ void ngg_nogs_export_primitives(isel_context *ctx)
    constexpr unsigned max_vertices_per_primitive = 3;
    unsigned num_vertices_per_primitive = max_vertices_per_primitive;

-   assert(!(ctx->stage & sw_gs));
+   assert(!ctx->stage.has(SWStage::GS));

    if (ctx->stage == ngg_vertex_gs) {
       /* TODO: optimize for points & lines */
@@ -11711,10 +11714,10 @@ void select_program(Program *program,

       visit_cf_list(&ctx, &func->body);

-      if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
+      if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
          emit_streamout(&ctx, 0);

-      if (ctx.stage & hw_vs) {
+      if (ctx.stage.hw == HWStage::VS) {
          create_vs_exports(&ctx);
          ctx.block->kind |= block_kind_export_end;
       } else if (ngg_no_gs && ctx.ngg_nogs_early_prim_export) {
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 5aa7f0a3599..107d116d9fd 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -61,7 +61,7 @@ struct isel_context {
    Block *block;
    std::unique_ptr<Temp[]> allocated;
    std::unordered_map<unsigned, std::array<Temp, NIR_MAX_VEC_COMPONENTS>> allocated_vec;
-   Stage stage; /* Stage */
+   Stage stage;
    bool has_gfx10_wave64_bpermute = false;
    struct {
       bool has_branch;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index cc5b78f0bb0..c119c6eaeeb 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -435,7 +435,7 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
                             ctx->options->key.vs_common_out.export_clip_dists, outinfo);

    /* TODO: NGG streamout */
-   if (ctx->stage & hw_ngg_gs)
+   if (ctx->stage.hw == HWStage::NGG_GS)
       assert(!ctx->args->shader_info->so.num_outputs);

    /* TODO: check if the shader writes edge flags (not in Vulkan) */
@@ -481,9 +481,9 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
       ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
    }

-   if (ctx->stage & sw_vs)
+   if (ctx->stage.has(SWStage::VS))
       ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
-   else if (ctx->stage & sw_tes)
+   else if (ctx->stage.has(SWStage::TES))
       ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
 }

@@ -550,7 +550,7 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir)
                             ctx->options->key.vs_common_out.export_clip_dists, outinfo);

    /* TODO: NGG streamout */
-   if (ctx->stage & hw_ngg_gs)
+   if (ctx->stage.hw == HWStage::NGG_GS)
       assert(!ctx->args->shader_info->so.num_outputs);

    /* Tess eval shaders can't write edge flags, so this can be always true. */
@@ -644,7 +644,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
    /* we'll need this for isel */
    nir_metadata_require(impl, nir_metadata_block_index);

-   if (!(ctx->stage & sw_gs_copy) && ctx->options->dump_preoptir) {
+   if (!ctx->stage.has(SWStage::GSCopy) && ctx->options->dump_preoptir) {
       fprintf(stderr, "NIR shader before instruction selection:\n");
       nir_print_shader(shader, stderr);
    }
@@ -1022,26 +1022,26 @@ setup_isel_context(Program* program,
                    struct radv_shader_args *args,
                    bool is_gs_copy_shader)
 {
-   Stage stage = 0;
+   SWStage sw_stage = SWStage::None;
    for (unsigned i = 0; i < shader_count; i++) {
       switch (shaders[i]->info.stage) {
       case MESA_SHADER_VERTEX:
-         stage |= sw_vs;
+         sw_stage = sw_stage | SWStage::VS;
          break;
       case MESA_SHADER_TESS_CTRL:
-         stage |= sw_tcs;
+         sw_stage = sw_stage | SWStage::TCS;
          break;
       case MESA_SHADER_TESS_EVAL:
-         stage |= sw_tes;
+         sw_stage = sw_stage | SWStage::TES;
          break;
       case MESA_SHADER_GEOMETRY:
-         stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
+         sw_stage = sw_stage | (is_gs_copy_shader ? SWStage::GSCopy : SWStage::GS);
          break;
       case MESA_SHADER_FRAGMENT:
-         stage |= sw_fs;
+         sw_stage = sw_stage | SWStage::FS;
          break;
       case MESA_SHADER_COMPUTE:
-         stage |= sw_cs;
+         sw_stage = sw_stage | SWStage::CS;
          break;
       default:
          unreachable("Shader stage not implemented");
@@ -1049,44 +1049,45 @@ setup_isel_context(Program* program,
       }
    }
    bool gfx9_plus = args->options->chip_class >= GFX9;
    bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
-   if (stage == sw_vs && args->shader_info->vs.as_es && !ngg)
-      stage |= hw_es;
-   else if (stage == sw_vs && !args->shader_info->vs.as_ls && !ngg)
-      stage |= hw_vs;
-   else if (stage == sw_vs && ngg)
-      stage |= hw_ngg_gs; /* GFX10/NGG: VS without GS uses the HW GS stage */
-   else if (stage == sw_gs)
-      stage |= hw_gs;
-   else if (stage == sw_fs)
-      stage |= hw_fs;
-   else if (stage == sw_cs)
-      stage |= hw_cs;
-   else if (stage == sw_gs_copy)
-      stage |= hw_vs;
-   else if (stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
-      stage |= hw_gs; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
-   else if (stage == (sw_vs | sw_gs) && ngg)
-      stage |= hw_ngg_gs; /* GFX10+: VS+GS merged into an NGG GS */
-   else if (stage == sw_vs && args->shader_info->vs.as_ls)
-      stage |= hw_ls; /* GFX6-8: VS is a Local Shader, when tessellation is used */
-   else if (stage == sw_tcs)
-      stage |= hw_hs; /* GFX6-8: TCS is a Hull Shader */
-   else if (stage == (sw_vs | sw_tcs))
-      stage |= hw_hs; /* GFX9-10: VS+TCS merged into a Hull Shader */
-   else if (stage == sw_tes && !args->shader_info->tes.as_es && !ngg)
-      stage |= hw_vs; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
-   else if (stage == sw_tes && !args->shader_info->tes.as_es && ngg)
-      stage |= hw_ngg_gs; /* GFX10/NGG: TES without GS uses the HW GS stage */
-   else if (stage == sw_tes && args->shader_info->tes.as_es && !ngg)
-      stage |= hw_es; /* GFX6-8: TES is an Export Shader */
-   else if (stage == (sw_tes | sw_gs) && gfx9_plus && !ngg)
-      stage |= hw_gs; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
-   else if (stage == (sw_tes | sw_gs) && ngg)
-      stage |= hw_ngg_gs; /* GFX10+: TES+GS merged into an NGG GS */
+   HWStage hw_stage { };
+   if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg)
+      hw_stage = HWStage::ES;
+   else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg)
+      hw_stage = HWStage::VS;
+   else if (sw_stage == SWStage::VS && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10/NGG: VS without GS uses the HW GS stage */
+   else if (sw_stage == SWStage::GS)
+      hw_stage = HWStage::GS;
+   else if (sw_stage == SWStage::FS)
+      hw_stage = HWStage::FS;
+   else if (sw_stage == SWStage::CS)
+      hw_stage = HWStage::CS;
+   else if (sw_stage == SWStage::GSCopy)
+      hw_stage = HWStage::VS;
+   else if (sw_stage == SWStage::VS_GS && gfx9_plus && !ngg)
+      hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
+   else if (sw_stage == SWStage::VS_GS && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10+: VS+GS merged into an NGG GS */
+   else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls)
+      hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */
+   else if (sw_stage == SWStage::TCS)
+      hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */
+   else if (sw_stage == SWStage::VS_TCS)
+      hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */
+   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg)
+      hw_stage = HWStage::VS; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
+   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10/NGG: TES without GS uses the HW GS stage */
+   else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg)
+      hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */
+   else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg)
+      hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
+   else if (sw_stage == SWStage::TES_GS && ngg)
+      hw_stage = HWStage::NGG_GS; /* GFX10+: TES+GS merged into an NGG GS */
    else
       unreachable("Shader stage not implemented");

-   init_program(program, stage, args->shader_info,
+   init_program(program, Stage { hw_stage, sw_stage }, args->shader_info,
                 args->options->chip_class, args->options->family, config);

    isel_context ctx = {};
@@ -1096,7 +1097,7 @@ setup_isel_context(Program* program,
    ctx.stage = program->stage;

    /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
-   if (program->stage & (hw_vs | hw_fs)) {
+   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) {
       /* PS and legacy VS have separate waves, no workgroups */
       program->workgroup_size = program->wave_size;
    } else if (program->stage == compute_cs) {
@@ -1104,10 +1105,10 @@ setup_isel_context(Program* program,
       program->workgroup_size = shaders[0]->info.cs.local_size[0] *
                                 shaders[0]->info.cs.local_size[1] *
                                 shaders[0]->info.cs.local_size[2];
-   } else if ((program->stage & hw_es) || program->stage == geometry_gs) {
+   } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) {
       /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */
       program->workgroup_size = program->wave_size;
-   } else if (program->stage & hw_gs) {
+   } else if (program->stage.hw == HWStage::GS) {
       /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
       assert(program->chip_class >= GFX9);
       uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
@@ -1125,7 +1126,7 @@ setup_isel_context(Program* program,
       /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */
       setup_tcs_info(&ctx, shaders[1], shaders[0]);
       program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
-   } else if (program->stage & hw_ngg_gs) {
+   } else if (program->stage.hw == HWStage::NGG_GS) {
       gfx10_ngg_info &ngg_info = args->shader_info->ngg_info;

       /* Max ES (SW VS) threads */
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index d0df37bc6a5..d17a4fb99b5 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1500,50 +1500,101 @@ struct Block {
    Block() : index(0) {}
 };

-using Stage = uint16_t;
-
-/* software stages */
-static constexpr Stage sw_vs = 1 << 0;
-static constexpr Stage sw_gs = 1 << 1;
-static constexpr Stage sw_tcs = 1 << 2;
-static constexpr Stage sw_tes = 1 << 3;
-static constexpr Stage sw_fs = 1 << 4;
-static constexpr Stage sw_cs = 1 << 5;
-static constexpr Stage sw_gs_copy = 1 << 6;
-static constexpr Stage sw_mask = 0x7f;
-
-/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
-static constexpr Stage hw_vs = 1 << 7;
-static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
-static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
-static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
-static constexpr Stage hw_fs = 1 << 13;
-static constexpr Stage hw_cs = 1 << 14;
-static constexpr Stage hw_mask = 0xff << 7;
+/*
+ * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
+ */
+enum class SWStage : uint8_t {
+   None = 0,
+   VS = 1 << 0,     /* Vertex Shader */
+   GS = 1 << 1,     /* Geometry Shader */
+   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
+   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
+   FS = 1 << 4,     /* Fragment aka Pixel Shader */
+   CS = 1 << 5,     /* Compute Shader */
+   GSCopy = 1 << 6, /* GS Copy Shader (internal) */
+
+   /* Stage combinations merged to run on a single HWStage */
+   VS_GS = VS | GS,
+   VS_TCS = VS | TCS,
+   TES_GS = TES | GS,
+};
+
+constexpr SWStage operator|(SWStage a, SWStage b) {
+   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
+}
+
+/*
+ * Shader stages as running on the AMD GPU.
+ *
+ * The relation between HWStages and SWStages is not a one-to-one mapping:
+ * Some SWStages are merged by ACO to run on a single HWStage.
+ * See README.md for details.
+ */
+enum class HWStage : uint8_t {
+   VS,
+   ES,     /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
+   GS,     /* Geometry shader on GFX10/legacy and GFX6-9. */
+   NGG_GS, /* Geometry shader on GFX10/NGG. */
+   LS,     /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
+   HS,     /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
+   FS,
+   CS,
+};
+
+/*
+ * Set of SWStages to be merged into a single shader paired with the
+ * HWStage it will run on.
+ */
+struct Stage {
+   constexpr Stage() = default;
+
+   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { }
+
+   /* Check if the given SWStage is included */
+   constexpr bool has(SWStage stage) const {
+      return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
+   }
+
+   unsigned num_sw_stages() const {
+      return util_bitcount(static_cast<uint8_t>(sw));
+   }
+
+   constexpr bool operator==(const Stage& other) const {
+      return sw == other.sw && hw == other.hw;
+   }
+
+   constexpr bool operator!=(const Stage& other) const {
+      return sw != other.sw || hw != other.hw;
+   }
+
+   /* Mask of merged software stages */
+   SWStage sw = SWStage::None;
+
+   /* Active hardware stage */
+   HWStage hw {};
+};

 /* possible settings of Program::stage */
-static constexpr Stage vertex_vs = sw_vs | hw_vs;
-static constexpr Stage fragment_fs = sw_fs | hw_fs;
-static constexpr Stage compute_cs = sw_cs | hw_cs;
-static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
-static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
+static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
+static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
+static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
+static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
+static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
 /* GFX10/NGG */
-static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
-static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
-static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
-static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
+static constexpr Stage ngg_vertex_gs(HWStage::NGG_GS, SWStage::VS);
+static constexpr Stage ngg_vertex_geometry_gs(HWStage::NGG_GS, SWStage::VS_GS);
+static constexpr Stage ngg_tess_eval_gs(HWStage::NGG_GS, SWStage::TES);
+static constexpr Stage ngg_tess_eval_geometry_gs(HWStage::NGG_GS, SWStage::TES_GS);
 /* GFX9 (and GFX10 if NGG isn't used) */
-static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
-static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
-static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
+static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
+static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
+static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
 /* pre-GFX9 */
-static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
-static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
-static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
-static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
-static constexpr Stage geometry_gs = sw_gs | hw_gs;
+static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
+static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
+static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
+static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tesselation evaluation before geometry */
+static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);

 enum statistic {
    statistic_hash,
@@ -1574,7 +1625,7 @@ public:
    enum radeon_family family;
    unsigned wave_size;
    RegClass lane_mask;
-   Stage stage; /* Stage */
+   Stage stage;
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
    bool wb_smem_l1_on_end = false;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 8fab8e6da48..ca8865b96ea 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1726,7 +1726,7 @@ void lower_to_hw_instr(Program* program)
             /* don't bother with an early exit near the end of the program */
             if ((block->instructions.size() - 1 - j) <= 4 &&
                 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
-               unsigned null_exp_dest = (ctx.program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
+               unsigned null_exp_dest = (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
               bool ignore_early_exit = true;

               for (unsigned k = j + 1; k < block->instructions.size(); ++k) {
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 5e0ab735802..83e5dcf0b80 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -862,7 +862,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
       }
    }

-   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
+   if ((program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) &&
+       (block->kind & block_kind_export_end)) {
       /* Try to move position exports as far up as possible, to reduce register
        * usage and because ISA reference guides say so. */
       for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
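
For reference, a minimal sketch of how call sites read with the new types.
It relies only on the definitions added to aco_ir.h above; the example
function itself is hypothetical and not part of the patch:

   #include "aco_ir.h"

   void stage_query_examples(aco::Program *program)
   {
      using namespace aco;

      /* A program runs on exactly one hardware stage, so HW-stage checks
       * become equality comparisons instead of bitmask tests: */
      if (program->stage.hw == HWStage::VS) {
         /* executes on the hardware vertex stage */
      }

      /* The software stages remain a mask (several of them can be merged
       * into one shader), queried through Stage::has(): */
      if (program->stage.has(SWStage::TES) && !program->stage.has(SWStage::GS)) {
         /* the merged shader contains TES but no GS */
      }

      /* Comparisons against the predefined Stage constants still work via
       * operator==, e.g. the GFX9 merged VS+GS: */
      if (program->stage == vertex_geometry_gs) {
         /* SWStage::VS_GS running on HWStage::GS */
      }
   }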