aco: align scratch size during assembly

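This lets us use less scratch if both VGPR spilling and scratch intrinsics are used. Previously the shader scratch size and the VGPR spill size were each rounded up to the scratch allocation granule before being added together; now the total is rounded up once, at the end of emit_program().

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20534>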
2025-12-24 21:50:12 +01:00 · 2023-01-05 14:01:21 +00:00 · 2023-01-05 14:01:21 +00:00 · 810ced93f3
commit 810ced93f3
parent c9846158cd
3 changed files with 5 additions and 4 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -1154,6 +1154,9 @@ emit_program(Program* program, std::vector<uint32_t>& code)
   code.insert(code.end(), (uint32_t*)program->constant_data.data(),
               (uint32_t*)(program->constant_data.data() + program->constant_data.size()));

+   program->config->scratch_bytes_per_wave = align(
+      program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule);
+
   return exec_size;
 }

--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -903,8 +903,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
   for (unsigned i = 0; i < shader_count; i++)
      scratch_size = std::max(scratch_size, shaders[i]->scratch_size);

-   ctx.program->config->scratch_bytes_per_wave =
-      align(scratch_size * ctx.program->wave_size, ctx.program->dev.scratch_alloc_granule);
+   ctx.program->config->scratch_bytes_per_wave = scratch_size * ctx.program->wave_size;

   unsigned nir_num_blocks = 0;
   for (unsigned i = 0; i < shader_count; i++)
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1856,8 +1856,7 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
   }

   /* update required scratch memory */
-   ctx.program->config->scratch_bytes_per_wave += align(
-      ctx.vgpr_spill_slots * 4 * ctx.program->wave_size, ctx.program->dev.scratch_alloc_granule);
+   ctx.program->config->scratch_bytes_per_wave += ctx.vgpr_spill_slots * 4 * ctx.program->wave_size;

   /* SSA elimination inserts copies for logical phis right before p_logical_end
    * So if a linear vgpr is used between that p_logical_end and the branch,