diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 95d62f6b0f1..1feb9b1cc90 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1311,8 +1311,10 @@ schedule_program(Program* program)
    RegisterDemand demand;
    for (Block& block : program->blocks)
       demand.update(block.register_demand);
-   demand.vgpr += program->config->num_shared_vgprs / 2;
-   demand.update(program->fixed_reg_demand);
+
+   RegisterDemand usage = demand;
+   usage.vgpr += program->config->num_shared_vgprs / 2;
+   usage.update(program->fixed_reg_demand);
 
    sched_ctx ctx;
    ctx.gfx_level = program->gfx_level;
@@ -1326,7 +1328,7 @@ schedule_program(Program* program)
    /* If we already have less waves than the minimum, don't reduce them further.
     * Otherwise, sacrifice some waves and use more VGPRs, in order to improve scheduling.
     */
-   int vgpr_demand = std::max<int>(24, demand.vgpr) + 12 * reg_file_multiple;
+   int vgpr_demand = std::max<int>(24, usage.vgpr) + 12 * reg_file_multiple;
    int target_waves = std::max(wave_minimum, program->dev.physical_vgprs / vgpr_demand);
    target_waves = max_suitable_waves(program, std::min<int>(program->num_waves, target_waves));
    assert(target_waves >= program->min_waves);
@@ -1334,6 +1336,14 @@ schedule_program(Program* program)
    ctx.mv.max_registers = get_addr_regs_from_waves(program, target_waves);
    ctx.mv.max_registers.vgpr -= 2;
 
+   /* If this is a callee, don't use unneeded preserved VGPRs. */
+   if (program->is_callee) {
+      RegisterDemand limit = get_addr_regs_from_waves(program, program->min_waves);
+      RegisterDemand max_clobbered_regs = program->callee_abi.numClobbered(limit);
+      ctx.mv.max_registers.vgpr = std::min(ctx.mv.max_registers.vgpr, max_clobbered_regs.vgpr);
+      ctx.mv.max_registers.update(demand);
+   }
+
    /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
    ctx.occupancy_factor = target_waves / wave_factor;