diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 91db85bc14f..c3841aea5dc 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -1689,25 +1689,16 @@ agx_lower_front_face(struct nir_builder *b, nir_instr *instr, UNUSED void *data)
    return true;
 }
 
+/*
+ * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
+ * again at shader variant compile time. Unless there was a complex shader key,
+ * the latter run should be almost a no-op.
+ */
 static void
-agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
+agx_optimize_loop_nir(nir_shader *nir)
 {
    bool progress;
 
-   nir_lower_idiv_options idiv_options = {
-      .allow_fp16 = true,
-   };
-
-   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
-   NIR_PASS_V(nir, nir_lower_int64);
-   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
-   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
-   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
-   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
-   NIR_PASS_V(nir, agx_lower_sincos);
-   NIR_PASS_V(nir, nir_shader_instructions_pass, agx_lower_front_face,
-              nir_metadata_block_index | nir_metadata_dominance, NULL);
-
    do {
       progress = false;
 
@@ -1730,6 +1721,12 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
 
       NIR_PASS(progress, nir, nir_opt_loop_unroll);
    } while (progress);
+}
+
+static void
+agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
+{
+   agx_optimize_loop_nir(nir);
 
    NIR_PASS_V(nir, agx_nir_lower_address);
    NIR_PASS_V(nir, nir_lower_int64);
@@ -2003,6 +2000,17 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
 /*
  * Preprocess NIR. In particular, this lowers I/O. Drivers should call this
  * as soon as they don't need unlowered I/O.
+ *
+ * This also lowers as much as possible. After preprocessing NIR, the following
+ * NIR passes are called by the GL driver:
+ *
+ * - nir_lower_blend
+ * - nir_lower_texcoord_replace_late
+ * - agx_nir_lower_vbo
+ * - agx_nir_lower_tilebuffer
+ *
+ * Unless an instruction is constructed by one of the above passes, it should be
+ * lowered here to avoid duplicate work with shader variants.
  */
 void
 agx_preprocess_nir(nir_shader *nir)
@@ -2042,9 +2050,6 @@ agx_preprocess_nir(nir_shader *nir)
                  ~agx_fp32_varying_mask(nir), false);
    }
 
-   NIR_PASS_V(nir, agx_nir_lower_ubo);
-   NIR_PASS_V(nir, nir_lower_ssbo);
-
    /* Varying output is scalar, other I/O is vector */
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
@@ -2054,7 +2059,39 @@ agx_preprocess_nir(nir_shader *nir)
    NIR_PASS_V(nir, nir_opt_dce);
    NIR_PASS_V(nir, agx_nir_lower_texture);
 
+   nir_lower_idiv_options idiv_options = {
+      .allow_fp16 = true,
+   };
+
+   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
+   NIR_PASS_V(nir, nir_lower_int64);
+   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
+   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
+   NIR_PASS_V(nir, agx_lower_sincos);
+   NIR_PASS_V(nir, nir_shader_instructions_pass, agx_lower_front_face,
+              nir_metadata_block_index | nir_metadata_dominance, NULL);
+
+   /* After lowering, run through the standard suite of NIR optimizations. We
+    * will run through the loop later, once we have the shader key, but if we
+    * run now, that run will ideally be almost a no-op.
+    */
+   agx_optimize_loop_nir(nir);
+
+   /* We've lowered away all variables. Remove them all for smaller shaders. */
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
    nir->info.io_lowered = true;
+
+   /* Move before lowering */
+   nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
+                               nir_move_load_input | nir_move_comparisons |
+                               nir_move_copies | nir_move_load_ssbo;
+
+   NIR_PASS_V(nir, nir_opt_sink, move_all);
+   NIR_PASS_V(nir, nir_opt_move, move_all);
+   NIR_PASS_V(nir, agx_nir_lower_ubo);
+   NIR_PASS_V(nir, nir_lower_ssbo);
 }
 
 void
@@ -2090,6 +2127,13 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
       out->depth_layout = layout;
    }
 
+   /* Late blend lowering creates vectors */
+   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+
+   /* Late VBO lowering creates constant udiv instructions */
+   NIR_PASS_V(nir, nir_opt_idiv_const, 16);
+
    out->push_count = key->reserved_preamble;
    agx_optimize_nir(nir, &out->push_count);
 
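Note on the split this patch documents: key-independent lowering now happens once
in agx_preprocess_nir, and a variant compile only runs the key-dependent passes
plus a re-run of agx_optimize_loop_nir that should be nearly free. A rough
sketch of the intended driver-side flow follows; link_shader and compile_variant
are hypothetical names for illustration, the includes are abbreviated, and only
agx_preprocess_nir, nir_shader_clone, and ralloc_free are real API here:

#include "nir.h"
#include "agx_compile.h"

/* Called once per shader, as soon as unlowered I/O is no longer needed. */
static void
link_shader(nir_shader *nir)
{
   agx_preprocess_nir(nir);
}

/* Called once per shader key, possibly many times for the same shader. */
static void
compile_variant(const nir_shader *base, struct agx_shader_key *key)
{
   /* Each variant clones the preprocessed shader, so everything done in
    * agx_preprocess_nir is paid for exactly once per shader.
    */
   nir_shader *nir = nir_shader_clone(NULL, base);

   /* The key-dependent passes listed in the comment above would run here:
    * nir_lower_blend / agx_nir_lower_tilebuffer for fragment shaders, or
    * agx_nir_lower_vbo for vertex shaders, followed by
    * agx_compile_shader_nir(nir, key, ...), which re-runs the optimization
    * loop to clean up whatever those passes emitted.
    */

   ralloc_free(nir);
}

Cloning per variant is what makes it profitable to hoist every key-independent
pass into the preprocess step.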
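On the new nir_opt_idiv_const call: agx_nir_lower_vbo emits udiv-by-constant
instructions (e.g. dividing the instance ID by a constant instance divisor), and
nir_opt_idiv_const strength-reduces those to a multiply-high plus shift. Below
is a self-contained sketch of the underlying transform for one divisor; the
0xAAAAAAAB magic constant is the standard fixed-point reciprocal for 6, and this
illustrates the technique rather than NIR's exact lowering:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* x / 6 without a hardware divide: multiply by ceil(2^34 / 6) = 0xAAAAAAAB
 * and shift right by 34. The rounding error (M*6 - 2^34 = 2) is small enough
 * that the result is exact for every 32-bit x, which is what makes the
 * rewrite legal.
 */
static uint32_t
div6(uint32_t x)
{
   return (uint32_t)(((uint64_t)x * 0xAAAAAAABull) >> 34);
}

int
main(void)
{
   /* Sample the full 32-bit range with a prime stride as a sanity check. */
   for (uint64_t x = 0; x <= UINT32_MAX; x += 9973)
      assert(div6((uint32_t)x) == (uint32_t)x / 6);

   puts("exact");
   return 0;
}

The 16 passed to nir_opt_idiv_const is the minimum bit size the pass will
rewrite, so 16- and 32-bit constant divisions get this treatment.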