mirror of https://gitlab.freedesktop.org/mesa/mesa.git
intel: rework CL pre-compile
Stolen from asahi_clc :)

We drop the nasty LLVM17+ workaround code (Thanks Alyssa!)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Tested-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Dylan Baker <None>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33014>
parent 5adac011b8
commit 6768eb31e5

7 changed files with 233 additions and 601 deletions
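The heart of the rework is an API change: the SPIR-V-to-NIR helpers lose their gfx_version and llvm17_wa parameters. A minimal sketch of the prototype change, assembled from the header hunks below (the *_old name is made up here purely to show both shapes side by side):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct nir_shader nir_shader;

/* Before: callers passed the gfx version and opted into the LLVM 17
 * scratch workaround (hypothetical name, real old signature). */
nir_shader *brw_nir_from_spirv_old(void *mem_ctx, unsigned gfx_version,
                                   const uint32_t *spirv, size_t spirv_size,
                                   bool llvm17_wa);

/* After: only the SPIR-V words and their size remain. */
nir_shader *brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv,
                               size_t spirv_size);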
@@ -325,11 +325,9 @@ iris_destroy_program_cache(struct iris_context *ice)
 
 static void
 link_libintel_shaders(nir_shader *nir,
-                      const struct intel_device_info *devinfo,
                       const uint32_t *spv_code, uint32_t spv_size)
 {
-   nir_shader *libintel = brw_nir_from_spirv(nir, devinfo->ver,
-                                             spv_code, spv_size, true);
+   nir_shader *libintel = brw_nir_from_spirv(nir, spv_code, spv_size);
 
    nir_link_shader_functions(nir, libintel);
    NIR_PASS_V(nir, nir_inline_functions);
@@ -342,6 +340,7 @@ link_libintel_shaders(nir_shader *nir,
               nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
               nir_var_mem_global,
               nir_address_format_62bit_generic);
+   NIR_PASS_V(nir, nir_lower_scratch_to_var);
 }
 
 void
@@ -378,7 +377,7 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch)
 
    nir_shader *nir = b.shader;
 
-   link_libintel_shaders(nir, screen->devinfo, spv_code, spv_size);
+   link_libintel_shaders(nir, spv_code, spv_size);
 
    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
    NIR_PASS_V(nir, nir_opt_cse);
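Pieced together from the two iris hunks above, the link helper now reads roughly as follows. This is a sketch, not the exact function body: the passes between the two hunks are elided.

static void
link_libintel_shaders(nir_shader *nir,
                      const uint32_t *spv_code, uint32_t spv_size)
{
   /* Translate the precompiled SPIR-V library; no gfx version or LLVM 17
    * workaround flag is needed anymore. */
   nir_shader *libintel = brw_nir_from_spirv(nir, spv_code, spv_size);

   /* Resolve the shader's calls into the library and inline them. */
   nir_link_shader_functions(nir, libintel);
   NIR_PASS_V(nir, nir_inline_functions);

   /* (passes elided between the two hunks) */

   /* Generic pointers are lowered here, at link time, because the library
    * translation now keeps derefs intact for cross-shader function calls. */
   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
              nir_var_mem_global,
              nir_address_format_62bit_generic);
   NIR_PASS_V(nir, nir_lower_scratch_to_var);
}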
@@ -66,10 +66,6 @@ brw_kernel_from_spirv(struct brw_compiler *compiler,
                       const char *entrypoint_name,
                       char **error_str);
 
-nir_shader *
-brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version,
-                   const uint32_t *spirv, size_t spirv_size, bool llvm17_wa);
-
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
@@ -294,8 +294,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
 bool brw_nir_uses_inline_data(nir_shader *shader);
 
 nir_shader *
-brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
-                   size_t spirv_size, bool llvm17_wa);
+brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size);
 
 #ifdef __cplusplus
 }
@@ -13,167 +13,70 @@
 #include "dev/intel_debug.h"
-#include "util/u_dynarray.h"
-
-static nir_def *
-rebuild_value_from_store(struct util_dynarray *stores,
-                         nir_def *value, unsigned read_offset)
-{
-   unsigned read_size = value->num_components * value->bit_size / 8;
-
-   util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) {
-      nir_intrinsic_instr *store = *_store;
-
-      unsigned write_offset = nir_src_as_uint(store->src[1]);
-      unsigned write_size = nir_src_num_components(store->src[0]) *
-                            nir_src_bit_size(store->src[0]) / 8;
-      if (write_offset <= read_offset &&
-          (write_offset + write_size) >= (read_offset + read_size)) {
-         assert(nir_block_dominates(store->instr.block, value->parent_instr->block));
-         assert(write_size == read_size);
-         return store->src[0].ssa;
-      }
-   }
-   unreachable("Matching scratch store not found");
-}
-
-/**
- * Remove temporary variables stored to scratch to be then reloaded
- * immediately. Remap the load to the store SSA value.
- *
- * This workaround is only meant to be applied to shaders in src/intel/shaders
- * were we know there should be no issue. More complex cases might not work
- * with this approach.
- */
-static bool
-nir_remove_llvm17_scratch(nir_shader *nir)
-{
-   struct util_dynarray scratch_stores;
-   void *mem_ctx = ralloc_context(NULL);
-
-   util_dynarray_init(&scratch_stores, mem_ctx);
-
-   nir_foreach_function_impl(func, nir) {
-      nir_foreach_block(block, func) {
-         nir_foreach_instr(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-
-            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-            if (intrin->intrinsic != nir_intrinsic_store_scratch)
-               continue;
-
-            nir_const_value *offset = nir_src_as_const_value(intrin->src[1]);
-            if (offset != NULL) {
-               util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin);
-            }
-         }
-      }
-   }
-
-   bool progress = false;
-   if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) {
-      nir_foreach_function_impl(func, nir) {
-         nir_foreach_block(block, func) {
-            nir_foreach_instr_safe(instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                  continue;
-
-               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-               if (intrin->intrinsic != nir_intrinsic_load_scratch)
-                  continue;
-
-               nir_const_value *offset = nir_src_as_const_value(intrin->src[0]);
-               if (offset == NULL)
-                  continue;
-
-               nir_def_replace(&intrin->def,
-                               rebuild_value_from_store(&scratch_stores, &intrin->def, nir_src_as_uint(intrin->src[0])));
-
-               progress = true;
-            }
-         }
-      }
-   }
-
-   util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) {
-      nir_intrinsic_instr *store = *_store;
-      nir_instr_remove(&store->instr);
-   }
-
-   /* Quick sanity check */
-   assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 ||
-          progress);
-
-   ralloc_free(mem_ctx);
-
-   return progress;
-}
 
 static void
-cleanup_llvm17_scratch(nir_shader *nir)
+optimize(nir_shader *nir)
 {
-   {
-      bool progress;
-      do {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-      } while (progress);
-   }
+   bool progress;
+   do {
+      progress = false;
 
-   nir_remove_llvm17_scratch(nir);
+      NIR_PASS(progress, nir, nir_split_var_copies);
+      NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp);
+      NIR_PASS(progress, nir, nir_lower_var_copies);
+      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
 
-   {
-      bool progress;
-      do {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-      } while (progress);
-   }
+      NIR_PASS(progress, nir, nir_copy_prop);
+      NIR_PASS(progress, nir, nir_opt_remove_phis);
+      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
+      NIR_PASS(progress, nir, nir_opt_dce);
+      NIR_PASS(progress, nir, nir_opt_dead_cf);
+      NIR_PASS(progress, nir, nir_opt_cse);
+      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
+      NIR_PASS(progress, nir, nir_opt_phi_precision);
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+      NIR_PASS(progress, nir, nir_opt_deref);
+      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, nir, nir_opt_undef);
+      NIR_PASS(progress, nir, nir_lower_undef_to_zero);
+
+      NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
+      NIR_PASS(progress, nir, nir_opt_loop_unroll);
+
+   } while (progress);
 }
 
+static const struct spirv_capabilities spirv_caps = {
+   .Addresses = true,
+   .Float16 = true,
+   .Float64 = true,
+   .Groups = true,
+   .StorageImageWriteWithoutFormat = true,
+   .Int8 = true,
+   .Int16 = true,
+   .Int64 = true,
+   .Int64Atomics = true,
+   .Kernel = true,
+   .Linkage = true, /* We receive linked kernel from clc */
+   .DenormFlushToZero = true,
+   .DenormPreserve = true,
+   .SignedZeroInfNanPreserve = true,
+   .RoundingModeRTE = true,
+   .RoundingModeRTZ = true,
+   .GenericPointer = true,
+   .GroupNonUniform = true,
+   .GroupNonUniformArithmetic = true,
+   .GroupNonUniformClustered = true,
+   .GroupNonUniformBallot = true,
+   .GroupNonUniformQuad = true,
+   .GroupNonUniformShuffle = true,
+   .GroupNonUniformVote = true,
+   .SubgroupDispatch = true,
+};
+
 nir_shader *
-brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
-                   size_t spirv_size, bool llvm17_wa)
+brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size)
 {
-   assert(gfx_version >= 9);
-
-   static const struct spirv_capabilities spirv_caps = {
-      .Addresses = true,
-      .Float16 = true,
-      .Float64 = true,
-      .Groups = true,
-      .StorageImageWriteWithoutFormat = true,
-      .Int8 = true,
-      .Int16 = true,
-      .Int64 = true,
-      .Int64Atomics = true,
-      .Kernel = true,
-      .Linkage = true, /* We receive linked kernel from clc */
-      .DenormFlushToZero = true,
-      .DenormPreserve = true,
-      .SignedZeroInfNanPreserve = true,
-      .RoundingModeRTE = true,
-      .RoundingModeRTZ = true,
-      .GenericPointer = true,
-      .GroupNonUniform = true,
-      .GroupNonUniformArithmetic = true,
-      .GroupNonUniformClustered = true,
-      .GroupNonUniformBallot = true,
-      .GroupNonUniformQuad = true,
-      .GroupNonUniformShuffle = true,
-      .GroupNonUniformVote = true,
-      .SubgroupDispatch = true,
-   };
    struct spirv_to_nir_options spirv_options = {
       .environment = NIR_SPIRV_OPENCL,
       .capabilities = &spirv_caps,
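For readers unfamiliar with the idiom in the new optimize() helper above: NIR_PASS(progress, ...) sets progress to true whenever the pass changes the shader, so the loop reruns the whole pass list until a full sweep is a no-op. A minimal sketch of the same fixed-point pattern, using real NIR passes:

/* Minimal sketch of the NIR fixed-point loop used by optimize() above. */
static void
run_until_fixed_point(nir_shader *nir)
{
   bool progress;
   do {
      progress = false;                        /* assume this sweep changes nothing */
      NIR_PASS(progress, nir, nir_copy_prop);  /* flags progress on any change */
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_cse);
   } while (progress);                         /* stop once a sweep is a no-op */
}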
@@ -197,163 +100,79 @@ brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
    ralloc_steal(mem_ctx, nir);
    nir->info.name = ralloc_strdup(nir, "library");
 
-   if (INTEL_DEBUG(DEBUG_CS)) {
-      /* Re-index SSA defs so we print more sensible numbers. */
-      nir_foreach_function_impl(impl, nir) {
-         nir_index_ssa_defs(impl);
-      }
-
-      fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
-      nir_print_shader(nir, stderr);
-   }
+   nir_fixup_is_exported(nir);
 
+   NIR_PASS(_, nir, nir_lower_system_values);
+   NIR_PASS(_, nir, nir_lower_calls_to_builtins);
 
-   nir_lower_printf_options printf_opts = {
-      .ptr_bit_size = 64,
-      .use_printf_base_identifier = true,
-   };
-   NIR_PASS_V(nir, nir_lower_printf, &printf_opts);
+   NIR_PASS_V(nir, nir_lower_printf, &(const struct nir_lower_printf_options) {
+      .ptr_bit_size = 64,
+      .use_printf_base_identifier = true,
+   });
 
    NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
+   NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS(_, nir, nir_lower_returns);
+   NIR_PASS(_, nir, nir_inline_functions);
+   //nir_remove_non_exported(nir);
+   NIR_PASS(_, nir, nir_copy_prop);
+   NIR_PASS(_, nir, nir_opt_deref);
 
-   /* We have to lower away local constant initializers right before we
-    * inline functions. That way they get properly initialized at the top
-    * of the function and not at the top of its caller.
-    */
+   /* We can't deal with constant data, get rid of it */
+   nir_lower_constant_to_temp(nir);
 
-   /* We can go ahead and lower the rest of the constant initializers. We do
-    * this here so that nir_remove_dead_variables and split_per_member_structs
-    * below see the corresponding stores.
-    */
-   NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp |
-                                                      nir_var_function_temp));
-   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
-              nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
-   {
-      bool progress;
-      do
-      {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-         NIR_PASS(progress, nir, nir_opt_deref);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_undef);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-      } while (progress);
-   }
+   NIR_PASS(_, nir, nir_lower_variable_initializers, ~0);
 
-   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
-   NIR_PASS_V(nir, nir_lower_returns);
-   NIR_PASS_V(nir, nir_inline_functions);
-
-   assert(nir->scratch_size == 0);
-   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align);
-
-   {
-      bool progress;
-      do
-      {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-         NIR_PASS(progress, nir, nir_opt_deref);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_undef);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_split_var_copies);
-         NIR_PASS(progress, nir, nir_lower_var_copies);
-         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-         NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
-         NIR_PASS(progress, nir, nir_opt_dead_cf);
-         NIR_PASS(progress, nir, nir_opt_remove_phis);
-         NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
-         NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
-         NIR_PASS(progress, nir, nir_opt_memcpy);
-      } while (progress);
-   }
-
-   NIR_PASS_V(nir, nir_scale_fdiv);
-
-   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
-              nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
-
-   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
-
-   nir->scratch_size = 0;
-   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
-              nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp |
-              nir_var_mem_global | nir_var_mem_constant,
-              glsl_get_cl_type_size_align);
-
-   // Lower memcpy - needs to wait until types are sized
-   {
-      bool progress;
-      do {
-         progress = false;
-         NIR_PASS(progress, nir, nir_opt_memcpy);
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-         NIR_PASS(progress, nir, nir_opt_deref);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_split_var_copies);
-         NIR_PASS(progress, nir, nir_lower_var_copies);
-         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-      } while (progress);
-   }
-   NIR_PASS_V(nir, nir_lower_memcpy);
-
-   NIR_PASS_V(nir, nir_lower_explicit_io,
-              nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform,
-              nir_address_format_32bit_offset_as_64bit);
-
-   NIR_PASS_V(nir, nir_lower_system_values);
-
-   /* Hopefully we can drop this once lower_vars_to_ssa has improved to not
-    * lower everything to scratch.
+   /* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B
+    * aligned and so it can just read/write them as vec4s. This results in a
+    * LOT of vec4->vec3 casts on loads and stores. One solution to this
+    * problem is to get rid of all vec3 variables.
     */
-   if (llvm17_wa)
-      cleanup_llvm17_scratch(nir);
+   NIR_PASS(_, nir, nir_lower_vec3_to_vec4,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+            nir_var_mem_global | nir_var_mem_constant);
 
+   /* We assign explicit types early so that the optimizer can take advantage
+    * of that information and hopefully get rid of some of our memcpys.
+    */
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_uniform | nir_var_shader_temp | nir_var_function_temp |
+            nir_var_mem_shared | nir_var_mem_global,
+            glsl_get_cl_type_size_align);
+
+   optimize(nir);
+
+   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
+
+   /* Lower again, this time after dead-variables to get more compact variable
+    * layouts.
+    */
+   nir->global_mem_size = 0;
+   nir->scratch_size = 0;
+   nir->info.shared_size = 0;
-   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
-              nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
-              glsl_get_cl_type_size_align);
-   if (nir->constant_data_size > 0) {
-      assert(nir->constant_data == NULL);
-      nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
-      nir_gather_explicit_io_initializers(nir, nir->constant_data,
-                                          nir->constant_data_size,
-                                          nir_var_mem_constant);
-   }
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+            nir_var_mem_global | nir_var_mem_constant,
+            glsl_get_cl_type_size_align);
+   assert(nir->constant_data_size == 0);
 
-   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
-              nir_address_format_64bit_global);
+   NIR_PASS(_, nir, nir_lower_memcpy);
 
-   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
-              nir_address_format_32bit_offset_as_64bit);
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant,
+            nir_address_format_64bit_global);
 
-   NIR_PASS_V(nir, nir_lower_explicit_io,
-              nir_var_shader_temp | nir_var_function_temp |
-              nir_var_mem_shared | nir_var_mem_global,
-              nir_address_format_62bit_generic);
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
+            nir_address_format_64bit_global);
+
+   /* Note: we cannot lower explicit I/O here, because we need derefs in tact
+    * for function calls into the library to work.
+    */
 
    if (INTEL_DEBUG(DEBUG_CS)) {
       /* Re-index SSA defs so we print more sensible numbers. */
       nir_foreach_function_impl(impl, nir) {
          nir_index_ssa_defs(impl);
       }
 
      fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
      nir_print_shader(nir, stderr);
   }
+   NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
+   NIR_PASS(_, nir, nir_opt_if, 0);
+   NIR_PASS(_, nir, nir_opt_idiv_const, 16);
 
+   optimize(nir);
 
    return nir;
 }
@@ -275,8 +275,7 @@ const struct glsl_type *elk_nir_get_var_type(const struct nir_shader *nir,
 void elk_nir_adjust_payload(nir_shader *shader);
 
 nir_shader *
-elk_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
-                   size_t spirv_size, bool llvm17_wa);
+elk_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size);
 
 #ifdef __cplusplus
 }
@@ -13,167 +13,70 @@
 #include "dev/intel_debug.h"
-#include "util/u_dynarray.h"
-
-static nir_def *
-rebuild_value_from_store(struct util_dynarray *stores,
-                         nir_def *value, unsigned read_offset)
-{
-   unsigned read_size = value->num_components * value->bit_size / 8;
-
-   util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) {
-      nir_intrinsic_instr *store = *_store;
-
-      unsigned write_offset = nir_src_as_uint(store->src[1]);
-      unsigned write_size = nir_src_num_components(store->src[0]) *
-                            nir_src_bit_size(store->src[0]) / 8;
-      if (write_offset <= read_offset &&
-          (write_offset + write_size) >= (read_offset + read_size)) {
-         assert(nir_block_dominates(store->instr.block, value->parent_instr->block));
-         assert(write_size == read_size);
-         return store->src[0].ssa;
-      }
-   }
-   unreachable("Matching scratch store not found");
-}
-
-/**
- * Remove temporary variables stored to scratch to be then reloaded
- * immediately. Remap the load to the store SSA value.
- *
- * This workaround is only meant to be applied to shaders in src/intel/shaders
- * were we know there should be no issue. More complex cases might not work
- * with this approach.
- */
-static bool
-nir_remove_llvm17_scratch(nir_shader *nir)
-{
-   struct util_dynarray scratch_stores;
-   void *mem_ctx = ralloc_context(NULL);
-
-   util_dynarray_init(&scratch_stores, mem_ctx);
-
-   nir_foreach_function_impl(func, nir) {
-      nir_foreach_block(block, func) {
-         nir_foreach_instr(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-
-            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-            if (intrin->intrinsic != nir_intrinsic_store_scratch)
-               continue;
-
-            nir_const_value *offset = nir_src_as_const_value(intrin->src[1]);
-            if (offset != NULL) {
-               util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin);
-            }
-         }
-      }
-   }
-
-   bool progress = false;
-   if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) {
-      nir_foreach_function_impl(func, nir) {
-         nir_foreach_block(block, func) {
-            nir_foreach_instr_safe(instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                  continue;
-
-               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-               if (intrin->intrinsic != nir_intrinsic_load_scratch)
-                  continue;
-
-               nir_const_value *offset = nir_src_as_const_value(intrin->src[0]);
-               if (offset == NULL)
-                  continue;
-
-               nir_def_replace(&intrin->def,
-                               rebuild_value_from_store(&scratch_stores, &intrin->def, nir_src_as_uint(intrin->src[0])));
-
-               progress = true;
-            }
-         }
-      }
-   }
-
-   util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) {
-      nir_intrinsic_instr *store = *_store;
-      nir_instr_remove(&store->instr);
-   }
-
-   /* Quick sanity check */
-   assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 ||
-          progress);
-
-   ralloc_free(mem_ctx);
-
-   return progress;
-}
 
 static void
-cleanup_llvm17_scratch(nir_shader *nir)
+optimize(nir_shader *nir)
 {
-   {
-      bool progress;
-      do {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-      } while (progress);
-   }
+   bool progress;
+   do {
+      progress = false;
 
-   nir_remove_llvm17_scratch(nir);
+      NIR_PASS(progress, nir, nir_split_var_copies);
+      NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp);
+      NIR_PASS(progress, nir, nir_lower_var_copies);
+      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
 
-   {
-      bool progress;
-      do {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-      } while (progress);
-   }
+      NIR_PASS(progress, nir, nir_copy_prop);
+      NIR_PASS(progress, nir, nir_opt_remove_phis);
+      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
+      NIR_PASS(progress, nir, nir_opt_dce);
+      NIR_PASS(progress, nir, nir_opt_dead_cf);
+      NIR_PASS(progress, nir, nir_opt_cse);
+      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
+      NIR_PASS(progress, nir, nir_opt_phi_precision);
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+      NIR_PASS(progress, nir, nir_opt_deref);
+      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, nir, nir_opt_undef);
+      NIR_PASS(progress, nir, nir_lower_undef_to_zero);
+
+      NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
+      NIR_PASS(progress, nir, nir_opt_loop_unroll);
+
+   } while (progress);
 }
 
+static const struct spirv_capabilities spirv_caps = {
+   .Addresses = true,
+   .Float16 = true,
+   .Float64 = true,
+   .Groups = true,
+   .StorageImageWriteWithoutFormat = true,
+   .Int8 = true,
+   .Int16 = true,
+   .Int64 = true,
+   .Int64Atomics = true,
+   .Kernel = true,
+   .Linkage = true, /* We receive linked kernel from clc */
+   .DenormFlushToZero = true,
+   .DenormPreserve = true,
+   .SignedZeroInfNanPreserve = true,
+   .RoundingModeRTE = true,
+   .RoundingModeRTZ = true,
+   .GenericPointer = true,
+   .GroupNonUniform = true,
+   .GroupNonUniformArithmetic = true,
+   .GroupNonUniformClustered = true,
+   .GroupNonUniformBallot = true,
+   .GroupNonUniformQuad = true,
+   .GroupNonUniformShuffle = true,
+   .GroupNonUniformVote = true,
+   .SubgroupDispatch = true,
+};
+
 nir_shader *
-elk_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
-                   size_t spirv_size, bool llvm17_wa)
+elk_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size)
 {
-   assert(gfx_version < 9);
-
-   static const struct spirv_capabilities spirv_caps = {
-      .Addresses = true,
-      .Float16 = true,
-      .Float64 = true,
-      .Groups = true,
-      .StorageImageWriteWithoutFormat = true,
-      .Int8 = true,
-      .Int16 = true,
-      .Int64 = true,
-      .Int64Atomics = true,
-      .Kernel = true,
-      .Linkage = true, /* We receive linked kernel from clc */
-      .DenormFlushToZero = true,
-      .DenormPreserve = true,
-      .SignedZeroInfNanPreserve = true,
-      .RoundingModeRTE = true,
-      .RoundingModeRTZ = true,
-      .GenericPointer = true,
-      .GroupNonUniform = true,
-      .GroupNonUniformArithmetic = true,
-      .GroupNonUniformClustered = true,
-      .GroupNonUniformBallot = true,
-      .GroupNonUniformQuad = true,
-      .GroupNonUniformShuffle = true,
-      .GroupNonUniformVote = true,
-      .SubgroupDispatch = true,
-   };
    struct spirv_to_nir_options spirv_options = {
       .environment = NIR_SPIRV_OPENCL,
       .capabilities = &spirv_caps,
@@ -197,163 +100,79 @@ elk_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv,
    ralloc_steal(mem_ctx, nir);
    nir->info.name = ralloc_strdup(nir, "library");
 
-   if (INTEL_DEBUG(DEBUG_CS)) {
-      /* Re-index SSA defs so we print more sensible numbers. */
-      nir_foreach_function_impl(impl, nir) {
-         nir_index_ssa_defs(impl);
-      }
-
-      fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
-      nir_print_shader(nir, stderr);
-   }
+   nir_fixup_is_exported(nir);
 
+   NIR_PASS(_, nir, nir_lower_system_values);
+   NIR_PASS(_, nir, nir_lower_calls_to_builtins);
 
-   nir_lower_printf_options printf_opts = {
-      .ptr_bit_size = 64,
-      .use_printf_base_identifier = true,
-   };
-   NIR_PASS_V(nir, nir_lower_printf, &printf_opts);
+   NIR_PASS_V(nir, nir_lower_printf, &(const struct nir_lower_printf_options) {
+      .ptr_bit_size = 64,
+      .use_printf_base_identifier = true,
+   });
 
    NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
+   NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS(_, nir, nir_lower_returns);
+   NIR_PASS(_, nir, nir_inline_functions);
+   //nir_remove_non_exported(nir);
+   NIR_PASS(_, nir, nir_copy_prop);
+   NIR_PASS(_, nir, nir_opt_deref);
 
-   /* We have to lower away local constant initializers right before we
-    * inline functions. That way they get properly initialized at the top
-    * of the function and not at the top of its caller.
-    */
+   /* We can't deal with constant data, get rid of it */
+   nir_lower_constant_to_temp(nir);
 
-   /* We can go ahead and lower the rest of the constant initializers. We do
-    * this here so that nir_remove_dead_variables and split_per_member_structs
-    * below see the corresponding stores.
-    */
-   NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp |
-                                                      nir_var_function_temp));
-   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
-              nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
-   {
-      bool progress;
-      do
-      {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-         NIR_PASS(progress, nir, nir_opt_deref);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_undef);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-      } while (progress);
-   }
+   NIR_PASS(_, nir, nir_lower_variable_initializers, ~0);
 
-   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
-   NIR_PASS_V(nir, nir_lower_returns);
-   NIR_PASS_V(nir, nir_inline_functions);
-
-   assert(nir->scratch_size == 0);
-   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align);
-
-   {
-      bool progress;
-      do
-      {
-         progress = false;
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-         NIR_PASS(progress, nir, nir_opt_deref);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_opt_undef);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-         NIR_PASS(progress, nir, nir_split_var_copies);
-         NIR_PASS(progress, nir, nir_lower_var_copies);
-         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-         NIR_PASS(progress, nir, nir_opt_algebraic);
-         NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
-         NIR_PASS(progress, nir, nir_opt_dead_cf);
-         NIR_PASS(progress, nir, nir_opt_remove_phis);
-         NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
-         NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
-         NIR_PASS(progress, nir, nir_opt_memcpy);
-      } while (progress);
-   }
-
-   NIR_PASS_V(nir, nir_scale_fdiv);
-
-   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
-              nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
-
-   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
-
-   nir->scratch_size = 0;
-   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
-              nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp |
-              nir_var_mem_global | nir_var_mem_constant,
-              glsl_get_cl_type_size_align);
-
-   // Lower memcpy - needs to wait until types are sized
-   {
-      bool progress;
-      do {
-         progress = false;
-         NIR_PASS(progress, nir, nir_opt_memcpy);
-         NIR_PASS(progress, nir, nir_copy_prop);
-         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-         NIR_PASS(progress, nir, nir_opt_deref);
-         NIR_PASS(progress, nir, nir_opt_dce);
-         NIR_PASS(progress, nir, nir_split_var_copies);
-         NIR_PASS(progress, nir, nir_lower_var_copies);
-         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-         NIR_PASS(progress, nir, nir_opt_constant_folding);
-         NIR_PASS(progress, nir, nir_opt_cse);
-      } while (progress);
-   }
-   NIR_PASS_V(nir, nir_lower_memcpy);
-
-   NIR_PASS_V(nir, nir_lower_explicit_io,
-              nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform,
-              nir_address_format_32bit_offset_as_64bit);
-
-   NIR_PASS_V(nir, nir_lower_system_values);
-
-   /* Hopefully we can drop this once lower_vars_to_ssa has improved to not
-    * lower everything to scratch.
+   /* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B
+    * aligned and so it can just read/write them as vec4s. This results in a
+    * LOT of vec4->vec3 casts on loads and stores. One solution to this
+    * problem is to get rid of all vec3 variables.
     */
-   if (llvm17_wa)
-      cleanup_llvm17_scratch(nir);
+   NIR_PASS(_, nir, nir_lower_vec3_to_vec4,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+            nir_var_mem_global | nir_var_mem_constant);
 
+   /* We assign explicit types early so that the optimizer can take advantage
+    * of that information and hopefully get rid of some of our memcpys.
+    */
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_uniform | nir_var_shader_temp | nir_var_function_temp |
+            nir_var_mem_shared | nir_var_mem_global,
+            glsl_get_cl_type_size_align);
+
+   optimize(nir);
+
+   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
+
+   /* Lower again, this time after dead-variables to get more compact variable
+    * layouts.
+    */
+   nir->global_mem_size = 0;
+   nir->scratch_size = 0;
+   nir->info.shared_size = 0;
-   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
-              nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
-              glsl_get_cl_type_size_align);
-   if (nir->constant_data_size > 0) {
-      assert(nir->constant_data == NULL);
-      nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
-      nir_gather_explicit_io_initializers(nir, nir->constant_data,
-                                          nir->constant_data_size,
-                                          nir_var_mem_constant);
-   }
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+            nir_var_mem_global | nir_var_mem_constant,
+            glsl_get_cl_type_size_align);
+   assert(nir->constant_data_size == 0);
 
-   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
-              nir_address_format_64bit_global);
+   NIR_PASS(_, nir, nir_lower_memcpy);
 
-   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
-              nir_address_format_32bit_offset_as_64bit);
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant,
+            nir_address_format_64bit_global);
 
-   NIR_PASS_V(nir, nir_lower_explicit_io,
-              nir_var_shader_temp | nir_var_function_temp |
-              nir_var_mem_shared | nir_var_mem_global,
-              nir_address_format_62bit_generic);
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
+            nir_address_format_64bit_global);
+
+   /* Note: we cannot lower explicit I/O here, because we need derefs in tact
+    * for function calls into the library to work.
+    */
 
    if (INTEL_DEBUG(DEBUG_CS)) {
       /* Re-index SSA defs so we print more sensible numbers. */
       nir_foreach_function_impl(impl, nir) {
         nir_index_ssa_defs(impl);
      }
 
      fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
      nir_print_shader(nir, stderr);
   }
+   NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
+   NIR_PASS(_, nir, nir_opt_if, 0);
+   NIR_PASS(_, nir, nir_opt_idiv_const, 16);
 
+   optimize(nir);
 
    return nir;
 }
@@ -56,7 +56,7 @@ load_libanv(struct anv_device *device)
 
    void *mem_ctx = ralloc_context(NULL);
 
-   return brw_nir_from_spirv(mem_ctx, device->info->ver, spv_code, spv_size, true);
+   return brw_nir_from_spirv(mem_ctx, spv_code, spv_size);
 }
 
 static void
@@ -73,6 +73,7 @@ link_libanv(nir_shader *nir, const nir_shader *libanv)
               nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
               nir_var_mem_global,
               nir_address_format_62bit_generic);
+   NIR_PASS_V(nir, nir_lower_scratch_to_var);
 }
 
 static struct anv_shader_bin *
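The anv hunks mirror the iris ones: load_libanv() translates the SPIR-V library once, and link_libanv() finishes with generic-pointer lowering plus the new nir_lower_scratch_to_var pass. A hypothetical end-to-end flow, stitched from the two hunks above (build_internal_kernel_nir() is a made-up stand-in; error handling omitted):

static void
compile_internal_kernel(struct anv_device *device)
{
   /* Translate the precompiled SPIR-V library into NIR once. */
   nir_shader *libanv = load_libanv(device);   /* wraps brw_nir_from_spirv() */

   /* build_internal_kernel_nir() stands in for whatever builds the kernel
    * being compiled (hypothetical helper). */
   nir_shader *nir = build_internal_kernel_nir(device);

   /* Link the library in; per the hunk above, this ends by lowering generic
    * I/O and running nir_lower_scratch_to_var. */
   link_libanv(nir, libanv);
}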