diff --git a/src/gallium/drivers/d3d12/d3d12_compiler.cpp b/src/gallium/drivers/d3d12/d3d12_compiler.cpp
index 143cfa9066c..8ade408bc7d 100644
--- a/src/gallium/drivers/d3d12/d3d12_compiler.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_compiler.cpp
@@ -117,8 +117,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
 
    NIR_PASS(_, nir, d3d12_lower_state_vars, shader);
 
-   const struct dxil_nir_lower_loads_stores_options loads_stores_options = {};
-   NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
+   NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
 
    if (key->stage == MESA_SHADER_FRAGMENT && key->fs.multisample_disabled)
       NIR_PASS(_, nir, d3d12_disable_multisampling);
diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c
index 525c96f023c..e687951f509 100644
--- a/src/microsoft/clc/clc_compiler.c
+++ b/src/microsoft/clc/clc_compiler.c
@@ -1109,9 +1109,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
             nir_address_format_32bit_index_offset);
    NIR_PASS(_, nir, clc_nir_lower_system_values, work_properties_var);
-   const struct dxil_nir_lower_loads_stores_options loads_stores_options = {
-      .use_16bit_ssbo = false,
-   };
 
    /* Now that function-declared local vars have been sized, append args */
    for (unsigned i = 0; i < out_dxil->kernel->num_args; i++) {
@@ -1140,7 +1137,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
       nir->info.shared_size += size;
    }
 
-   NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
+   NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
    NIR_PASS(_, nir, dxil_nir_opt_alu_deref_srcs);
    NIR_PASS(_, nir, nir_lower_fp16_casts, nir_lower_fp16_all);
    NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index 9e4f56fb360..3e754c90843 100644
--- a/src/microsoft/compiler/dxil_nir.c
+++ b/src/microsoft/compiler/dxil_nir.c
@@ -39,166 +39,6 @@ cl_type_size_align(const struct glsl_type *type, unsigned *size,
    *align = glsl_get_cl_alignment(type);
 }
 
-static nir_def *
-load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
-                  nir_def **src_comps, unsigned num_src_comps,
-                  unsigned dst_bit_size)
-{
-   if (src_bit_size == dst_bit_size)
-      return nir_vec(b, src_comps, num_src_comps);
-   else if (src_bit_size > dst_bit_size)
-      return nir_extract_bits(b, src_comps, num_src_comps, 0, src_bit_size * num_src_comps / dst_bit_size, dst_bit_size);
-
-   unsigned num_dst_comps = DIV_ROUND_UP(num_src_comps * src_bit_size, dst_bit_size);
-   unsigned comps_per_dst = dst_bit_size / src_bit_size;
-   nir_def *dst_comps[4];
-
-   for (unsigned i = 0; i < num_dst_comps; i++) {
-      unsigned src_offs = i * comps_per_dst;
-
-      dst_comps[i] = nir_u2uN(b, src_comps[src_offs], dst_bit_size);
-      for (unsigned j = 1; j < comps_per_dst && src_offs + j < num_src_comps; j++) {
-         nir_def *tmp = nir_ishl_imm(b, nir_u2uN(b, src_comps[src_offs + j], dst_bit_size),
-                                     j * src_bit_size);
-         dst_comps[i] = nir_ior(b, dst_comps[i], tmp);
-      }
-   }
-
-   return nir_vec(b, dst_comps, num_dst_comps);
-}
-
-static bool
-lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
-{
-   unsigned bit_size = intr->def.bit_size;
-   unsigned num_components = intr->def.num_components;
-   unsigned num_bits = num_components * bit_size;
-
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_def *offset = intr->src[0].ssa;
-   if (intr->intrinsic == nir_intrinsic_load_shared)
-      offset = nir_iadd_imm(b, offset,
-                            nir_intrinsic_base(intr));
-   else
-      offset = nir_u2u32(b, offset);
-   nir_def *index = nir_ushr_imm(b, offset, 2);
-   nir_def *comps[NIR_MAX_VEC_COMPONENTS];
-   nir_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];
-
-   /* We need to split loads in 32-bit accesses because the buffer
-    * is an i32 array and DXIL does not support type casts.
-    */
-   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
-   for (unsigned i = 0; i < num_32bit_comps; i++)
-      comps_32bit[i] = nir_load_array_var(b, var, nir_iadd_imm(b, index, i));
-   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);
-
-   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
-      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
-      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
-      nir_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);
-
-      /* If we have 16 bits or less to load we need to adjust the u32 value so
-       * we can always extract the LSB.
-       */
-      if (num_bits <= 16) {
-         nir_def *shift =
-            nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
-         vec32 = nir_ushr(b, vec32, shift);
-      }
-
-      /* And now comes the pack/unpack step to match the original type. */
-      unsigned dest_index = i * 32 / bit_size;
-      nir_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, num_dest_comps, bit_size);
-      for (unsigned comp = 0; comp < num_dest_comps; ++comp, ++dest_index)
-         comps[dest_index] = nir_channel(b, temp_vec, comp);
-   }
-
-   nir_def *result = nir_vec(b, comps, num_components);
-   nir_def_replace(&intr->def, result);
-
-   return true;
-}
-
-static void
-lower_masked_store_vec32(nir_builder *b, nir_def *offset, nir_def *index,
-                         nir_def *vec32, unsigned num_bits, nir_variable *var, unsigned alignment)
-{
-   nir_def *mask = nir_imm_int(b, (1 << num_bits) - 1);
-
-   /* If we have small alignments, we need to place them correctly in the u32 component. */
-   if (alignment <= 2) {
-      nir_def *shift =
-         nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
-
-      vec32 = nir_ishl(b, vec32, shift);
-      mask = nir_ishl(b, mask, shift);
-   }
-
-   if (var->data.mode == nir_var_mem_shared) {
-      /* Use the dedicated masked intrinsic */
-      nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
-      nir_deref_atomic(b, 32, &deref->def, nir_inot(b, mask), .atomic_op = nir_atomic_op_iand);
-      nir_deref_atomic(b, 32, &deref->def, vec32, .atomic_op = nir_atomic_op_ior);
-   } else {
-      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
-      nir_def *load = nir_load_array_var(b, var, index);
-
-      nir_def *new_val = nir_ior(b, vec32,
-                                 nir_iand(b,
-                                          nir_inot(b, mask),
-                                          load));
-
-      nir_store_array_var(b, var, index, new_val, 1);
-   }
-}
-
-static bool
-lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
-{
-   unsigned num_components = nir_src_num_components(intr->src[0]);
-   unsigned bit_size = nir_src_bit_size(intr->src[0]);
-   unsigned num_bits = num_components * bit_size;
-
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_def *offset = intr->src[1].ssa;
-   if (intr->intrinsic == nir_intrinsic_store_shared)
-      offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr));
-   else
-      offset = nir_u2u32(b, offset);
-   nir_def *comps[NIR_MAX_VEC_COMPONENTS];
-
-   unsigned comp_idx = 0;
-   for (unsigned i = 0; i < num_components; i++)
-      comps[i] = nir_channel(b, intr->src[0].ssa, i);
-
-   unsigned step = MAX2(bit_size, 32);
-   for (unsigned i = 0; i < num_bits; i += step) {
-      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store.
-       */
-      unsigned substore_num_bits = MIN2(num_bits - i, step);
-      nir_def *local_offset = nir_iadd_imm(b, offset, i / 8);
-      nir_def *vec32 = load_comps_to_vec(b, bit_size, &comps[comp_idx],
-                                         substore_num_bits / bit_size, 32);
-      nir_def *index = nir_ushr_imm(b, local_offset, 2);
-
-      /* For anything less than 32bits we need to use the masked version of the
-       * intrinsic to preserve data living in the same 32bit slot. */
-      if (substore_num_bits < 32) {
-         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, var, nir_intrinsic_align(intr));
-      } else {
-         for (unsigned i = 0; i < vec32->num_components; ++i)
-            nir_store_array_var(b, var, nir_iadd_imm(b, index, i), nir_channel(b, vec32, i), 1);
-      }
-
-      comp_idx += substore_num_bits / bit_size;
-   }
-
-   nir_instr_remove(&intr->instr);
-
-   return true;
-}
-
 #define CONSTANT_LOCATION_UNVISITED 0
 #define CONSTANT_LOCATION_VALID 1
 #define CONSTANT_LOCATION_INVALID 2
@@ -654,83 +494,134 @@ dxil_nir_remove_oob_array_accesses(nir_shader *shader)
                                       NULL);
 }
 
-static bool
-lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
+/*
+ * This pass operates only on 32-bit scalars, so this callback instructs
+ * nir_lower_mem_access_bit_sizes to turn all shared access into 32-bit
+ * scalars. We don't want to use 8-bit accesses, since the resulting
+ * pack/unpack code would be challenging to optimize on some drivers.
+ * Accesses larger than 32 bits, however, require nontrivial tracking to
+ * extract/insert components. Since nir_lower_mem_access_bit_sizes already
+ * has that code, we use it in this pass instead of NIH'ing it here.
+ */
+static nir_mem_access_size_align
+mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size,
+              uint32_t align, uint32_t align_offset, bool offset_is_const,
+              enum gl_access_qualifier access, const void *cb_data)
 {
+   return (nir_mem_access_size_align){
+      .num_components = 1,
+      .bit_size = 32,
+      .align = 4,
+      .shift = nir_mem_access_shift_method_scalar,
+   };
+}
+
+/*
+ * Thanks to nir_lower_mem_access_bit_sizes, we can lower shared intrinsics 1:1
+ * to word-based array access.
+ */
+static bool
+lower_shared_to_var(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   nir_variable *shared = data;
    b->cursor = nir_before_instr(&intr->instr);
 
-   nir_def *offset =
-      nir_iadd_imm(b, intr->src[0].ssa, nir_intrinsic_base(intr));
-   nir_def *index = nir_ushr_imm(b, offset, 2);
+   switch (intr->intrinsic) {
+   case nir_intrinsic_store_shared: {
+      nir_def *index = nir_udiv_aligned_4(b, intr->src[1].ssa);
+      nir_def *value = intr->src[0].ssa;
 
-   nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
-   nir_def *result;
-   if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
-      result = nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
-                                     .atomic_op = nir_intrinsic_atomic_op(intr));
-   else
-      result = nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
-                                .atomic_op = nir_intrinsic_atomic_op(intr));
+      index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
+      nir_store_array_var(b, shared, index, value, nir_component_mask(1));
+      break;
+   }
+   case nir_intrinsic_load_shared: {
+      nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
+
+      index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
+      nir_def_rewrite_uses(&intr->def, nir_load_array_var(b, shared, index));
+      break;
+   }
+   case nir_intrinsic_shared_atomic:
+   case nir_intrinsic_shared_atomic_swap: {
+      nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
+
+      index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
+      nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, shared), index);
+
+      if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
+         nir_def_rewrite_uses(&intr->def, nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
+                                                                .atomic_op = nir_intrinsic_atomic_op(intr)));
+      else
+         nir_def_rewrite_uses(&intr->def, nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
+                                                           .atomic_op = nir_intrinsic_atomic_op(intr)));
+      break;
+   }
+   default:
+      return false;
+   }
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+dxil_nir_lower_shared_to_var(nir_shader *nir)
+{
+   unsigned words = DIV_ROUND_UP(nir->info.shared_size, 4);
+
+   /* Early exit in the common case that shared memory is not used. */
+   if (words == 0) {
+      return false;
+   }
+
+   /* First, lower bit sizes and vectors as required by lower_shared_to_var */
+   nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
+      .modes = nir_var_mem_shared,
+      .callback = mem_access_cb,
+      .may_lower_unaligned_stores_to_atomics = true,
+   };
+   NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
+
+   /* Then, back shared memory with an array of words and turn all shared
+    * access into array access. */
+   const glsl_type *type_ = glsl_array_type(glsl_uint_type(), words, 1);
+   nir_variable *var = nir_variable_create(nir, nir_var_mem_shared, type_, "shared");
+
+   nir_foreach_function_impl(impl, nir) {
+      nir_function_intrinsics_pass(impl, lower_shared_to_var, nir_metadata_control_flow, var);
+   }
+
+   /* After lowering, we've eliminated all shared memory in the shader.
*/ + nir->info.shared_size = 0; + + /* Now clean up the mess we made */ + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_dce); + } while (progress); - nir_def_replace(&intr->def, result); return true; } bool -dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, - const struct dxil_nir_lower_loads_stores_options *options) +dxil_nir_scratch_and_shared_to_dxil(nir_shader *nir) { bool progress = nir_remove_dead_variables(nir, nir_var_function_temp | nir_var_mem_shared, NULL); - nir_variable *shared_var = NULL; - if (nir->info.shared_size) { - shared_var = nir_variable_create(nir, nir_var_mem_shared, - glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->info.shared_size, 4), 4), - "lowered_shared_mem"); - } unsigned ptr_size = nir->info.cs.ptr_size; if (nir->info.stage == MESA_SHADER_KERNEL) { /* All the derefs created here will be used as GEP indices so force 32-bit */ nir->info.cs.ptr_size = 32; } - nir_foreach_function_impl(impl, nir) { - nir_builder b = nir_builder_create(impl); - - nir_variable *scratch_var = NULL; - if (nir->scratch_size) { - const struct glsl_type *scratch_type = glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->scratch_size, 4), 4); - scratch_var = nir_local_variable_create(impl, scratch_type, "lowered_scratch_mem"); - } - - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_shared: - progress |= lower_32b_offset_load(&b, intr, shared_var); - break; - case nir_intrinsic_load_scratch: - progress |= lower_32b_offset_load(&b, intr, scratch_var); - break; - case nir_intrinsic_store_shared: - progress |= lower_32b_offset_store(&b, intr, shared_var); - break; - case nir_intrinsic_store_scratch: - progress |= lower_32b_offset_store(&b, intr, scratch_var); - break; - case nir_intrinsic_shared_atomic: - case nir_intrinsic_shared_atomic_swap: - progress |= lower_shared_atomic(&b, intr, shared_var); - break; - default: - break; - } - } - } - } + progress |= nir_lower_scratch_to_var(nir); + progress |= dxil_nir_lower_shared_to_var(nir); if (nir->info.stage == MESA_SHADER_KERNEL) { nir->info.cs.ptr_size = ptr_size; } diff --git a/src/microsoft/compiler/dxil_nir.h b/src/microsoft/compiler/dxil_nir.h index 0a64633df33..267aa88f19d 100644 --- a/src/microsoft/compiler/dxil_nir.h +++ b/src/microsoft/compiler/dxil_nir.h @@ -41,11 +41,7 @@ bool dxil_nir_flatten_var_arrays(nir_shader *shader, nir_variable_mode modes); bool dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes, unsigned min_bit_size, unsigned max_bit_size); bool dxil_nir_remove_oob_array_accesses(nir_shader *shader); -struct dxil_nir_lower_loads_stores_options { - bool use_16bit_ssbo; -}; -bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader, - const struct dxil_nir_lower_loads_stores_options *options); +bool dxil_nir_scratch_and_shared_to_dxil(nir_shader *shader); bool dxil_nir_lower_deref_ssbo(nir_shader *shader); bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader); bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size); diff --git a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c 
index b749f050c6b..66a996e9f64 100644 --- a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c +++ b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c @@ -1016,6 +1016,7 @@ dxil_spirv_nir_passes(nir_shader *nir, shared_var_info); NIR_PASS(_, nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared); NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); + NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil); } else { NIR_PASS(_, nir, nir_split_struct_vars, nir_var_mem_shared); NIR_PASS(_, nir, dxil_nir_flatten_var_arrays, nir_var_mem_shared); @@ -1100,10 +1101,6 @@ dxil_spirv_nir_passes(nir_shader *nir, NIR_PASS(_, nir, nir_lower_tex, &lower_tex_options); NIR_PASS(_, nir, dxil_nir_split_clip_cull_distance); - const struct dxil_nir_lower_loads_stores_options loads_stores_options = { - .use_16bit_ssbo = conf->shader_model_max >= SHADER_MODEL_6_2, - }; - NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options); NIR_PASS(_, nir, dxil_nir_split_typed_samplers); NIR_PASS(_, nir, dxil_nir_lower_ubo_array_one_to_static); NIR_PASS(_, nir, nir_opt_dce);
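
Reviewer note (not part of the patch): the masked read-modify-write that the deleted lower_masked_store_vec32 open-coded is now delegated to nir_lower_mem_access_bit_sizes, which mem_access_cb steers toward single 32-bit words. For readers unfamiliar with that lowering, below is a minimal standalone C sketch of the underlying word-masking technique; shared_words and store_u16 are hypothetical names for illustration only, not Mesa API.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Shared memory viewed as an array of 32-bit words, mirroring the
 * uint array variable the pass creates. */
static uint32_t shared_words[4];

/* Store a 16-bit value at a byte offset by masking it into the
 * containing word, preserving the neighboring bytes; this is the
 * read-modify-write shape the bit-size lowering produces for
 * sub-word stores. */
static void store_u16(uint32_t byte_offset, uint16_t value)
{
   uint32_t word  = byte_offset / 4;        /* array index */
   uint32_t shift = (byte_offset % 4) * 8;  /* bit position in the word */
   uint32_t mask  = (uint32_t)0xffff << shift;

   shared_words[word] = (shared_words[word] & ~mask) |
                        ((uint32_t)value << shift);
}

int main(void)
{
   shared_words[1] = 0xaaaaaaaa;
   store_u16(6, 0x1234);                        /* upper half of word 1 */
   printf("%08" PRIx32 "\n", shared_words[1]);  /* prints 1234aaaa */
   return 0;
}

Atomicity aside (for shared memory the real lowering can emit iand/ior deref atomics, per may_lower_unaligned_stores_to_atomics), this is the extract/insert bookkeeping the comment above mem_access_cb refers to.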