diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1002901124a..c5b5193ed7c 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1256,10 +1256,6 @@ store("uniform_ir3", [], indices=[BASE]) # vec4's. intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE]) -# DXIL specific intrinsics -# src[] = { value, mask, index, offset }. -intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1]) - # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined # within a blend shader to read/write the raw value from the tile buffer, # without applying any format conversion in the process. If the shader needs diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c index bcd8145eb47..a41718ebfd3 100644 --- a/src/microsoft/clc/clc_compiler.c +++ b/src/microsoft/clc/clc_compiler.c @@ -909,7 +909,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib, NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo); - NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~(nir_var_mem_constant | nir_var_mem_ubo)); + NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared | nir_var_function_temp); assert(nir->info.cs.ptr_size == 64); NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c index 2554d3e5f85..938848406a6 100644 --- a/src/microsoft/compiler/dxil_nir.c +++ b/src/microsoft/compiler/dxil_nir.c @@ -67,188 +67,6 @@ load_comps_to_vec(nir_builder *b, unsigned src_bit_size, return nir_vec(b, dst_comps, num_dst_comps); } -static bool -lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size) -{ - assert(intr->dest.is_ssa); - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); - - b->cursor = nir_before_instr(&intr->instr); - - unsigned src_bit_size = nir_dest_bit_size(intr->dest); - unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32); - unsigned offset_mask = store_bit_size / 8 - 1; - - nir_ssa_def *buffer = intr->src[0].ssa; - nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~offset_mask)); - enum gl_access_qualifier access = nir_intrinsic_access(intr); - unsigned num_components = nir_dest_num_components(intr->dest); - unsigned num_bits = num_components * src_bit_size; - - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - unsigned comp_idx = 0; - - /* We need to split loads in 4-component chunks because that's the optimal - * granularity of bufferLoad(). Minimum alignment is 2-byte. - */ - for (unsigned i = 0; i < num_bits; i += 4 * store_bit_size) { - /* For each 4-component chunk (or smaller) we generate a N-bit ssbo vec load. */ - unsigned subload_num_bits = MIN2(num_bits - i, 4 * store_bit_size); - - /* The number of components to store depends on the number of bytes. */ - nir_ssa_def *result = - nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, store_bit_size), store_bit_size, - buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)), - .align_mul = store_bit_size / 8, - .align_offset = 0, - .access = access); - - /* If we have an unaligned load we need to adjust the result value so - * we can always extract the LSB. - */ - if (nir_intrinsic_align(intr) < store_bit_size / 8) { - nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)), - nir_imm_int(b, 8)); - result = nir_ushr(b, result, shift); - } - - /* And now comes the pack/unpack step to match the original type. 
*/ - nir_ssa_def *temp_vec = nir_extract_bits(b, &result, 1, 0, subload_num_bits / src_bit_size, src_bit_size); - for (unsigned comp = 0; comp < subload_num_bits / src_bit_size; ++comp, ++comp_idx) - comps[comp_idx] = nir_channel(b, temp_vec, comp); - } - - assert(comp_idx == num_components); - nir_ssa_def *result = nir_vec(b, comps, num_components); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); - nir_instr_remove(&intr->instr); - return true; -} - -static bool -lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size) -{ - b->cursor = nir_before_instr(&intr->instr); - - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); - assert(intr->src[2].is_ssa); - - nir_ssa_def *val = intr->src[0].ssa; - nir_ssa_def *buffer = intr->src[1].ssa; - - unsigned src_bit_size = val->bit_size; - unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32); - unsigned masked_store_bit_size = 32; - unsigned num_components = val->num_components; - unsigned num_bits = num_components * src_bit_size; - - unsigned offset_mask = store_bit_size / 8 - 1; - unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1; - nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask)); - nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~masked_store_offset_mask)); - - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 }; - unsigned comp_idx = 0; - - unsigned write_mask = nir_intrinsic_write_mask(intr); - for (unsigned i = 0; i < num_components; i++) - if (write_mask & (1 << i)) - comps[i] = nir_channel(b, val, i); - - /* We split stores in 4-component chunks because that's the optimal granularity - * of bufferStore(). Minimum alignment is 2-byte. */ - unsigned bit_offset = 0; - while (true) { - /* Skip over holes in the write mask */ - while (comp_idx < num_components && comps[comp_idx] == NULL) { - comp_idx++; - bit_offset += src_bit_size; - } - if (comp_idx >= num_components) - break; - - /* For each 4-component chunk (or smaller) we generate a ssbo vec - * store. If a component is skipped by the write mask, do a smaller - * sub-store - */ - unsigned num_src_comps_stored = 0, substore_num_bits = 0; - while(num_src_comps_stored + comp_idx < num_components && - substore_num_bits + bit_offset < num_bits && - substore_num_bits < 4 * store_bit_size && - comps[comp_idx + num_src_comps_stored]) { - ++num_src_comps_stored; - substore_num_bits += src_bit_size; - } - bool force_masked = false; - if (substore_num_bits > store_bit_size && - substore_num_bits % store_bit_size != 0) { - /* Split this into two, one unmasked store of the first bits, - * and then the second loop iteration will handle a masked store - * for the rest. */ - assert(num_src_comps_stored == 3); - if (store_bit_size == 16) { - assert(substore_num_bits < 32); - /* If we're already doing atomics to store, just do one - * 32bit masked store instead of a 16bit store and a masked - * store for the other 8 bits. */ - force_masked = true; - } else { - --num_src_comps_stored; - substore_num_bits = store_bit_size; - } - } - nir_intrinsic_instr *store; - - if (substore_num_bits < store_bit_size || force_masked) { - nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx], - num_src_comps_stored, masked_store_bit_size); - nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, masked_store_bit_size); - - /* If we have small alignments we need to place them correctly in the component. 
*/ - if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) { - nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, masked_store_offset_mask)); - nir_ssa_def *shift = nir_imul_imm(b, pos, 8); - - store_vec = nir_ishl(b, store_vec, shift); - mask = nir_ishl(b, mask, shift); - } - - nir_ssa_def *local_offset = nir_iadd(b, masked_offset, nir_imm_int(b, bit_offset / 8)); - store = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_store_ssbo_masked_dxil); - store->src[0] = nir_src_for_ssa(store_vec); - store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); - store->src[2] = nir_src_for_ssa(buffer); - store->src[3] = nir_src_for_ssa(local_offset); - } else { - nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8)); - nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx], - num_src_comps_stored, store_bit_size); - store = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_store_ssbo); - store->src[0] = nir_src_for_ssa(store_vec); - store->src[1] = nir_src_for_ssa(buffer); - store->src[2] = nir_src_for_ssa(local_offset); - - nir_intrinsic_set_align(store, store_bit_size / 8, 0); - } - - /* The number of components to store depends on the number of bits. */ - store->num_components = DIV_ROUND_UP(substore_num_bits, store_bit_size); - nir_builder_instr_insert(b, &store->instr); - comp_idx += num_src_comps_stored; - bit_offset += substore_num_bits; - - if (nir_intrinsic_has_write_mask(store)) - nir_intrinsic_set_write_mask(store, (1 << store->num_components) - 1); - } - - nir_instr_remove(&intr->instr); - return true; -} - static bool lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { @@ -872,18 +690,12 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, case nir_intrinsic_load_scratch: progress |= lower_32b_offset_load(&b, intr, scratch_var); break; - case nir_intrinsic_load_ssbo: - progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32); - break; case nir_intrinsic_store_shared: progress |= lower_32b_offset_store(&b, intr, shared_var); break; case nir_intrinsic_store_scratch: progress |= lower_32b_offset_store(&b, intr, scratch_var); break; - case nir_intrinsic_store_ssbo: - progress |= lower_store_ssbo(&b, intr, options->use_16bit_ssbo ? 
16 : 32); - break; case nir_intrinsic_shared_atomic: case nir_intrinsic_shared_atomic_swap: progress |= lower_shared_atomic(&b, intr, shared_var); diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c index ae561b1e254..484ae9b0fd7 100644 --- a/src/microsoft/compiler/nir_to_dxil.c +++ b/src/microsoft/compiler/nir_to_dxil.c @@ -3531,32 +3531,6 @@ emit_store_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr) emit_bufferstore_call(ctx, handle, coord, value, write_mask, overload); } -static bool -emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr) -{ - const struct dxil_value *value = - get_src(ctx, &intr->src[0], 0, nir_type_uint); - const struct dxil_value *mask = - get_src(ctx, &intr->src[1], 0, nir_type_uint); - const struct dxil_value* handle = get_resource_handle(ctx, &intr->src[2], DXIL_RESOURCE_CLASS_UAV, DXIL_RESOURCE_KIND_RAW_BUFFER); - const struct dxil_value *offset = - get_src(ctx, &intr->src[3], 0, nir_type_uint); - if (!value || !mask || !handle || !offset) - return false; - - const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod); - if (!int32_undef) - return false; - - const struct dxil_value *coord[3] = { - offset, int32_undef, int32_undef - }; - - return - emit_atomic_binop(ctx, handle, DXIL_ATOMIC_AND, coord, mask) != NULL && - emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL; -} - static bool emit_load_ubo_vec4(struct ntd_context *ctx, nir_intrinsic_instr *intr) { @@ -4833,8 +4807,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) return emit_load_ssbo(ctx, intr); case nir_intrinsic_store_ssbo: return emit_store_ssbo(ctx, intr); - case nir_intrinsic_store_ssbo_masked_dxil: - return emit_store_ssbo_masked(ctx, intr); case nir_intrinsic_load_deref: return emit_load_deref(ctx, intr); case nir_intrinsic_store_deref: @@ -6217,18 +6189,49 @@ lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin, const void *cb_data) { const struct lower_mem_bit_sizes_data *data = cb_data; - unsigned max_bit_size = data->nir_options->lower_int64_options ? 32 : 64; + unsigned max_bit_size = 32; unsigned min_bit_size = data->dxil_options->lower_int16 ? 32 : 16; unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in)); - /* UBO loads can be done at whatever (supported) bit size, but require 16 byte - * alignment and can load up to 16 bytes per instruction. However this pass requires - * loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4 - * which can deal with unaligned vec4s, so for this pass let's just deal with bit size - * and total size restrictions. */ + if (intrin == nir_intrinsic_load_ubo) { + /* UBO loads can be done at whatever (supported) bit size, but require 16 byte + * alignment and can load up to 16 bytes per instruction. However this pass requires + * loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4 + * which can deal with unaligned vec4s, so for this pass let's just deal with bit size + * and total size restrictions. 
*/ + return (nir_mem_access_size_align) { + .align = closest_bit_size / 8, + .bit_size = closest_bit_size, + .num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size), + }; + } + + assert(intrin == nir_intrinsic_load_ssbo || intrin == nir_intrinsic_store_ssbo); + uint32_t align = nir_combined_align(align_mul, align_offset); + if (align < min_bit_size / 8) { + /* Unaligned load/store, use the minimum bit size, up to 4 components */ + unsigned ideal_num_components = intrin == nir_intrinsic_load_ssbo ? + DIV_ROUND_UP(bytes * 8, min_bit_size) : + (bytes * 8 / min_bit_size); + return (nir_mem_access_size_align) { + .align = min_bit_size / 8, + .bit_size = min_bit_size, + .num_components = MIN2(4, ideal_num_components), + }; + } + + /* Increase/decrease bit size to try to get closer to the requested byte size/align */ + unsigned bit_size = closest_bit_size; + unsigned target = MIN2(bytes, align); + while (target < bit_size / 8 && bit_size > min_bit_size) + bit_size /= 2; + while (target > bit_size / 8 * 4 && bit_size < max_bit_size) + bit_size *= 2; + + /* This is the best we can do */ return (nir_mem_access_size_align) { - .align = closest_bit_size / 8, - .bit_size = closest_bit_size, - .num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size), + .align = bit_size / 8, + .bit_size = bit_size, + .num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, bit_size)), }; } @@ -6540,12 +6543,12 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts, * might be too opaque for the pass to see that they're next to each other. */ optimize_nir(s, opts); - /* Vectorize UBO accesses aggressively. This can help increase alignment to enable us to do better + /* Vectorize UBO/SSBO accesses aggressively. This can help increase alignment to enable us to do better * chunking of loads and stores after lowering bit sizes. Ignore load/store size limitations here, we'll * address them with lower_mem_access_bit_sizes */ nir_load_store_vectorize_options vectorize_opts = { .callback = vectorize_filter, - .modes = nir_var_mem_ubo, + .modes = nir_var_mem_ubo | nir_var_mem_ssbo, }; NIR_PASS_V(s, nir_opt_load_store_vectorize, &vectorize_opts); @@ -6553,8 +6556,9 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts, * a single load/store op. */ struct lower_mem_bit_sizes_data mem_size_data = { s->options, opts }; nir_lower_mem_access_bit_sizes_options mem_size_options = { - .modes = nir_var_mem_ubo, + .modes = nir_var_mem_ubo | nir_var_mem_ssbo, .callback = lower_mem_access_bit_sizes_cb, + .may_lower_unaligned_stores_to_atomics = true, .cb_data = &mem_size_data }; NIR_PASS_V(s, nir_lower_mem_access_bit_sizes, &mem_size_options); diff --git a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c index 556c4ed967f..25a43e45a96 100644 --- a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c +++ b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c @@ -1048,7 +1048,6 @@ dxil_spirv_nir_passes(nir_shader *nir, } NIR_PASS_V(nir, nir_opt_deref); - NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ssbo); if (conf->inferred_read_only_images_as_srvs) { const nir_opt_access_options opt_access_options = {
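
The new lower_mem_access_bit_sizes_cb logic can be hard to read straight out of the diff, so here is a minimal standalone sketch of its SSBO branch. It is plain C with local stand-ins for Mesa's MIN2/MAX2/DIV_ROUND_UP macros and for nir_mem_access_size_align; the struct and function names (size_align, pick_ssbo_access) are invented for illustration, and it assumes a load (stores only differ in how the unaligned component count is rounded).

#include <stdio.h>

/* Stand-ins for Mesa's util macros, for this sketch only. */
#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Stand-in for nir_mem_access_size_align. */
struct size_align {
   unsigned num_components;
   unsigned bit_size;
   unsigned align; /* in bytes */
};

/* Mirrors the SSBO path of the new callback for a load. */
static struct size_align
pick_ssbo_access(unsigned bytes, unsigned align, unsigned bit_size_in,
                 unsigned min_bit_size, unsigned max_bit_size)
{
   unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));

   if (align < min_bit_size / 8) {
      /* Unaligned: fall back to the minimum bit size, at most 4 components;
       * the pass shifts/realigns the data and turns unaligned stores into
       * atomics (may_lower_unaligned_stores_to_atomics). */
      return (struct size_align) {
         .num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, min_bit_size)),
         .bit_size = min_bit_size,
         .align = min_bit_size / 8,
      };
   }

   /* Aligned: walk the bit size toward MIN2(bytes, align) so one access never
    * needs more alignment than we have, nor more than 4 components. */
   unsigned bit_size = closest_bit_size;
   unsigned target = MIN2(bytes, align);
   while (target < bit_size / 8 && bit_size > min_bit_size)
      bit_size /= 2;
   while (target > bit_size / 8 * 4 && bit_size < max_bit_size)
      bit_size *= 2;

   return (struct size_align) {
      .num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, bit_size)),
      .bit_size = bit_size,
      .align = bit_size / 8,
   };
}

int main(void)
{
   /* 12 bytes at 4-byte alignment, 32-bit data -> one 3x32-bit access. */
   struct size_align a = pick_ssbo_access(12, 4, 32, 16, 32);
   /* 16 bytes at only 2-byte alignment -> a 4x16-bit chunk; later iterations
    * of the pass cover the remaining bytes. */
   struct size_align b = pick_ssbo_access(16, 2, 32, 16, 32);
   printf("%ux%u (align %u), %ux%u (align %u)\n",
          a.num_components, a.bit_size, a.align,
          b.num_components, b.bit_size, b.align);
   return 0;
}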
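
For context on why dropping store_ssbo_masked_dxil and emit_store_ssbo_masked is safe: that path wrote sub-word data with a raw-buffer ATOMIC_AND of the inverted mask followed by an ATOMIC_OR of the shifted value, and nir_lower_mem_access_bit_sizes now requests the equivalent lowering through .may_lower_unaligned_stores_to_atomics. Below is a minimal sketch of that read-modify-write idiom on an ordinary 32-bit word, using plain C11 atomics; the function name masked_store32 is invented for illustration and bits is assumed to be less than 32.

#include <stdint.h>
#include <stdatomic.h>

/* Store 'value' (occupying 'bits' bits, bits < 32) at byte 'offset_in_word'
 * of a 32-bit word without disturbing the neighbouring bytes, using the same
 * AND-with-inverted-mask / OR-with-value pair the removed DXIL path emitted. */
static void
masked_store32(_Atomic uint32_t *word, unsigned offset_in_word,
               uint32_t value, unsigned bits)
{
   unsigned shift = offset_in_word * 8;
   uint32_t mask = ((1u << bits) - 1u) << shift;

   atomic_fetch_and(word, ~mask);                   /* clear destination bits */
   atomic_fetch_or(word, (value << shift) & mask);  /* merge in the new bits  */
}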