diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index f5fc109d7cd..85279bb22f3 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -69,6 +69,14 @@ is_output(nir_intrinsic_instr *intrin)
 struct brw_lower_urb_cb_data {
    const struct intel_device_info *devinfo;
 
+   /* If true, all access is guaranteed to be vec4 (128-bit) aligned.
+    * offset and base are in units of 128-bit vec4 slots.
+    *
+    * If false, all access is guaranteed to be 32-bit aligned.
+    * offset is in 32-bit units, but base is still in 128-bit vec4 units.
+    */
+   bool vec4_access;
+
    /** Map from VARYING_SLOT_* to a vec4 slot index */
    const int8_t *varying_to_slot;
 
@@ -83,6 +91,23 @@ struct brw_lower_urb_cb_data {
    int tes_per_patch_slots;
 };
 
+/**
+ * Given an URB offset in 32-bit units, determine whether (offset % 4)
+ * is statically known. If so, add this to the value of first_component.
+ */
+static bool
+io_vec4_static_mod(nir_def *offset_32b, unsigned *first_component)
+{
+   unsigned mod;
+   const bool mod_known =
+      nir_mod_analysis(nir_get_scalar(offset_32b, 0), nir_type_uint, 4, &mod);
+
+   if (mod_known)
+      *first_component += mod;
+
+   return mod_known;
+}
+
 static unsigned
 io_component(nir_intrinsic_instr *io)
 {
@@ -115,6 +140,10 @@ urb_offset(nir_builder *b,
 {
    nir_def *offset = nir_get_io_offset_src(io)->ssa;
 
+   /* Convert vec4 slot offset to 32-bit dwords */
+   if (!cb_data->vec4_access)
+      offset = nir_ishl_imm(b, offset, 2);
+
    nir_src *index = nir_get_io_arrayed_index_src(io);
    if (index) {
       nir_def *stride = cb_data->dynamic_tes
@@ -156,24 +185,51 @@ load_urb(nir_builder *b,
    const unsigned base = io_base_slot(intrin, cb_data);
 
    if (devinfo->ver >= 20) {
-      nir_def *addr = nir_iadd(b, handle, nir_ishl_imm(b, offset, 4));
-      return nir_load_urb_lsc_intel(b, intrin->def.num_components, bits, addr,
+      offset = nir_ishl_imm(b, offset, cb_data->vec4_access ? 4 : 2);
+      return nir_load_urb_lsc_intel(b, intrin->def.num_components, bits,
+                                    nir_iadd(b, handle, offset),
                                     16 * base + 4 * io_component(intrin),
                                     .access = access);
    }
 
-   /* Load a whole vec4 and return the desired portion */
-   const unsigned first_component = io_component(intrin);
-   const unsigned components = intrin->def.num_components + first_component;
-   assert(components <= 4);
+   /* Load a whole vec4 or vec8 and return the desired portion */
+   unsigned first_component = io_component(intrin);
+   nir_component_mask_t mask = nir_component_mask(intrin->def.num_components);
+
+   /* If the offset is in vec4 units, do a straightforward load */
+   if (cb_data->vec4_access) {
+      assert(intrin->def.num_components <= 4);
+      nir_def *load =
+         nir_load_urb_vec4_intel(b, 4, bits, handle, offset,
+                                 .base = base, .access = access);
+      return nir_channels(b, load, mask << first_component);
+   }
+
+   /* Otherwise, the offset is in 32-bit units. Split it into a vec4-aligned
+    * slot offset and a 32-bit component offset.
+    */
+   nir_def *mod = nir_iand_imm(b, offset, 0x3);
+   nir_def *vec4_offset = nir_ishr_imm(b, offset, 2);
+
+   const bool static_mod = io_vec4_static_mod(offset, &first_component);
+   const bool single_vec4 = (static_mod || intrin->def.num_components == 1)
+      && first_component + intrin->def.num_components <= 4;
 
    nir_def *load =
-      nir_load_urb_vec4_intel(b, components, bits, handle, offset,
-                              .base = base, .access = access);
-   nir_component_mask_t mask =
-      nir_component_mask(intrin->def.num_components) << first_component;
+      nir_load_urb_vec4_intel(b, single_vec4 ? 4 : 8, bits, handle,
+                              vec4_offset, .base = base, .access = access);
 
-   return nir_channels(b, load, mask);
+   if (static_mod) {
+      return nir_channels(b, load, mask << first_component);
+   } else {
+      nir_def *comps[NIR_MAX_VEC_COMPONENTS];
+      for (unsigned i = 0; i < intrin->def.num_components; i++) {
+         comps[i] =
+            nir_vector_extract(b, load,
+                               nir_iadd_imm(b, mod, first_component + i));
+      }
+      return nir_vec(b, comps, intrin->def.num_components);
+   }
 }
 
 static void
@@ -185,13 +241,15 @@ store_urb(nir_builder *b,
 {
    const struct intel_device_info *devinfo = cb_data->devinfo;
    const unsigned base = io_base_slot(intrin, cb_data);
+   unsigned first_component = io_component(intrin);
 
    nir_def *src = intrin->src[0].ssa;
 
    unsigned mask = nir_intrinsic_write_mask(intrin);
 
    if (devinfo->ver >= 20) {
-      nir_def *addr = nir_iadd(b, urb_handle, nir_ishl_imm(b, offset, 4));
+      offset = nir_ishl_imm(b, offset, cb_data->vec4_access ? 4 : 2);
+      nir_def *addr = nir_iadd(b, urb_handle, offset);
       while (mask) {
          int start, count;
         u_bit_scan_consecutive_range(&mask, &start, &count);
@@ -203,18 +261,42 @@ store_urb(nir_builder *b,
          nir_store_urb_lsc_intel(b, nir_channels(b, src, cur_mask), addr,
                                  .base = cur_base);
       }
-   } else {
-      const unsigned first_component = io_component(intrin);
-      if (first_component) {
-         const unsigned components = src->num_components + first_component;
-         assert(components <= 4);
-
-         mask <<= first_component;
-         src = nir_shift_channels(b, src, first_component, components);
-      }
-      nir_store_urb_vec4_intel(b, src, urb_handle, offset,
-                               nir_imm_int(b, mask), .base = base);
+      return;
    }
+
+   nir_def *channel_mask = nir_imm_int(b, mask);
+
+   const bool static_mod = cb_data->vec4_access ||
+      io_vec4_static_mod(offset, &first_component);
+
+   if (static_mod) {
+      src = nir_shift_channels(b, src, first_component,
+                               align(src->num_components + first_component, 4));
+      channel_mask = nir_ishl_imm(b, channel_mask, first_component);
+   } else {
+      offset = nir_iadd_imm(b, offset, first_component);
+
+      nir_def *undef = nir_undef(b, 1, src->bit_size);
+      nir_def *mod = nir_iand_imm(b, offset, 0x3);
+      channel_mask = nir_ishl(b, channel_mask, mod);
+
+      nir_def *comps[8];
+      for (unsigned i = 0; i < 8; i++) {
+         nir_def *cond = nir_i2b(b, nir_iand_imm(b, channel_mask, 1u << i));
+         nir_def *src_idx = nir_imax_imm(b, nir_isub_imm(b, i, mod), 0);
+         nir_def *src_comp = src->num_components == 1 ? src :
+            nir_vector_extract(b, src, src_idx);
+
+         comps[i] = nir_bcsel(b, cond, src_comp, undef);
+      }
+      src = nir_vec(b, comps, 8);
+   }
+
+   nir_def *vec4_offset =
+      cb_data->vec4_access ? offset : nir_ishr_imm(b, offset, 2);
+
+   nir_store_urb_vec4_intel(b, src, urb_handle, vec4_offset, channel_mask,
+                            .base = base);
 }
 
 static nir_def *
@@ -248,7 +330,7 @@ try_load_push_input(nir_builder *b,
                     nir_intrinsic_instr *io,
                     nir_def *offset)
 {
-   if (!nir_def_is_const(offset))
+   if (!nir_def_is_const(offset) || !cb_data->vec4_access)
       return NULL;
 
    const unsigned base = io_base_slot(io, cb_data) +
@@ -839,6 +921,7 @@ brw_nir_lower_tes_inputs(nir_shader *nir,
 
    const struct brw_lower_urb_cb_data cb_data = {
       .devinfo = devinfo,
+      .vec4_access = true,
      .varying_to_slot = vue_map->varying_to_slot,
      .per_vertex_stride = vue_map->num_per_vertex_slots * 16,
      .dynamic_tes = vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE,
@@ -1116,6 +1199,7 @@ brw_nir_lower_tcs_inputs(nir_shader *nir,
 
    const struct brw_lower_urb_cb_data cb_data = {
       .devinfo = devinfo,
+      .vec4_access = true,
      .varying_to_slot = input_vue_map->varying_to_slot,
   };
   NIR_PASS(_, nir, lower_inputs_to_urb_intrinsics, &cb_data);
@@ -1142,6 +1226,7 @@ brw_nir_lower_tcs_outputs(nir_shader *nir,
 
    const struct brw_lower_urb_cb_data cb_data = {
      .devinfo = devinfo,
+      .vec4_access = true,
      .varying_to_slot = vue_map->varying_to_slot,
      .per_vertex_stride = vue_map->num_per_vertex_slots * 16,
   };
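
A note on the arithmetic above, for review purposes: when cb_data->vec4_access is false, load_urb() splits the dword offset into a vec4 slot and a starting component, folding in any modulo that io_vec4_static_mod() can prove. Below is a minimal standalone sketch of that split using plain integers; it is not part of the patch, and the offset and component values are made up for illustration.

/* Illustrative sketch, not part of the patch: the dword-offset split that
 * load_urb() builds as NIR instructions, done here with plain integers.
 * The offset and component values below are made-up examples.
 */
#include <stdio.h>

int
main(void)
{
   const unsigned num_components = 2;  /* a hypothetical vec2 load */
   unsigned first_component = 2;       /* io_component() of the intrinsic */
   const unsigned dword_offset = 9;    /* URB offset in 32-bit units */

   /* Mirrors nir_ishr_imm(b, offset, 2) and nir_iand_imm(b, offset, 0x3) */
   const unsigned vec4_offset = dword_offset >> 2;  /* slot index: 2 */
   const unsigned mod = dword_offset & 0x3;         /* component in slot: 1 */

   /* When nir_mod_analysis() proves the modulo, io_vec4_static_mod() folds
    * it into first_component; a single vec4 load suffices only if the
    * access still fits in four components, otherwise a vec8 is loaded.
    */
   first_component += mod;  /* 3 */
   const int single_vec4 = first_component + num_components <= 4;

   printf("slot %u, start component %u, %s load\n",
          vec4_offset, first_component, single_vec4 ? "vec4" : "vec8");
   return 0;
}

With a dword offset of 9 and first_component of 2, the access starts at component 3 of slot 2 and spans two components, so it crosses a vec4 boundary; that is exactly the case the vec8 load in the patch covers.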
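
The dynamic path of store_urb() scatters the source value into an 8-wide vector guarded by a shifted channel mask (first_component is folded into the offset before the modulo is taken). The following rough integer model of that selection is again not part of the patch; the source values and the modulo are made up, and a full write mask is assumed.

/* Illustrative sketch, not part of the patch: the per-component selection
 * that the dynamic path of store_urb() expresses with nir_bcsel() and
 * nir_vector_extract(), modelled with plain integers.
 */
#include <stdio.h>

int
main(void)
{
   const unsigned src[4] = { 10, 11, 12, 13 };  /* a hypothetical vec4 store */
   const unsigned num_components = 4;
   const unsigned mod = 3;  /* (offset + first_component) % 4, dynamic in NIR */

   /* channel_mask = nir_ishl(b, write_mask, mod), here with a full mask */
   const unsigned channel_mask = ((1u << num_components) - 1) << mod;

   for (unsigned i = 0; i < 8; i++) {
      if (channel_mask & (1u << i)) {
         /* src_idx = nir_imax_imm(b, nir_isub_imm(b, i, mod), 0) */
         const unsigned src_idx = i >= mod ? i - mod : 0;
         printf("vec8 component %u <- src[%u] = %u\n", i, src_idx, src[src_idx]);
      } else {
         printf("vec8 component %u <- undef (masked off)\n", i);
      }
   }
   return 0;
}

In the real lowering, the masked-off lanes carry an undef value and the URB write mask keeps them from being written.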