diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index f721e218541..0389682ff0f 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -2324,34 +2324,17 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, nir_intrin
    LLVMValueRef offset = get_src(ctx, instr->src[1]);
    int num_components = instr->num_components;
 
+   assert(instr->dest.ssa.bit_size >= 32 && instr->dest.ssa.bit_size % 32 == 0);
+
    if (ctx->abi->load_ubo)
       rsrc = ctx->abi->load_ubo(ctx->abi, rsrc);
 
-   /* Convert to a scalar 32-bit load. */
+   /* Convert to a 32-bit load. */
    if (instr->dest.ssa.bit_size == 64)
       num_components *= 2;
-   else if (instr->dest.ssa.bit_size == 16)
-      num_components = DIV_ROUND_UP(num_components, 2);
-   else if (instr->dest.ssa.bit_size == 8)
-      num_components = DIV_ROUND_UP(num_components, 4);
 
-   ret =
-      ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, NULL,
-                           ctx->ac.f32, 0, true, true);
-
-   /* Convert to the original type. */
-   if (instr->dest.ssa.bit_size == 64) {
-      ret = LLVMBuildBitCast(ctx->ac.builder, ret,
-                             LLVMVectorType(ctx->ac.i64, num_components / 2), "");
-   } else if (instr->dest.ssa.bit_size == 16) {
-      ret = LLVMBuildBitCast(ctx->ac.builder, ret,
-                             LLVMVectorType(ctx->ac.i16, num_components * 2), "");
-   } else if (instr->dest.ssa.bit_size == 8) {
-      ret = LLVMBuildBitCast(ctx->ac.builder, ret,
-                             LLVMVectorType(ctx->ac.i8, num_components * 4), "");
-   }
-
-   ret = ac_trim_vector(&ctx->ac, ret, instr->num_components);
+   ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, NULL,
+                              ctx->ac.f32, 0, true, true);
    ret = LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
 
    return exit_waterfall(ctx, &wctx, ret);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3382e416679..f16b21706a4 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3182,6 +3182,12 @@ radv_postprocess_nir(struct radv_pipeline *pipeline,
       }
    }
 
+   NIR_PASS(_, stage->nir, ac_nir_lower_subdword_loads,
+            (ac_nir_lower_subdword_options) {
+               .modes_1_comp = nir_var_mem_ubo,
+               .modes_N_comps = nir_var_mem_ubo
+            });
+
    progress = false;
    NIR_PASS(progress, stage->nir, nir_vk_lower_ycbcr_tex, ycbcr_conversion_lookup, pipeline_layout);
    /* Gather info in the case that nir_vk_lower_ycbcr_tex might have emitted resinfo instructions. */
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 19786d0dcbb..624e4116472 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -25,6 +25,7 @@
 #include "nir_builder.h"
 #include "nir_xfb_info.h"
 #include "si_pipe.h"
+#include "ac_nir.h"
 
 
 static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
@@ -358,6 +359,11 @@ char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
 
    nir_lower_io_passes(nir);
 
+   NIR_PASS_V(nir, ac_nir_lower_subdword_loads,
+              (ac_nir_lower_subdword_options) {
+                 .modes_1_comp = nir_var_mem_ubo,
+                 .modes_N_comps = nir_var_mem_ubo
+              });
    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);
 
    /* Remove dead derefs, so that we can remove uniforms. */