diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 7424dba5760..06584171c27 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2574,17 +2574,39 @@ ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
         if (nir_src_is_const(instr->src[0])) {
                 int offset = (nir_intrinsic_base(instr) +
                               nir_src_as_uint(instr->src[0]));
-                assert(offset % 4 == 0);
-                /* We need dwords */
-                offset = offset / 4;
-                for (int i = 0; i < instr->num_components; i++) {
-                        ntq_store_dest(c, &instr->dest, i,
-                                       vir_uniform(c, QUNIFORM_UNIFORM,
-                                                   offset + i));
+
+                /* Even though ldunif is strictly 32-bit we can still use it
+                 * to load scalar 16-bit uniforms so long as their offset is
+                 * 32-bit aligned. In this case, ldunif would still load 32-bit
+                 * into the destination with the 16-bit uniform data in the LSB
+                 * and garbage in the MSB, but that is fine because we don't
+                 * access the MSB of a 16-bit register.
+                 *
+                 * FIXME: if in the future we improve our register allocator to
+                 * pack 2 16-bit variables in the MSB and LSB of the same
+                 * register then this optimization would not be valid as is,
+                 * since the load clobbers the MSB.
+                 */
+                if (offset % 4 == 0) {
+                        /* We need dwords */
+                        offset = offset / 4;
+
+                        /* We scalarize general TMU access for anything that
+                         * is not 32-bit.
+                         */
+                        assert(nir_dest_bit_size(instr->dest) == 32 ||
+                               instr->num_components == 1);
+
+                        for (int i = 0; i < instr->num_components; i++) {
+                                ntq_store_dest(c, &instr->dest, i,
+                                               vir_uniform(c, QUNIFORM_UNIFORM,
+                                                           offset + i));
+                        }
+                        return;
                 }
-        } else {
-                ntq_emit_tmu_general(c, instr, false);
         }
+
+        ntq_emit_tmu_general(c, instr, false);
 }
 
 static void
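
For context only (not part of the patch above), here is a minimal standalone C sketch of the point made in the new comment: a 32-bit load whose low 16 bits hold the uniform value is safe as long as consumers only read the 16-bit LSB lane, because the garbage in the MSB half is never observed. The variable names and values here are hypothetical.

/* Illustrative sketch only, not part of the patch. Models a 32-bit
 * ldunif-style result whose low 16 bits (LSB half) hold a 16-bit
 * uniform and whose high 16 bits (MSB half) are garbage.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        const uint16_t uniform16 = 0x1234;        /* the 16-bit uniform (hypothetical value) */
        const uint32_t msb_garbage = 0xdead0000u; /* whatever else shares the dword */

        /* What a 32-bit load of the containing, 32-bit aligned dword returns. */
        uint32_t loaded = msb_garbage | uniform16;

        /* A 16-bit consumer only looks at the low half of the register,
         * so the garbage in the MSB half never affects the result.
         */
        assert((uint16_t)loaded == uniform16);

        return 0;
}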