diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 6ea0e8b5679..116273dd07a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -647,6 +647,12 @@ struct v3d_compile {
          */
         bool disable_tmu_pipelining;
 
+        /* Disable sorting of UBO loads with constant offset. This may
+         * increase the chances of being able to compile shaders with high
+         * register pressure.
+         */
+        bool disable_constant_ubo_load_sorting;
+
         /* Emits ldunif for each new uniform, even if the uniform was already
          * emitted in the same block. Useful to compile shaders with high
          * register pressure or to disable the optimization during uniform
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 941dc5b0fbc..5468364a6b0 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  void *debug_output_data,
                  int program_id, int variant_id,
                  uint32_t min_threads_for_reg_alloc,
+                 bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
                  bool fallback_scheduler)
 {
@@ -543,6 +544,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->fallback_scheduler = fallback_scheduler;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
+        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
 
         s = nir_shader_clone(c, s);
         c->s = s;
@@ -1101,6 +1103,248 @@ should_split_wrmask(const nir_instr *instr, const void *data)
         }
 }
 
+static nir_intrinsic_instr *
+nir_instr_as_constant_ubo_load(nir_instr *inst)
+{
+        if (inst->type != nir_instr_type_intrinsic)
+                return NULL;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
+        if (intr->intrinsic != nir_intrinsic_load_ubo)
+                return NULL;
+
+        assert(nir_src_is_const(intr->src[0]));
+        if (!nir_src_is_const(intr->src[1]))
+                return NULL;
+
+        return intr;
+}
+
+static bool
+v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
+{
+        bool progress = false;
+
+        nir_instr *ref_inst = &ref->instr;
+        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
+        uint32_t ref_index = nir_src_as_uint(ref->src[0]);
+
+        /* Go through all instructions after ref searching for constant UBO
+         * loads for the same UBO index.
+         */
+        bool seq_break = false;
+        nir_instr *inst = &ref->instr;
+        nir_instr *next_inst = NULL;
+        while (true) {
+                inst = next_inst ? next_inst : nir_instr_next(inst);
+                if (!inst)
+                        break;
+
+                next_inst = NULL;
+
+                if (inst->type != nir_instr_type_intrinsic)
+                        continue;
+
+                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
+                if (intr->intrinsic != nir_intrinsic_load_ubo)
+                        continue;
+
+                /* We only produce unifa sequences for non-divergent loads */
+                if (nir_src_is_divergent(intr->src[1]))
+                        continue;
+
+                /* If there are any UBO loads that are not constant, or that
+                 * use a different UBO index, in between the reference load
+                 * and any other constant load for the same index, they would
+                 * break the unifa sequence. We flag that so we can then move
+                 * all constant UBO loads for the reference index before
+                 * these, and not just the ones that are not ordered, to
+                 * avoid breaking the sequence and reduce unifa writes.
+                 */
+                if (!nir_src_is_const(intr->src[1])) {
+                        seq_break = true;
+                        continue;
+                }
+                uint32_t offset = nir_src_as_uint(intr->src[1]);
+
+                assert(nir_src_is_const(intr->src[0]));
+                uint32_t index = nir_src_as_uint(intr->src[0]);
+                if (index != ref_index) {
+                        seq_break = true;
+                        continue;
+                }
+
+                /* Only move loads with an offset that is close enough to the
+                 * reference offset, since otherwise we would not be able to
+                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
+                 */
+                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
+                        continue;
+
+                /* We will move this load if its offset is smaller than ref's
+                 * (in which case we will move it before ref) or if the offset
+                 * is larger than ref's but there are sequence breakers in
+                 * between (in which case we will move it after ref and before
+                 * the sequence breakers).
+                 */
+                if (!seq_break && offset >= ref_offset)
+                        continue;
+
+                /* Find where exactly we want to move this load:
+                 *
+                 * If we are moving it before ref, we want to check any other
+                 * UBO loads we placed before ref and make sure we insert this
+                 * one properly ordered with them. Likewise, if we are moving
+                 * it after ref.
+                 */
+                nir_instr *pos = ref_inst;
+                nir_instr *tmp = pos;
+                do {
+                        if (offset < ref_offset)
+                                tmp = nir_instr_prev(tmp);
+                        else
+                                tmp = nir_instr_next(tmp);
+
+                        if (!tmp || tmp == inst)
+                                break;
+
+                        /* Ignore non-unifa UBO loads */
+                        if (tmp->type != nir_instr_type_intrinsic)
+                                continue;
+
+                        nir_intrinsic_instr *tmp_intr =
+                                nir_instr_as_intrinsic(tmp);
+                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
+                                continue;
+
+                        if (nir_src_is_divergent(tmp_intr->src[1]))
+                                continue;
+
+                        /* Stop if we find a unifa UBO load that breaks the
+                         * sequence.
+                         */
+                        if (!nir_src_is_const(tmp_intr->src[1]))
+                                break;
+
+                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
+                                break;
+
+                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
+                        if (offset < ref_offset) {
+                                if (tmp_offset < offset ||
+                                    tmp_offset >= ref_offset) {
+                                        break;
+                                } else {
+                                        pos = tmp;
+                                }
+                        } else {
+                                if (tmp_offset > offset ||
+                                    tmp_offset <= ref_offset) {
+                                        break;
+                                } else {
+                                        pos = tmp;
+                                }
+                        }
+                } while (true);
+
+                /* We can't move the UBO load before the instruction that
+                 * defines its constant offset. If that instruction is placed
+                 * in between the new location (pos) and the current location
+                 * of this load, we will have to move that instruction too.
+                 *
+                 * We don't care about the UBO index definition because that
+                 * is optimized to be reused by all UBO loads for the same
+                 * index and therefore is certain to be defined before the
+                 * first UBO load that uses it.
+                 */
+                nir_instr *offset_inst = NULL;
+                tmp = inst;
+                while ((tmp = nir_instr_prev(tmp)) != NULL) {
+                        if (pos == tmp) {
+                                /* We reached the target location without
+                                 * finding the instruction that defines the
+                                 * offset, so that instruction must be before
+                                 * the new position and we don't have to fix
+                                 * it.
+                                 */
+                                break;
+                        }
+                        if (intr->src[1].ssa->parent_instr == tmp) {
+                                offset_inst = tmp;
+                                break;
+                        }
+                }
+
+                if (offset_inst) {
+                        exec_node_remove(&offset_inst->node);
+                        exec_node_insert_node_before(&pos->node,
+                                                     &offset_inst->node);
+                }
+
+                /* Since we are moving the instruction before its current
+                 * location, grab its successor before the move so that
+                 * we can continue the next iteration of the main loop from
+                 * that instruction.
+                 */
+                next_inst = nir_instr_next(inst);
+
+                /* Move this load to the selected location */
+                exec_node_remove(&inst->node);
+                if (offset < ref_offset)
+                        exec_node_insert_node_before(&pos->node, &inst->node);
+                else
+                        exec_node_insert_after(&pos->node, &inst->node);
+
+                progress = true;
+        }
+
+        return progress;
+}
+
+static bool
+v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
+                                      nir_block *block)
+{
+        bool progress = false;
+        bool local_progress;
+        do {
+                local_progress = false;
+                nir_foreach_instr_safe(inst, block) {
+                        nir_intrinsic_instr *intr =
+                                nir_instr_as_constant_ubo_load(inst);
+                        if (intr) {
+                                local_progress |=
+                                        v3d_nir_sort_constant_ubo_load(block, intr);
+                        }
+                }
+                progress |= local_progress;
+        } while (local_progress);
+
+        return progress;
+}
+
+/**
+ * Sorts constant UBO loads in each block by offset to maximize chances of
+ * skipping unifa writes when converting to VIR. This can increase register
+ * pressure.
+ */
+static bool
+v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
+{
+        bool progress = false;
+        nir_foreach_function(function, s) {
+                if (function->impl) {
+                        nir_foreach_block(block, function->impl) {
+                                progress |=
+                                        v3d_nir_sort_constant_ubo_loads_block(c, block);
+                        }
+                        nir_metadata_preserve(function->impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                }
+        }
+        return progress;
+}
+
 static void
 v3d_attempt_compile(struct v3d_compile *c)
 {
@@ -1211,6 +1455,9 @@ v3d_attempt_compile(struct v3d_compile *c)
         };
         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
 
+        if (!c->disable_constant_ubo_load_sorting)
+                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
+
         v3d_nir_to_vir(c);
 }
 
@@ -1284,7 +1531,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 const char *name;
                 uint32_t min_threads_for_reg_alloc;
         } static const strategies[] = {
-                { "default", 1 },
+                { "default", 4 },
+                { "disable UBO load sorting", 1 },
                 { "disable TMU pipelining", 1 },
                 { "fallback scheduler", 1 }
         };
@@ -1294,8 +1542,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                      debug_output, debug_output_data,
                                      program_id, variant_id,
                                      strategies[i].min_threads_for_reg_alloc,
-                                     i > 0, /* Disable TMU pipelining */
-                                     i > 1 /* Fallback_scheduler */);
+                                     i > 0, /* Disable UBO load sorting */
+                                     i > 1, /* Disable TMU pipelining */
+                                     i > 2 /* Fallback_scheduler */);
 
                 v3d_attempt_compile(c);
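---

Note on the heuristic this pass feeds (not part of the patch): on V3D, a constant
UBO load becomes a unifa (uniform address) write followed by ldunifa reads, and
each ldunifa advances the address by 4 bytes, so ntq_emit_load_ubo_unifa can skip
the unifa write when the next load hits the same UBO at an offset at or slightly
ahead of the current address. The standalone toy model below illustrates why
sorting by offset helps. It is a sketch only: the struct, the helper names, the
single-component loads, and the TOY_MAX_UNIFA_SKIP_DISTANCE value of 16 are
assumptions made for illustration, not code taken from the driver.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_MAX_UNIFA_SKIP_DISTANCE 16 /* assumed value for this sketch */

struct toy_load {
        uint32_t index;  /* UBO index */
        uint32_t offset; /* constant byte offset into the UBO */
};

/* Count unifa writes for a block of single-component loads: a write is
 * skipped when the next load targets the same UBO at an offset at or
 * slightly ahead of where unifa currently points (each ldunifa advances
 * the address by 4 bytes).
 */
static uint32_t
count_unifa_writes(const struct toy_load *loads, int n)
{
        uint32_t writes = 0;
        bool have_addr = false;
        uint32_t cur_index = 0, cur_offset = 0;

        for (int i = 0; i < n; i++) {
                bool skip = have_addr &&
                            loads[i].index == cur_index &&
                            loads[i].offset >= cur_offset &&
                            loads[i].offset - cur_offset <=
                                    TOY_MAX_UNIFA_SKIP_DISTANCE;
                if (!skip)
                        writes++;
                have_addr = true;
                cur_index = loads[i].index;
                cur_offset = loads[i].offset + 4; /* one ldunifa per load */
        }
        return writes;
}

/* Order loads by UBO index, then by ascending offset, mirroring the order
 * the NIR pass tries to establish within a block.
 */
static int
cmp_load(const void *va, const void *vb)
{
        const struct toy_load *a = va, *b = vb;
        if (a->index != b->index)
                return a->index < b->index ? -1 : 1;
        return a->offset < b->offset ? -1 : (a->offset > b->offset);
}

int
main(void)
{
        struct toy_load loads[] = {
                { 0, 16 }, { 0, 0 }, { 0, 4 }, { 0, 20 }, { 0, 8 },
        };
        const int n = sizeof(loads) / sizeof(loads[0]);

        printf("unsorted: %u unifa writes\n", count_unifa_writes(loads, n));
        qsort(loads, n, sizeof(loads[0]), cmp_load);
        printf("sorted:   %u unifa writes\n", count_unifa_writes(loads, n));
        return 0;
}

With these offsets the unsorted order needs 3 unifa writes while the sorted
order needs only 1; that saving is what the pass trades against the extra
register pressure its doc comment warns about, which is why the new
"disable UBO load sorting" fallback strategy exists.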