diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c
index 65cbf05c080..561581fb943 100644
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@@ -288,6 +288,9 @@ typedef struct jay_ra_state {
    /** Size of each register file */
    unsigned num_regs[JAY_NUM_RA_FILES];
 
+   /** Counter for round-robin register allocation */
+   unsigned roundrobin[JAY_NUM_RA_FILES];
+
    /** First GPR that may be used for EOT sends */
    unsigned eot_offs;
 
@@ -764,8 +767,23 @@ pick_regs(jay_ra_state *ra,
       ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity;
 
    assert(alignment >= size && "alignment must be a multiple of size");
 
-   for (unsigned r = first; r + size <= end; r += alignment) {
+   unsigned nr = DIV_ROUND_UP(end + 1 - size - first, alignment);
+   unsigned roundrobin = (ra->roundrobin[file]++) % nr;
+   unsigned rr_al = roundrobin * alignment, nr_al = nr * alignment;
+
+   /* We select registers round-robin. This has several benefits:
+    *
+    * 1. Easier coalescing, since we are statistically less likely to
+    *    allocate a register that a future instruction has an affinity for.
+    *
+    * 2. More freedom for post-RA scheduling thanks to fewer dependencies.
+    *
+    * 3. Less stalling due to SWSB annotations from register reuse.
+    */
+   for (unsigned i = rr_al; i < rr_al + nr_al; i += alignment) {
+      unsigned r = first + (i >= nr_al ? i - nr_al : i);
+      assert(r >= first && r + size <= end);
       unsigned cost = 0;
       bool tied = last_killed && last_killed->reg == r;
       enum jay_stride stride =
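
For review context (not part of the patch): below is a minimal standalone
sketch of the wraparound candidate order the new loop produces, under the
assumption that first/end/size/alignment carry the same meaning as in
pick_regs. The num_slots helper and the concrete numbers are hypothetical,
chosen only to make the rotation visible.

#include <assert.h>
#include <stdio.h>

/* Hypothetical helper: number of aligned start offsets r with
 * first <= r and r + size <= end.  This is the same quantity the
 * patch computes as DIV_ROUND_UP(end + 1 - size - first, alignment).
 */
static unsigned
num_slots(unsigned first, unsigned end, unsigned size, unsigned alignment)
{
   return (end + 1 - size - first + alignment - 1) / alignment;
}

int
main(void)
{
   const unsigned first = 0, end = 16, size = 2, alignment = 2;
   const unsigned nr = num_slots(first, end, size, alignment);
   unsigned counter = 3; /* stands in for ra->roundrobin[file]++ */

   /* Same iteration as the patched loop: start at the round-robin
    * slot, visit all nr aligned slots, wrapping back to first. */
   unsigned rr_al = (counter % nr) * alignment, nr_al = nr * alignment;
   for (unsigned i = rr_al; i < rr_al + nr_al; i += alignment) {
      unsigned r = first + (i >= nr_al ? i - nr_al : i);
      assert(r >= first && r + size <= end);
      printf("%u ", r);
   }
   printf("\n"); /* prints: 6 8 10 12 14 0 2 4 */
   return 0;
}

With these numbers every aligned slot is still visited exactly once per
call, just starting from a rotating offset, which is what lets consecutive
allocations spread across the file instead of reusing the lowest free
register each time.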