diff --git a/src/intel/compiler/brw_analysis.h b/src/intel/compiler/brw_analysis.h
index 2b423edf0d4..d4b63904d96 100644
--- a/src/intel/compiler/brw_analysis.h
+++ b/src/intel/compiler/brw_analysis.h
@@ -550,13 +550,20 @@ struct brw_performance {
 
    /**
     * Estimate of the throughput of the whole program in
-    * invocations-per-cycle units.
+    * invocations-per-cycle-per-EU units.
     *
-    * Note that this might be lower than the ratio between the dispatch
-    * width of the program and its latency estimate in cases where
-    * performance doesn't scale without limits as a function of its thread
-    * parallelism, e.g. due to the existence of a bottleneck in a shared
-    * function.
+    * This gives the expected throughput of a whole EU under the
+    * heuristic assumption that it is fully loaded, rather than the
+    * throughput of a single thread.  This makes it possible to
+    * account for the reduction in parallelism that xe3+ EUs
+    * experience with increasing register use.  Earlier platforms use
+    * a fixed factor as EU thread count instead.
+    *
+    * Note that this number might be lower than expected from the
+    * reciprocal of the latency estimate in cases where performance
+    * doesn't scale without limits as a function of its thread
+    * parallelism, e.g. due to the existence of a bottleneck in a
+    * shared function.
     */
    float throughput;
 
diff --git a/src/intel/compiler/brw_analysis_performance.cpp b/src/intel/compiler/brw_analysis_performance.cpp
index a114992adde..202c9b5516d 100644
--- a/src/intel/compiler/brw_analysis_performance.cpp
+++ b/src/intel/compiler/brw_analysis_performance.cpp
@@ -24,6 +24,7 @@
 #include "brw_eu.h"
 #include "brw_shader.h"
 #include "brw_cfg.h"
+#include <algorithm>
 
 namespace {
    /**
@@ -1000,6 +1001,33 @@ namespace {
       return 1.0 / busy;
    }
 
+   /**
+    * Return how many threads of this program can be resident in an
+    * EU at once.  Platforms prior to xe3 don't support VRT, so the
+    * constant value from device info is returned for them.  On xe3+
+    * the result is derived from the actual number of GRFs used if
+    * available (post-RA), or otherwise from the peak register
+    * pressure estimated from liveness information (pre-RA).
+    */
+   unsigned
+   calculate_threads_per_eu(const brw_shader *s)
+   {
+      if (s->devinfo->ver < 30)
+         return s->devinfo->num_thread_per_eu;
+
+      unsigned grfs = s->grf_used;
+      if (grfs == 0) {
+         const brw_register_pressure &pressure =
+            s->regpressure_analysis.require();
+         const auto first = pressure.regs_live_at_ip;
+         const auto last = first + s->cfg->total_instructions;
+         const unsigned peak = *std::max_element(first, last);
+         grfs = DIV_ROUND_UP(peak, reg_unit(s->devinfo));
+      }
+
+      return 32 / MAX2(3, ptl_register_blocks(grfs) + 1);
+   }
+
    /**
     * Estimate the performance of the specified shader.
     */
@@ -1066,7 +1094,8 @@ namespace {
       }
 
       p.latency = elapsed;
-      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
+      p.throughput = dispatch_width * calculate_threads_per_eu(s) *
+                     calculate_thread_throughput(st, elapsed);
    }
 }