diff --git a/src/intel/compiler/brw_analysis.h b/src/intel/compiler/brw_analysis.h index 2b423edf0d4..d4b63904d96 100644 --- a/src/intel/compiler/brw_analysis.h +++ b/src/intel/compiler/brw_analysis.h @@ -550,13 +550,20 @@ struct brw_performance { /** * Estimate of the throughput of the whole program in - * invocations-per-cycle units. + * invocations-per-cycle-per-EU units. * - * Note that this might be lower than the ratio between the dispatch - * width of the program and its latency estimate in cases where - * performance doesn't scale without limits as a function of its thread - * parallelism, e.g. due to the existence of a bottleneck in a shared - * function. + * This gives the expected throughput of a whole EU under the + * heuristic assumption that it is fully loaded instead of the + * throughput of a single thread; this is in order to be able to + * account for the reduction in parallelism that xe3+ EUs + * experience with increasing register use. Earlier platforms use + * a fixed factor as EU thread count instead. + * + * Note that this number might be lower than expected from the + * reciprocal of the latency estimate in cases where performance + * doesn't scale without limits as a function of its thread + * parallelism, e.g. due to the existence of a bottleneck in a + * shared function. 
*/ float throughput;
diff --git a/src/intel/compiler/brw_analysis_performance.cpp b/src/intel/compiler/brw_analysis_performance.cpp
index a114992adde..202c9b5516d 100644
--- a/src/intel/compiler/brw_analysis_performance.cpp
+++ b/src/intel/compiler/brw_analysis_performance.cpp
@@ -24,6 +24,7 @@
 #include "brw_eu.h"
 #include "brw_shader.h"
 #include "brw_cfg.h"
+#include <algorithm>
 
 namespace {
    /**
@@ -1000,6 +1001,40 @@ namespace {
       return 1.0 / busy;
    }
 
+   /**
+    * Calculate the number of threads of this program that can run
+    * concurrently in an EU based on the estimate of register pressure
+    * derived from liveness information (pre-RA) or on the actual
+    * number of GRFs used if available (post-RA). Platforms prior to
+    * xe3 don't support VRT so we can just return the constant value
+    * from device info.
+    */
+   unsigned
+   calculate_threads_per_eu(const brw_shader *s)
+   {
+      if (s->devinfo->ver >= 30) {
+         unsigned grf_used = s->grf_used;
+
+         if (!grf_used) {
+            /* A zero grf_used is treated as "no GRF count available":
+             * estimate it from the peak number of live registers.
+             */
+            const brw_register_pressure &rp = s->regpressure_analysis.require();
+            const unsigned max_regs_live = *std::max_element(rp.regs_live_at_ip,
+                                                             rp.regs_live_at_ip + s->cfg->total_instructions);
+            grf_used = DIV_ROUND_UP(max_regs_live, reg_unit(s->devinfo));
+         }
+
+         /* NOTE(review): the constants 32 and 3 are not explained here;
+          * presumably they describe the xe3 EU register file layout --
+          * confirm against ptl_register_blocks() and document them.
+          */
+         return 32 / MAX2(3, ptl_register_blocks(grf_used) + 1);
+      } else {
+         return s->devinfo->num_thread_per_eu;
+      }
+   }
+
    /**
     * Estimate the performance of the specified shader.
     */
@@ -1066,7 +1101,8 @@ namespace {
       }
 
       p.latency = elapsed;
-      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
+      p.throughput = dispatch_width * calculate_threads_per_eu(s) *
+                     calculate_thread_throughput(st, elapsed);
    }
 }
 