intel: Add INTEL_DEBUG=no-vrt
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Add support for disabling the VRT (Variable Register Thread) feature.
The strategy here is to force the old BRW_MAX_GRF limit for the
register allocator (which locks the upper limit) and to make sure
ptl_register_blocks() always returns that number of blocks (which
locks the lower limit).

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35781>
This commit is contained in:
Caio Oliveira 2025-06-26 19:36:24 -07:00 committed by Marge Bot
parent 636c37990a
commit 887642b0f2
8 changed files with 24 additions and 12 deletions

View file

@ -1413,7 +1413,7 @@ iris_init_render_context(struct iris_batch *batch)
#if GFX_VER >= 30
iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
cm.EnableVariableRegisterSizeAllocationMask = 1;
cm.EnableVariableRegisterSizeAllocation = true;
cm.EnableVariableRegisterSizeAllocation = !INTEL_DEBUG(DEBUG_NO_VRT);
}
#endif
@ -1549,7 +1549,7 @@ iris_init_compute_context(struct iris_batch *batch)
iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
#if GFX_VER >= 30
cm.EnableVariableRegisterSizeAllocationMask = 1;
cm.EnableVariableRegisterSizeAllocation = true;
cm.EnableVariableRegisterSizeAllocation = !INTEL_DEBUG(DEBUG_NO_VRT);
#endif
#if GFX_VER >= 20
cm.AsyncComputeThreadLimit = pixel_async_compute_thread_limit;

View file

@ -5,6 +5,8 @@
#include <stdlib.h>
#include "dev/intel_debug.h"
#include "intel_common.h"
#include "intel_engine.h"
@ -47,7 +49,7 @@ intel_compute_engine_async_threads_limit(const struct intel_device_info *devinfo
uint8_t pixel_async_compute_thread_limit = 2;
uint8_t z_pass_async_compute_thread_limit = 0;
uint8_t np_z_async_throttle_settings = 0;
bool has_vrt = devinfo->verx10 >= 300;
bool has_vrt = devinfo->verx10 >= 300 && !INTEL_DEBUG(DEBUG_NO_VRT);
/* When VRT is enabled async threads limits don't have effect */
if (!slm_or_barrier_enabled || has_vrt) {

View file

@ -253,6 +253,7 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
DEBUG_DO32,
DEBUG_SOFT64,
DEBUG_NO_SEND_GATHER,
DEBUG_NO_VRT,
};
for (uint32_t i = 0; i < ARRAY_SIZE(debug_bits); i++) {
insert_u64_bit(&config, INTEL_DEBUG(debug_bits[i]));
@ -385,3 +386,13 @@ brw_stage_prog_data_add_printf(struct brw_stage_prog_data *prog_data,
print->arg_sizes, sizeof(print->arg_sizes[0]) *print->num_args);
}
}
/*
 * Convert a count of GRF registers used into the GRF register block value
 * used on PTL+.
 *
 * When INTEL_DEBUG=no-vrt is set, grf_used is ignored and the result is
 * always (BRW_MAX_GRF / 32) - 1.
 *
 * Otherwise the count is rounded up to whole 32-register blocks and
 * returned zero-based (one block -> 0, two blocks -> 1, ...).  Results of
 * 6 and above all collapse to 7, so 6 itself is never returned.
 *
 * NOTE(review): assuming DIV_ROUND_UP(0, 32) evaluates to 0, grf_used == 0
 * wraps around in the unsigned subtraction and yields 7 -- confirm that
 * callers never pass 0.
 */
unsigned
ptl_register_blocks(unsigned grf_used)
{
   unsigned blocks;

   if (INTEL_DEBUG(DEBUG_NO_VRT)) {
      /* VRT disabled: report the fixed, BRW_MAX_GRF-sized allocation. */
      blocks = (BRW_MAX_GRF / 32) - 1;
   } else {
      blocks = DIV_ROUND_UP(grf_used, 32) - 1;

      /* Anything past the sixth encodable value maps to the maximum. */
      if (blocks >= 6)
         blocks = 7;
   }

   return blocks;
}

View file

@ -652,12 +652,7 @@ struct brw_stage_prog_data {
* Convert a number of GRF registers used (grf_used in prog_data) into
* a number of GRF register blocks supported by the hardware on PTL+.
*/
static inline unsigned
ptl_register_blocks(unsigned grf_used)
{
   /* Zero-based count of 32-register blocks needed to hold grf_used. */
   const unsigned block_index = DIV_ROUND_UP(grf_used, 32) - 1;

   /* Values 0..5 are returned as-is; everything else (including 6)
    * collapses to 7.
    */
   if (block_index < 6)
      return block_index;

   return 7;
}
unsigned ptl_register_blocks(unsigned grf_used);
static inline uint32_t *
brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data,

View file

@ -29,6 +29,7 @@
#include "brw_shader.h"
#include "brw_builder.h"
#include "brw_cfg.h"
#include "dev/intel_debug.h"
#include "util/set.h"
#include "util/register_allocate.h"
@ -80,7 +81,8 @@ extern "C" void
brw_alloc_reg_sets(struct brw_compiler *compiler)
{
const struct intel_device_info *devinfo = compiler->devinfo;
int base_reg_count = (devinfo->ver >= 30 ? XE3_MAX_GRF / reg_unit(devinfo) :
int base_reg_count = (devinfo->ver >= 30 && !INTEL_DEBUG(DEBUG_NO_VRT) ?
XE3_MAX_GRF / reg_unit(devinfo) :
BRW_MAX_GRF);
/* The registers used to make up almost all values handled in the compiler

View file

@ -121,6 +121,7 @@ static const struct debug_control_bitset debug_control[] = {
OPT1("shader-print", DEBUG_SHADER_PRINT),
OPT1("cl-quiet", DEBUG_CL_QUIET),
OPT1("no-send-gather", DEBUG_NO_SEND_GATHER),
OPT1("no-vrt", DEBUG_NO_VRT),
OPT1("shaders-lineno", DEBUG_SHADERS_LINENO),
OPT1("show_shader_stage", DEBUG_SHOW_SHADER_STAGE),
{ NULL, }

View file

@ -92,6 +92,7 @@ enum intel_debug_flag {
DEBUG_BVH_TLAS_IR_AS,
DEBUG_BVH_NO_BUILD,
DEBUG_NO_SEND_GATHER,
DEBUG_NO_VRT,
DEBUG_RT_NO_TRACE,
DEBUG_SHADERS_LINENO,
DEBUG_SHOW_SHADER_STAGE,

View file

@ -636,7 +636,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
#if GFX_VERx10 >= 125
anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
#if GFX_VER >= 30
cm.EnableVariableRegisterSizeAllocation = true;
cm.EnableVariableRegisterSizeAllocation = !INTEL_DEBUG(DEBUG_NO_VRT);
#endif
cm.Mask1 = 0xffff;
#if GFX_VERx10 >= 200
@ -774,7 +774,7 @@ init_compute_queue_state(struct anv_queue *queue)
anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
#if GFX_VER >= 30
cm.EnableVariableRegisterSizeAllocationMask = 1;
cm.EnableVariableRegisterSizeAllocation = true;
cm.EnableVariableRegisterSizeAllocation = !INTEL_DEBUG(DEBUG_NO_VRT);
#endif
#if GFX_VER >= 20
cm.AsyncComputeThreadLimit = ACTL_Max8;