iris: Emit STATE_COMPUTE_MODE before COMPUTE_WALKER when new async compute limits are needed

Cc: stable
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35563>
This commit is contained in:
José Roberto de Souza 2025-06-16 09:57:27 -07:00 committed by Marge Bot
parent 59d361043e
commit 52bd6fae0e
2 changed files with 65 additions and 7 deletions

View file

@@ -1159,6 +1159,11 @@ struct iris_context {
struct pipe_resource *pixel_hashing_tables;
bool use_tbimr;
/* For compute engine only */
uint8_t pixel_async_compute_thread_limit;
uint8_t z_pass_async_compute_thread_limit;
uint8_t np_z_async_throttle_settings;
} state;
};

View file

@@ -117,6 +117,7 @@
#include "intel/common/intel_genX_state_elk.h"
#endif
#include "intel/common/intel_common.h"
#include "intel/common/intel_guardband.h"
#include "intel/common/intel_pixel_hash.h"
#include "intel/common/intel_tiled_render.h"
@@ -1535,25 +1536,35 @@ iris_init_compute_context(struct iris_batch *batch)
PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_FLUSH_HDC);
uint8_t pixel_async_compute_thread_limit, z_pass_async_compute_thread_limit,
np_z_async_throttle_settings;
intel_compute_engine_async_threads_limit(devinfo, 0, false,
&pixel_async_compute_thread_limit,
&z_pass_async_compute_thread_limit,
&np_z_async_throttle_settings);
batch->ice->state.pixel_async_compute_thread_limit = pixel_async_compute_thread_limit;
batch->ice->state.z_pass_async_compute_thread_limit = z_pass_async_compute_thread_limit;
batch->ice->state.np_z_async_throttle_settings = np_z_async_throttle_settings;
iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
#if GFX_VER >= 30
cm.EnableVariableRegisterSizeAllocationMask = 1;
cm.EnableVariableRegisterSizeAllocation = true;
#endif
#if GFX_VER >= 20
cm.AsyncComputeThreadLimit = ACTL_Max8;
cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit;
cm.AsyncComputeThreadLimit = pixel_async_compute_thread_limit;
cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit;
cm.ZAsyncThrottlesettings = np_z_async_throttle_settings;
cm.AsyncComputeThreadLimitMask = 0x7;
cm.ZPassAsyncComputeThreadLimitMask = 0x7;
cm.ZAsyncThrottlesettingsMask = 0x3;
#else
cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
cm.PixelAsyncComputeThreadLimit = pixel_async_compute_thread_limit;
cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit;
cm.PixelAsyncComputeThreadLimitMask = 0x7;
cm.ZPassAsyncComputeThreadLimitMask = 0x7;
if (intel_device_info_is_mtl_or_arl(devinfo)) {
cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit;
cm.ZAsyncThrottlesettings = np_z_async_throttle_settings;
cm.ZAsyncThrottlesettingsMask = 0x3;
}
#endif
@@ -9111,6 +9122,7 @@ iris_upload_compute_walker(struct iris_context *ice,
const struct iris_cs_data *cs_data = iris_cs_data(shader);
const struct intel_cs_dispatch_info dispatch =
iris_get_cs_dispatch_info(devinfo, shader, grid->block);
uint32_t total_shared = shader->total_shared + grid->variable_shared_mem;
trace_intel_begin_compute(&batch->trace);
@@ -9124,7 +9136,48 @@ iris_upload_compute_walker(struct iris_context *ice,
}
}
uint32_t total_shared = shader->total_shared + grid->variable_shared_mem;
/* Not needed with VRT enabled */
#if GFX_VERx10 < 300
uint8_t pixel_async_compute_thread_limit, z_pass_async_compute_thread_limit,
np_z_async_throttle_settings;
bool slm_or_barrier_enabled = total_shared != 0 || cs_data->uses_barrier;
intel_compute_engine_async_threads_limit(devinfo, dispatch.threads,
slm_or_barrier_enabled,
&pixel_async_compute_thread_limit,
&z_pass_async_compute_thread_limit,
&np_z_async_throttle_settings);
if (ice->state.pixel_async_compute_thread_limit != pixel_async_compute_thread_limit ||
ice->state.z_pass_async_compute_thread_limit != z_pass_async_compute_thread_limit ||
ice->state.np_z_async_throttle_settings != np_z_async_throttle_settings) {
batch->ice->state.pixel_async_compute_thread_limit = pixel_async_compute_thread_limit;
batch->ice->state.z_pass_async_compute_thread_limit = z_pass_async_compute_thread_limit;
batch->ice->state.np_z_async_throttle_settings = np_z_async_throttle_settings;
iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
#if GFX_VER >= 20
cm.AsyncComputeThreadLimit = pixel_async_compute_thread_limit;
cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit;
cm.ZAsyncThrottlesettings = np_z_async_throttle_settings;
cm.AsyncComputeThreadLimitMask = 0x7;
cm.ZPassAsyncComputeThreadLimitMask = 0x7;
cm.ZAsyncThrottlesettingsMask = 0x3;
#else
cm.PixelAsyncComputeThreadLimit = pixel_async_compute_thread_limit;
cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit;
cm.PixelAsyncComputeThreadLimitMask = 0x7;
cm.ZPassAsyncComputeThreadLimitMask = 0x7;
if (intel_device_info_is_mtl_or_arl(devinfo)) {
cm.ZAsyncThrottlesettings = np_z_async_throttle_settings;
cm.ZAsyncThrottlesettingsMask = 0x3;
}
#endif
}
}
#endif /* GFX_VERx10 < 300 */
struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
idd.KernelStartPointer =
KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);