diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index a3e233fa03f..8e4c0013b2c 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -1159,6 +1159,11 @@ struct iris_context { struct pipe_resource *pixel_hashing_tables; bool use_tbimr; + + /* For compute engine only */ + uint8_t pixel_async_compute_thread_limit; + uint8_t z_pass_async_compute_thread_limit; + uint8_t np_z_async_throttle_settings; } state; }; diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index d201b10a89e..4a1f7bbdaba 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -117,6 +117,7 @@ #include "intel/common/intel_genX_state_elk.h" #endif +#include "intel/common/intel_common.h" #include "intel/common/intel_guardband.h" #include "intel/common/intel_pixel_hash.h" #include "intel/common/intel_tiled_render.h" @@ -1535,25 +1536,35 @@ iris_init_compute_context(struct iris_batch *batch) PIPE_CONTROL_INSTRUCTION_INVALIDATE | PIPE_CONTROL_FLUSH_HDC); + uint8_t pixel_async_compute_thread_limit, z_pass_async_compute_thread_limit, + np_z_async_throttle_settings; + intel_compute_engine_async_threads_limit(devinfo, 0, false, + &pixel_async_compute_thread_limit, + &z_pass_async_compute_thread_limit, + &np_z_async_throttle_settings); + batch->ice->state.pixel_async_compute_thread_limit = pixel_async_compute_thread_limit; + batch->ice->state.z_pass_async_compute_thread_limit = z_pass_async_compute_thread_limit; + batch->ice->state.np_z_async_throttle_settings = np_z_async_throttle_settings; + iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) { #if GFX_VER >= 30 cm.EnableVariableRegisterSizeAllocationMask = 1; cm.EnableVariableRegisterSizeAllocation = true; #endif #if GFX_VER >= 20 - cm.AsyncComputeThreadLimit = ACTL_Max8; - cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60; - cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit; + cm.AsyncComputeThreadLimit = pixel_async_compute_thread_limit; + cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit; + cm.ZAsyncThrottlesettings = np_z_async_throttle_settings; cm.AsyncComputeThreadLimitMask = 0x7; cm.ZPassAsyncComputeThreadLimitMask = 0x7; cm.ZAsyncThrottlesettingsMask = 0x3; #else - cm.PixelAsyncComputeThreadLimit = PACTL_Max24; - cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60; + cm.PixelAsyncComputeThreadLimit = pixel_async_compute_thread_limit; + cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit; cm.PixelAsyncComputeThreadLimitMask = 0x7; cm.ZPassAsyncComputeThreadLimitMask = 0x7; if (intel_device_info_is_mtl_or_arl(devinfo)) { - cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit; + cm.ZAsyncThrottlesettings = np_z_async_throttle_settings; cm.ZAsyncThrottlesettingsMask = 0x3; } #endif @@ -9111,6 +9122,7 @@ iris_upload_compute_walker(struct iris_context *ice, const struct iris_cs_data *cs_data = iris_cs_data(shader); const struct intel_cs_dispatch_info dispatch = iris_get_cs_dispatch_info(devinfo, shader, grid->block); + uint32_t total_shared = shader->total_shared + grid->variable_shared_mem; trace_intel_begin_compute(&batch->trace); @@ -9124,7 +9136,48 @@ iris_upload_compute_walker(struct iris_context *ice, } } - uint32_t total_shared = shader->total_shared + grid->variable_shared_mem; +/* Not need with VRT enabled */ +#if GFX_VERx10 < 300 + uint8_t pixel_async_compute_thread_limit, z_pass_async_compute_thread_limit, + np_z_async_throttle_settings; + bool slm_or_barrier_enabled = total_shared != 0 || cs_data->uses_barrier; + + intel_compute_engine_async_threads_limit(devinfo, dispatch.threads, + slm_or_barrier_enabled, + &pixel_async_compute_thread_limit, + &z_pass_async_compute_thread_limit, + &np_z_async_throttle_settings); + + if (ice->state.pixel_async_compute_thread_limit != pixel_async_compute_thread_limit || + ice->state.z_pass_async_compute_thread_limit != z_pass_async_compute_thread_limit || + ice->state.np_z_async_throttle_settings != np_z_async_throttle_settings) { + + batch->ice->state.pixel_async_compute_thread_limit = pixel_async_compute_thread_limit; + batch->ice->state.z_pass_async_compute_thread_limit = z_pass_async_compute_thread_limit; + batch->ice->state.np_z_async_throttle_settings = np_z_async_throttle_settings; + + iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) { +#if GFX_VER >= 20 + cm.AsyncComputeThreadLimit = pixel_async_compute_thread_limit; + cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit; + cm.ZAsyncThrottlesettings = np_z_async_throttle_settings; + cm.AsyncComputeThreadLimitMask = 0x7; + cm.ZPassAsyncComputeThreadLimitMask = 0x7; + cm.ZAsyncThrottlesettingsMask = 0x3; +#else + cm.PixelAsyncComputeThreadLimit = pixel_async_compute_thread_limit; + cm.ZPassAsyncComputeThreadLimit = z_pass_async_compute_thread_limit; + cm.PixelAsyncComputeThreadLimitMask = 0x7; + cm.ZPassAsyncComputeThreadLimitMask = 0x7; + if (intel_device_info_is_mtl_or_arl(devinfo)) { + cm.ZAsyncThrottlesettings = np_z_async_throttle_settings; + cm.ZAsyncThrottlesettingsMask = 0x3; + } +#endif + } + } +#endif /* GFX_VERx10 < 300 */ + struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {}; idd.KernelStartPointer = KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);