diff --git a/src/intel/common/mi_builder.h b/src/intel/common/mi_builder.h
index f5ba0026766..7bb5dccc711 100644
--- a/src/intel/common/mi_builder.h
+++ b/src/intel/common/mi_builder.h
@@ -1164,7 +1164,7 @@ mi_store_address(struct mi_builder *b, struct mi_value addr_reg)
 }
 
 static inline void
-mi_self_mod_barrier(struct mi_builder *b)
+mi_self_mod_barrier(struct mi_builder *b, unsigned cs_prefetch_size)
 {
    /* First make sure all the memory writes from previous modifying commands
     * have landed. We want to do this before going through the CS cache,
@@ -1177,7 +1177,7 @@ mi_self_mod_barrier(struct mi_builder *b)
     * but experiment show it doesn't work properly, so for now just get over
     * the CS prefetch.
     */
-   for (uint32_t i = 0; i < (b->devinfo->cs_prefetch_size / 4); i++)
+   for (uint32_t i = 0; i < (cs_prefetch_size / 4); i++)
       mi_builder_emit(b, GENX(MI_NOOP), noop);
 }
 
diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c
index baf68d4acf1..1d555d534c1 100644
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -100,7 +100,6 @@ static const struct intel_device_info intel_device_info_gfx3 = {
    .max_eus_per_subslice = 8,
    .num_thread_per_eu = 4,
    .timestamp_frequency = 12500000,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_i965 = {
@@ -119,7 +118,6 @@ static const struct intel_device_info intel_device_info_i965 = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_g4x = {
@@ -141,7 +139,6 @@ static const struct intel_device_info intel_device_info_g4x = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_ilk = {
@@ -162,7 +159,6 @@ static const struct intel_device_info intel_device_info_ilk = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_snb_gt1 = {
@@ -193,7 +189,6 @@ static const struct intel_device_info intel_device_info_snb_gt1 = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_snb_gt2 = {
@@ -224,7 +219,6 @@ static const struct intel_device_info intel_device_info_snb_gt2 = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 #define GFX7_FEATURES                               \
@@ -236,8 +230,7 @@ static const struct intel_device_info intel_device_info_snb_gt2 = {
    .has_64bit_float = true,                         \
    .has_surface_tile_offset = true,                 \
    .timestamp_frequency = 12500000,                 \
-   .max_constant_urb_size_kb = 16,                  \
-   .cs_prefetch_size = 512
+   .max_constant_urb_size_kb = 16
 
 static const struct intel_device_info intel_device_info_ivb_gt1 = {
    GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
@@ -439,8 +432,7 @@ static const struct intel_device_info intel_device_info_hsw_gt3 = {
    .max_wm_threads = 384,                           \
    .max_threads_per_psd = 64,                       \
    .timestamp_frequency = 12500000,                 \
-   .max_constant_urb_size_kb = 32,                  \
-   .cs_prefetch_size = 512
+   .max_constant_urb_size_kb = 32
 
 static const struct intel_device_info intel_device_info_bdw_gt1 = {
    GFX8_FEATURES, .gt = 1,
@@ -550,7 +542,6 @@ static const struct intel_device_info intel_device_info_chv = {
    .max_threads_per_psd = 64,                       \
    .max_cs_threads = 56,                            \
    .timestamp_frequency = 12000000,                 \
-   .cs_prefetch_size = 512,                         \
    .urb = {                                         \
       .min_entries = {                              \
          [MESA_SHADER_VERTEX] = 64,                 \
@@ -835,8 +826,7 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = {
    .max_tcs_threads = 224,                          \
    .max_tes_threads = 364,                          \
    .max_threads_per_psd = 64,                       \
-   .max_cs_threads = 56,                            \
-   .cs_prefetch_size = 512
+   .max_cs_threads = 56
 
 #define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform) \
    GFX8_FEATURES,                                   \
@@ -971,8 +961,7 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = {
    .has_integer_dword_mul = false,                  \
    .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
    .simulator_id = 22,                              \
-   .max_eus_per_subslice = 16,                      \
-   .cs_prefetch_size = 512
+   .max_eus_per_subslice = 16
 
 #define dual_subslices(args...) { args, }
 
@@ -1061,8 +1050,7 @@ static const struct intel_device_info intel_device_info_sg1 = {
    .has_llc = false,                                \
    .has_local_mem = true,                           \
    .has_aux_map = false,                            \
-   .simulator_id = 29,                              \
-   .cs_prefetch_size = 1024
+   .simulator_id = 29
 
 #define DG2_FEATURES                                \
    /* (Sub)slice info comes from the kernel topology info */ \
@@ -1930,6 +1918,33 @@ init_max_scratch_ids(struct intel_device_info *devinfo)
    }
 }
 
+/**
+ * Returns the command streamer prefetch size to assume for the given engine
+ * class on this device: 512 before verx10 125, per-engine values on MTL
+ * (2048 render, 1024 compute, 512 otherwise), and 1024 on every other
+ * verx10 >= 125 platform.
+ */
+static unsigned
+intel_device_info_calc_engine_prefetch(const struct intel_device_info *devinfo,
+                                       enum drm_i915_gem_engine_class engine_class)
+{
+   if (devinfo->verx10 < 125)
+      return 512;
+
+   if (intel_device_info_is_mtl(devinfo)) {
+      switch (engine_class) {
+      case I915_ENGINE_CLASS_RENDER:
+         return 2048;
+      case I915_ENGINE_CLASS_COMPUTE:
+         return 1024;
+      default:
+         return 512;
+      }
+   }
+
+   return 1024;
+}
+
 bool
 intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
 {
@@ -2045,6 +2060,11 @@ intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
 
    init_max_scratch_ids(devinfo);
 
+   for (enum drm_i915_gem_engine_class engine = I915_ENGINE_CLASS_RENDER;
+        engine < ARRAY_SIZE(devinfo->engine_class_prefetch); engine++)
+      devinfo->engine_class_prefetch[engine] =
+            intel_device_info_calc_engine_prefetch(devinfo, engine);
+
    return true;
 }
 
diff --git a/src/intel/dev/intel_device_info.h b/src/intel/dev/intel_device_info.h
index 1239e999e82..994da2a49a2 100644
--- a/src/intel/dev/intel_device_info.h
+++ b/src/intel/dev/intel_device_info.h
@@ -28,6 +28,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#include "drm-uapi/i915_drm.h"
+
 #include "util/macros.h"
 #include "compiler/shader_enums.h"
 
@@ -349,7 +351,8 @@ struct intel_device_info
-    * Size of the command streamer prefetch. This is important to know for
-    * self modifying batches.
+    * Size of the command streamer prefetch for each engine class, indexed by
+    * enum drm_i915_gem_engine_class. This is important to know for self
+    * modifying batches.
     */
-   unsigned cs_prefetch_size;
+   unsigned engine_class_prefetch[I915_ENGINE_CLASS_COMPUTE + 1];
 
    /**
     * For the longest time the timestamp frequency for Gen's timestamp counter
diff --git a/src/intel/dev/intel_device_info_override_test.c b/src/intel/dev/intel_device_info_override_test.c
index b632f5ce2ba..e6b2b28c1aa 100644
--- a/src/intel/dev/intel_device_info_override_test.c
+++ b/src/intel/dev/intel_device_info_override_test.c
@@ -71,6 +71,7 @@ main(int argc, char *argv[])
       fprintf(stderr, "%u\n", devinfo.verx10);
       assert(devinfo.verx10 == verx10);
       verify_device_info(&devinfo);
+      assert(devinfo.engine_class_prefetch[I915_ENGINE_CLASS_RENDER] > 0);
    }
 
    return 0;
diff --git a/src/intel/dev/intel_device_info_test.h b/src/intel/dev/intel_device_info_test.h
index 7d0c82c624d..55a7aa4d603 100644
--- a/src/intel/dev/intel_device_info_test.h
+++ b/src/intel/dev/intel_device_info_test.h
@@ -12,7 +12,6 @@ verify_device_info(const struct intel_device_info *devinfo)
    assert(devinfo->max_eus_per_subslice != 0);
    assert(devinfo->num_thread_per_eu != 0);
    assert(devinfo->timestamp_frequency != 0);
-   assert(devinfo->cs_prefetch_size > 0);
 
    assert(devinfo->ver < 7 || devinfo->max_constant_urb_size_kb > 0);
    assert(devinfo->ver < 8 || devinfo->max_threads_per_psd > 0);
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 28e2b8a3aaa..44e2bab3f11 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -879,10 +879,10 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
        */
       if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {
          const struct intel_device_info *devinfo = cmd_buffer->device->info;
+         const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
          /* Careful to have everything in signed integer. */
-         int32_t prefetch_len = devinfo->cs_prefetch_size;
-         int32_t batch_len =
-            cmd_buffer->batch.next - cmd_buffer->batch.start;
+         int32_t prefetch_len = devinfo->engine_class_prefetch[engine_class];
+         int32_t batch_len = cmd_buffer->batch.next - cmd_buffer->batch.start;
 
          for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
             anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 219942b79d6..0541b52cc5c 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -994,7 +994,9 @@ void genX(CmdBeginQueryIndexedEXT)(
 
       assert(reloc_idx == pdevice->n_perf_query_commands);
 
-      mi_self_mod_barrier(&b);
+      const struct intel_device_info *devinfo = cmd_buffer->device->info;
+      const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+      mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
          pc.CommandStreamerStallEnable = true;
diff --git a/src/intel/vulkan_hasvk/anv_batch_chain.c b/src/intel/vulkan_hasvk/anv_batch_chain.c
index 459747e0a29..89a29f55dac 100644
--- a/src/intel/vulkan_hasvk/anv_batch_chain.c
+++ b/src/intel/vulkan_hasvk/anv_batch_chain.c
@@ -1019,10 +1019,10 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
        */
       if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {
          const struct intel_device_info *devinfo = cmd_buffer->device->info;
+         const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
          /* Careful to have everything in signed integer. */
-         int32_t prefetch_len = devinfo->cs_prefetch_size;
-         int32_t batch_len =
-            cmd_buffer->batch.next - cmd_buffer->batch.start;
+         int32_t prefetch_len = devinfo->engine_class_prefetch[engine_class];
+         int32_t batch_len = cmd_buffer->batch.next - cmd_buffer->batch.start;
 
          for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
             anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
diff --git a/src/intel/vulkan_hasvk/genX_query.c b/src/intel/vulkan_hasvk/genX_query.c
index 8c20e2cdfe1..5084ea82473 100644
--- a/src/intel/vulkan_hasvk/genX_query.c
+++ b/src/intel/vulkan_hasvk/genX_query.c
@@ -1015,7 +1015,9 @@ void genX(CmdBeginQueryIndexedEXT)(
 
       assert(reloc_idx == pdevice->n_perf_query_commands);
 
-      mi_self_mod_barrier(&b);
+      const struct intel_device_info *devinfo = cmd_buffer->device->info;
+      const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+      mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
          pc.CommandStreamerStallEnable = true;