diff --git a/src/intel/common/mi_builder.h b/src/intel/common/mi_builder.h
index f5ba0026766..7bb5dccc711 100644
--- a/src/intel/common/mi_builder.h
+++ b/src/intel/common/mi_builder.h
@@ -1164,7 +1164,7 @@ mi_store_address(struct mi_builder *b, struct mi_value addr_reg)
 }
 
 static inline void
-mi_self_mod_barrier(struct mi_builder *b)
+mi_self_mod_barrier(struct mi_builder *b, unsigned cs_prefetch_size)
 {
    /* First make sure all the memory writes from previous modifying commands
     * have landed. We want to do this before going through the CS cache,
@@ -1177,7 +1177,7 @@ mi_self_mod_barrier(struct mi_builder *b)
     * but experiment show it doesn't work properly, so for now just get over
     * the CS prefetch.
     */
-   for (uint32_t i = 0; i < (b->devinfo->cs_prefetch_size / 4); i++)
+   for (uint32_t i = 0; i < (cs_prefetch_size / 4); i++)
       mi_builder_emit(b, GENX(MI_NOOP), noop);
 }
 
diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c
index baf68d4acf1..1d555d534c1 100644
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -100,7 +100,6 @@ static const struct intel_device_info intel_device_info_gfx3 = {
    .max_eus_per_subslice = 8,
    .num_thread_per_eu = 4,
    .timestamp_frequency = 12500000,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_i965 = {
@@ -119,7 +118,6 @@ static const struct intel_device_info intel_device_info_i965 = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_g4x = {
@@ -141,7 +139,6 @@ static const struct intel_device_info intel_device_info_g4x = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_ilk = {
@@ -162,7 +159,6 @@ static const struct intel_device_info intel_device_info_ilk = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_snb_gt1 = {
@@ -193,7 +189,6 @@ static const struct intel_device_info intel_device_info_snb_gt1 = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 static const struct intel_device_info intel_device_info_snb_gt2 = {
@@ -224,7 +219,6 @@ static const struct intel_device_info intel_device_info_snb_gt2 = {
    },
    .timestamp_frequency = 12500000,
    .simulator_id = -1,
-   .cs_prefetch_size = 512,
 };
 
 #define GFX7_FEATURES                               \
@@ -236,8 +230,7 @@ static const struct intel_device_info intel_device_info_snb_gt2 = {
    .has_64bit_float = true,                         \
    .has_surface_tile_offset = true,                 \
    .timestamp_frequency = 12500000,                 \
-   .max_constant_urb_size_kb = 16,                  \
-   .cs_prefetch_size = 512
+   .max_constant_urb_size_kb = 16
 
 static const struct intel_device_info intel_device_info_ivb_gt1 = {
    GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
@@ -439,8 +432,7 @@ static const struct intel_device_info intel_device_info_hsw_gt3 = {
    .max_wm_threads = 384,                           \
    .max_threads_per_psd = 64,                       \
    .timestamp_frequency = 12500000,                 \
-   .max_constant_urb_size_kb = 32,                  \
-   .cs_prefetch_size = 512
+   .max_constant_urb_size_kb = 32
 
 static const struct intel_device_info intel_device_info_bdw_gt1 = {
    GFX8_FEATURES, .gt = 1,
@@ -550,7 +542,6 @@ static const struct intel_device_info intel_device_info_chv = {
    .max_threads_per_psd = 64,                       \
    .max_cs_threads = 56,                            \
    .timestamp_frequency = 12000000,                 \
-   .cs_prefetch_size = 512,                         \
    .urb = {                                         \
       .min_entries = {                              \
          [MESA_SHADER_VERTEX]    = 64,              \
@@ -835,8 +826,7 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = {
    .max_tcs_threads = 224,                          \
    .max_tes_threads = 364,                          \
    .max_threads_per_psd = 64,                       \
-   .max_cs_threads = 56,                            \
-   .cs_prefetch_size = 512
+   .max_cs_threads = 56
 
 #define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform)  \
    GFX8_FEATURES,                                     \
@@ -971,8 +961,7 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = {
    .has_integer_dword_mul = false,                              \
    .gt = _gt, .num_slices = _slices, .l3_banks = _l3,           \
    .simulator_id = 22,                                          \
-   .max_eus_per_subslice = 16,                                   \
-   .cs_prefetch_size = 512
+   .max_eus_per_subslice = 16
 
 #define dual_subslices(args...) { args, }
 
@@ -1061,8 +1050,7 @@ static const struct intel_device_info intel_device_info_sg1 = {
    .has_llc = false,                                            \
    .has_local_mem = true,                                       \
    .has_aux_map = false,                                        \
-   .simulator_id = 29,                                          \
-   .cs_prefetch_size = 1024
+   .simulator_id = 29
 
 #define DG2_FEATURES                                            \
    /* (Sub)slice info comes from the kernel topology info */    \
@@ -1930,6 +1918,27 @@ init_max_scratch_ids(struct intel_device_info *devinfo)
    }
 }
 
+/* Return the command streamer prefetch size for engine_class on devinfo. */
+static unsigned
+intel_device_info_calc_engine_prefetch(const struct intel_device_info *devinfo,
+                                       enum drm_i915_gem_engine_class engine_class)
+{
+   /* Below verx10 125 every engine class gets the same value. */
+   if (devinfo->verx10 < 125)
+      return 512;
+
+   /* From verx10 125 on, non-MTL devices also use one value for all classes. */
+   if (!intel_device_info_is_mtl(devinfo))
+      return 1024;
+
+   /* MTL: the value depends on the engine class. */
+   if (engine_class == I915_ENGINE_CLASS_RENDER)
+      return 2048;
+   if (engine_class == I915_ENGINE_CLASS_COMPUTE)
+      return 1024;
+   return 512;
+}
+
 bool
 intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
 {
@@ -2045,6 +2054,11 @@ intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
 
    init_max_scratch_ids(devinfo);
 
+   for (enum drm_i915_gem_engine_class engine = I915_ENGINE_CLASS_RENDER;
+        engine < ARRAY_SIZE(devinfo->engine_class_prefetch); engine++)
+      devinfo->engine_class_prefetch[engine] =
+            intel_device_info_calc_engine_prefetch(devinfo, engine);
+
    return true;
 }
 
diff --git a/src/intel/dev/intel_device_info.h b/src/intel/dev/intel_device_info.h
index 1239e999e82..994da2a49a2 100644
--- a/src/intel/dev/intel_device_info.h
+++ b/src/intel/dev/intel_device_info.h
@@ -28,6 +28,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#include "drm-uapi/i915_drm.h"
+
 #include "util/macros.h"
 #include "compiler/shader_enums.h"
 
@@ -349,7 +351,7 @@ struct intel_device_info
     * Size of the command streamer prefetch. This is important to know for
     * self modifying batches.
     */
-   unsigned cs_prefetch_size;
+   unsigned engine_class_prefetch[I915_ENGINE_CLASS_COMPUTE + 1];
 
    /**
     * For the longest time the timestamp frequency for Gen's timestamp counter
diff --git a/src/intel/dev/intel_device_info_override_test.c b/src/intel/dev/intel_device_info_override_test.c
index b632f5ce2ba..e6b2b28c1aa 100644
--- a/src/intel/dev/intel_device_info_override_test.c
+++ b/src/intel/dev/intel_device_info_override_test.c
@@ -71,6 +71,7 @@ main(int argc, char *argv[])
       fprintf(stderr, "%u\n", devinfo.verx10);
       assert(devinfo.verx10 == verx10);
       verify_device_info(&devinfo);
+      assert(devinfo.engine_class_prefetch[I915_ENGINE_CLASS_RENDER] > 0);
    }
 
    return 0;
diff --git a/src/intel/dev/intel_device_info_test.h b/src/intel/dev/intel_device_info_test.h
index 7d0c82c624d..55a7aa4d603 100644
--- a/src/intel/dev/intel_device_info_test.h
+++ b/src/intel/dev/intel_device_info_test.h
@@ -12,7 +12,6 @@ verify_device_info(const struct intel_device_info *devinfo)
    assert(devinfo->max_eus_per_subslice != 0);
    assert(devinfo->num_thread_per_eu != 0);
    assert(devinfo->timestamp_frequency != 0);
-   assert(devinfo->cs_prefetch_size > 0);
 
    assert(devinfo->ver < 7 || devinfo->max_constant_urb_size_kb > 0);
    assert(devinfo->ver < 8 || devinfo->max_threads_per_psd > 0);
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 28e2b8a3aaa..44e2bab3f11 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -879,10 +879,10 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
           */
          if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {
             const struct intel_device_info *devinfo = cmd_buffer->device->info;
+            const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
             /* Careful to have everything in signed integer. */
-            int32_t prefetch_len = devinfo->cs_prefetch_size;
-            int32_t batch_len =
-               cmd_buffer->batch.next - cmd_buffer->batch.start;
+            int32_t prefetch_len = devinfo->engine_class_prefetch[engine_class];
+            int32_t batch_len = cmd_buffer->batch.next - cmd_buffer->batch.start;
 
             for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
                anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 219942b79d6..0541b52cc5c 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -994,7 +994,9 @@ void genX(CmdBeginQueryIndexedEXT)(
 
       assert(reloc_idx == pdevice->n_perf_query_commands);
 
-      mi_self_mod_barrier(&b);
+      const struct intel_device_info *devinfo = cmd_buffer->device->info;
+      const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+      mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
          pc.CommandStreamerStallEnable = true;
diff --git a/src/intel/vulkan_hasvk/anv_batch_chain.c b/src/intel/vulkan_hasvk/anv_batch_chain.c
index 459747e0a29..89a29f55dac 100644
--- a/src/intel/vulkan_hasvk/anv_batch_chain.c
+++ b/src/intel/vulkan_hasvk/anv_batch_chain.c
@@ -1019,10 +1019,10 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
           */
          if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {
             const struct intel_device_info *devinfo = cmd_buffer->device->info;
+            const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
             /* Careful to have everything in signed integer. */
-            int32_t prefetch_len = devinfo->cs_prefetch_size;
-            int32_t batch_len =
-               cmd_buffer->batch.next - cmd_buffer->batch.start;
+            int32_t prefetch_len = devinfo->engine_class_prefetch[engine_class];
+            int batch_len = cmd_buffer->batch.next - cmd_buffer->batch.start;
 
             for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
                anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
diff --git a/src/intel/vulkan_hasvk/genX_query.c b/src/intel/vulkan_hasvk/genX_query.c
index 8c20e2cdfe1..5084ea82473 100644
--- a/src/intel/vulkan_hasvk/genX_query.c
+++ b/src/intel/vulkan_hasvk/genX_query.c
@@ -1015,7 +1015,9 @@ void genX(CmdBeginQueryIndexedEXT)(
 
       assert(reloc_idx == pdevice->n_perf_query_commands);
 
-      mi_self_mod_barrier(&b);
+      const struct intel_device_info *devinfo = cmd_buffer->device->info;
+      const enum drm_i915_gem_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+      mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
          pc.CommandStreamerStallEnable = true;