anv: implement generated (indexed) indirect draws

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15642>
This commit is contained in:
Lionel Landwerlin 2022-02-25 16:56:04 +02:00 committed by Marge Bot
parent 3596a8ea7a
commit c950fe97a0
15 changed files with 1605 additions and 23 deletions

View file

@ -7,6 +7,9 @@ Debugging
Here are a few debug environment variables specific to ANV:
:envvar:`ANV_ENABLE_GENERATED_INDIRECT_DRAWS`
If defined to ``0`` or ``false``, this will disable the generated
indirect draw optimization in ANV. This only affects Gfx11+.
:envvar:`ANV_ENABLE_PIPELINE_CACHE`
If defined to ``0`` or ``false``, this will disable pipeline
caching, forcing ANV to reparse and recompile any VkShaderModule
@ -272,3 +275,34 @@ checking for ``ANV_CMD_DIRTY_PIPELINE``. It should only do so if it
needs to know some value that is coming from the
``anv_graphics_pipeline`` object that is not available from
``anv_dynamic_state``.
Generated indirect draws optimization
-------------------------------------
Indirect draws have traditionally been implemented on Intel HW by
loading the indirect parameters from memory into HW registers using
the command streamer's ``MI_LOAD_REGISTER_MEM`` instruction before
dispatching a draw call to the 3D pipeline.
On recent products, the command streamer was found to be a performance
bottleneck, because it cannot dispatch draw calls fast enough to keep
the 3D pipeline busy.
The solution to this problem is to change the way we deal with
indirect draws. Instead of loading HW registers with values using the
command streamer, we generate the entire set of ``3DPRIMITIVE``
instructions using a shader. The generated instructions contain all of
the draw call parameters. This way the command streamer only executes
``3DPRIMITIVE`` instructions and doesn't do any data loading from
memory or touch HW registers, feeding the 3D pipeline as fast as it
can.
In ANV this is implemented using a side batch buffer. When ANV
encounters the first indirect draw, it emits a jump into the side
batch; for each indirect draw, the side batch contains a draw call
using a generation shader. We keep adding more generation draws into
the side batch until we have to stop due to the command buffer ending,
a secondary command buffer call, or a barrier containing the access
flag ``VK_ACCESS_INDIRECT_COMMAND_READ_BIT``. The side batch buffer
then jumps back right after the instruction from which it was called.
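As a rough illustration (condensed from the new
``genX_cmd_draw_generated_indirect.h`` below, not verbatim driver
code), the jump into the generation batch and the return to the main
batch look like this::

   /* First generated draw: the main batch jumps into the side batch and
    * remembers where to come back. */
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress =
         anv_batch_current_address(&cmd_buffer->generation_batch);
   }
   cmd_buffer->generation_return_addr =
      anv_batch_current_address(&cmd_buffer->batch);

   /* When the generation batch is flushed (end of command buffer,
    * secondary execution or an indirect-read barrier), it jumps back to
    * the main batch. */
   anv_batch_emit(&cmd_buffer->generation_batch, GENX(MI_BATCH_BUFFER_START), bbs) {
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress = cmd_buffer->generation_return_addr;
   }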

View file

@ -84,6 +84,8 @@ def define_tracepoints(args):
Arg(type='enum isl_format', name='src_fmt', var='src_fmt', c_format='%s', to_prim_type='isl_format_get_short_name({})'),
])
begin_end_tp('generate_draws')
begin_end_tp('draw',
tp_args=[Arg(type='uint32_t', var='count', c_format='%u')])
begin_end_tp('draw_multi',

View file

@ -193,6 +193,14 @@ anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
return VK_SUCCESS;
}
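/* Advance the batch pointer past `size` bytes without writing anything, for
* space that is filled by something other than the CPU (e.g. 3DPRIMITIVE
* packets written by the generated indirect draw shaders). Callers are
* expected to reserve the space first with anv_batch_emit_ensure_space().
*/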
void
anv_batch_advance(struct anv_batch *batch, uint32_t size)
{
assert(batch->next + size <= batch->end);
batch->next += size;
}
struct anv_address
anv_batch_address(struct anv_batch *batch, void *batch_location)
{

View file

@ -72,6 +72,7 @@ static const driOptionDescription anv_dri_options[] = {
DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(false)
DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false)
DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
@ -923,6 +924,12 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
if (debug_get_bool_option("ANV_QUEUE_THREAD_DISABLE", false))
device->has_exec_timeline = false;
device->generated_indirect_draws =
device->info.ver >= 11 &&
debug_get_bool_option("ANV_ENABLE_GENERATED_INDIRECT_DRAWS",
true);
unsigned st_idx = 0;
device->sync_syncobj_type = vk_drm_syncobj_get_type(fd);
@ -1104,6 +1111,8 @@ anv_init_dri_options(struct anv_instance *instance)
driQueryOptionf(&instance->dri_options, "lower_depth_range_rate");
instance->fp64_workaround_enabled =
driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
instance->generated_indirect_threshold =
driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
}
VkResult anv_CreateInstance(
@ -3660,6 +3669,8 @@ VkResult anv_CreateDevice(
anv_device_init_border_colors(device);
anv_device_init_generated_indirect_draws(device);
anv_device_perf_init(device);
anv_device_utrace_init(device);
@ -3747,6 +3758,8 @@ void anv_DestroyDevice(
anv_device_finish_rt_shaders(device);
anv_device_finish_generated_indirect_draws(device);
vk_pipeline_cache_destroy(device->internal_cache, NULL);
vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL);

View file

@ -0,0 +1,341 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "anv_private.h"
#include "compiler/brw_compiler.h"
#include "compiler/brw_nir.h"
#include "compiler/spirv/nir_spirv.h"
#include "dev/intel_debug.h"
#include "util/macros.h"
#include "anv_generated_indirect_draws.h"
#include "shaders/generated_draws_spv.h"
#include "shaders/generated_draws_count_spv.h"
/* This pass takes vulkan descriptor bindings 0 & 1 and turns them into global
* 64bit addresses. Binding 2 is left as a UBO that would normally be accessed
* through the binding table, but it is fully promoted to push constants.
*
* As a result we're not using the binding table at all, which is nice because
* the side command buffer we use for the generation shader does not interact
* with the binding table allocation.
*/
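/* For example, a load_vulkan_descriptor of binding 0 becomes a load_ubo of
* binding 2 at offsetof(struct anv_generate_indirect_params,
* indirect_data_addr): the "descriptor" is simply the 64bit global address
* stored in the push constant block.
*/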
static bool
lower_vulkan_descriptors_instr(nir_builder *b, nir_instr *instr, void *cb_data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor)
return false;
nir_instr *res_index_instr = intrin->src[0].ssa->parent_instr;
assert(res_index_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *res_index_intrin =
nir_instr_as_intrinsic(res_index_instr);
assert(res_index_intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
b->cursor = nir_after_instr(instr);
nir_ssa_def *desc_value = NULL;
switch (nir_intrinsic_binding(res_index_intrin)) {
case 0: {
desc_value =
nir_load_ubo(b, 1, 64,
nir_imm_int(b, 2),
nir_imm_int(b,
offsetof(struct anv_generate_indirect_params,
indirect_data_addr)),
.align_mul = 8,
.align_offset = 0,
.range_base = 0,
.range = ~0);
desc_value =
nir_vec4(b,
nir_unpack_64_2x32_split_x(b, desc_value),
nir_unpack_64_2x32_split_y(b, desc_value),
nir_imm_int(b, 0),
nir_imm_int(b, 0));
break;
}
case 1: {
desc_value =
nir_load_ubo(b, 1, 64,
nir_imm_int(b, 2),
nir_imm_int(b,
offsetof(struct anv_generate_indirect_params,
generated_cmds_addr)),
.align_mul = 8,
.align_offset = 0,
.range_base = 0,
.range = ~0);
desc_value =
nir_vec4(b,
nir_unpack_64_2x32_split_x(b, desc_value),
nir_unpack_64_2x32_split_y(b, desc_value),
nir_imm_int(b, 0),
nir_imm_int(b, 0));
break;
}
case 2:
desc_value =
nir_vec2(b,
nir_imm_int(b, 2),
nir_imm_int(b, 0));
break;
}
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc_value);
return true;
}
static bool
lower_vulkan_descriptors(nir_shader *shader)
{
return nir_shader_instructions_pass(shader,
lower_vulkan_descriptors_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
static struct anv_shader_bin *
compile_upload_spirv(struct anv_device *device,
const void *key,
uint32_t key_size,
const uint32_t *spirv_source,
uint32_t spirv_source_size,
uint32_t sends_count_expectation)
{
struct spirv_to_nir_options spirv_options = {
.caps = {
},
.ubo_addr_format = nir_address_format_32bit_index_offset,
.ssbo_addr_format = nir_address_format_64bit_global_32bit_offset,
.environment = NIR_SPIRV_VULKAN,
.create_library = false,
};
const nir_shader_compiler_options *nir_options =
device->physical->compiler->nir_options[MESA_SHADER_FRAGMENT];
nir_shader* nir =
spirv_to_nir(spirv_source, spirv_source_size,
NULL, 0, MESA_SHADER_FRAGMENT, "main",
&spirv_options, nir_options);
assert(nir != NULL);
nir->info.internal = true;
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
NIR_PASS_V(nir, nir_opt_deref);
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_cse);
NIR_PASS_V(nir, nir_opt_gcm, true);
NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
NIR_PASS_V(nir, nir_split_var_copies);
NIR_PASS_V(nir, nir_split_per_member_structs);
struct brw_compiler *compiler = device->physical->compiler;
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
NIR_PASS_V(nir, nir_propagate_invariant, false);
NIR_PASS_V(nir, nir_lower_input_attachments,
&(nir_input_attachment_options) {
.use_fragcoord_sysval = true,
.use_layer_id_sysval = true,
});
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
/* Do vectorizing here. For some reason, when trying to do it in the backend,
* this just isn't working.
*/
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo,
.callback = brw_nir_should_vectorize_mem,
.robust_modes = (nir_variable_mode)0,
};
NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);
NIR_PASS_V(nir, lower_vulkan_descriptors);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
nir_address_format_32bit_index_offset);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
nir_address_format_64bit_global_32bit_offset);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS_V(nir, nir_opt_dce);
struct brw_wm_prog_key wm_key;
memset(&wm_key, 0, sizeof(wm_key));
struct brw_wm_prog_data wm_prog_data = {
.base.nr_params = nir->num_uniforms / 4,
};
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, wm_prog_data.base.ubo_ranges);
struct brw_compile_stats stats[3];
struct brw_compile_fs_params params = {
.nir = nir,
.key = &wm_key,
.prog_data = &wm_prog_data,
.stats = stats,
.log_data = device,
.debug_flag = DEBUG_WM,
};
const unsigned *program = brw_compile_fs(compiler, nir, &params);
if (wm_prog_data.dispatch_8) {
assert(stats[0].spills == 0);
assert(stats[0].fills == 0);
assert(stats[0].sends == sends_count_expectation);
}
if (wm_prog_data.dispatch_16) {
assert(stats[1].spills == 0);
assert(stats[1].fills == 0);
assert(stats[1].sends == sends_count_expectation);
}
if (wm_prog_data.dispatch_32) {
assert(stats[2].spills == 0);
assert(stats[2].fills == 0);
assert(stats[2].sends == sends_count_expectation);
}
struct anv_pipeline_bind_map bind_map;
memset(&bind_map, 0, sizeof(bind_map));
struct anv_push_descriptor_info push_desc_info = {};
struct anv_shader_bin *kernel =
anv_device_upload_kernel(device,
device->internal_cache,
nir->info.stage,
key, key_size, program,
wm_prog_data.base.program_size,
&wm_prog_data.base, sizeof(wm_prog_data),
NULL, 0, NULL, &bind_map,
&push_desc_info);
ralloc_free(nir);
return kernel;
}
VkResult
anv_device_init_generated_indirect_draws(struct anv_device *device)
{
if (device->info->ver < 11)
return VK_SUCCESS;
const struct intel_l3_weights w =
intel_get_default_l3_weights(device->info,
true /* wants_dc_cache */,
false /* needs_slm */);
device->generated_draw_l3_config = intel_get_l3_config(device->info, w);
struct {
char name[40];
} indirect_draws_key = {
.name = "anv-generated-indirect-draws",
}, indirect_draws_count_key = {
.name = "anv-generated-indirect-draws-count",
};
device->generated_draw_kernel =
anv_device_search_for_kernel(device,
device->internal_cache,
&indirect_draws_key,
sizeof(indirect_draws_key),
NULL);
if (device->generated_draw_kernel == NULL) {
device->generated_draw_kernel =
compile_upload_spirv(device,
&indirect_draws_key,
sizeof(indirect_draws_key),
generated_draws_spv_source,
ARRAY_SIZE(generated_draws_spv_source),
10 /* 2 * (2 loads + 3 stores) */);
}
if (device->generated_draw_kernel == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* The cache already has a reference and it's not going anywhere so there
* is no need to hold a second reference.
*/
anv_shader_bin_unref(device, device->generated_draw_kernel);
device->generated_draw_count_kernel =
anv_device_search_for_kernel(device,
device->internal_cache,
&indirect_draws_count_key,
sizeof(indirect_draws_count_key),
NULL);
if (device->generated_draw_count_kernel == NULL) {
device->generated_draw_count_kernel =
compile_upload_spirv(device,
&indirect_draws_count_key,
sizeof(indirect_draws_count_key),
generated_draws_count_spv_source,
ARRAY_SIZE(generated_draws_count_spv_source),
11 /* 2 * (3 loads + 3 stores) */);
}
if (device->generated_draw_count_kernel == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* The cache already has a reference and it's not going anywhere so there
* is no need to hold a second reference.
*/
anv_shader_bin_unref(device, device->generated_draw_count_kernel);
return VK_SUCCESS;
}
void
anv_device_finish_generated_indirect_draws(struct anv_device *device)
{
}

View file

@ -0,0 +1,71 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef ANV_GENERATED_INDIRECT_DRAWS_H
#define ANV_GENERATED_INDIRECT_DRAWS_H
#include <stdint.h>
/* This needs to match generated_draws.glsl :
*
* layout(set = 0, binding = 2) uniform block
*/
struct anv_generated_indirect_draw_params {
uint32_t is_indexed;
uint32_t is_predicated;
uint32_t draw_base;
uint32_t draw_count;
uint32_t instance_multiplier;
uint32_t indirect_data_stride;
};
/* This needs to match generated_draws_count.glsl :
*
* layout(set = 0, binding = 2) uniform block
*/
struct anv_generated_indirect_draw_count_params {
uint32_t is_indexed;
uint32_t is_predicated;
uint32_t draw_base;
uint32_t item_count;
uint32_t draw_count;
uint32_t instance_multiplier;
uint32_t indirect_data_stride;
uint32_t end_addr_ldw;
uint32_t end_addr_udw;
};
struct anv_generate_indirect_params {
union {
struct anv_generated_indirect_draw_params draw;
struct anv_generated_indirect_draw_count_params draw_count;
};
/* Global address of binding 0 */
uint64_t indirect_data_addr;
/* Global address of binding 1 */
uint64_t generated_cmds_addr;
};
#endif /* ANV_GENERATED_INDIRECT_DRAWS_H */

View file

@ -31,7 +31,7 @@
#include "nir/nir_xfb_info.h"
#include "vulkan/util/vk_util.h"
#include "compiler/spirv/nir_spirv.h"
#include "float64_spv.h"
#include "shaders/float64_spv.h"
static bool
anv_shader_bin_serialize(struct vk_pipeline_cache_object *object,

View file

@ -1009,6 +1009,15 @@ struct anv_physical_device {
bool always_flush_cache;
/**
* True if the generated indirect draw optimization is turned on.
*
* This optimization is currently only available on Gfx11+ to avoid
* dealing with the annoying Gfx8/9 tracking of vertex buffer for the VF
* cache workaround.
*/
bool generated_indirect_draws;
struct {
uint32_t family_count;
struct anv_queue_family families[ANV_MAX_QUEUE_FAMILIES];
@ -1075,6 +1084,7 @@ struct anv_instance {
bool sample_mask_out_opengl_behaviour;
bool fp64_workaround_enabled;
float lower_depth_range_rate;
unsigned generated_indirect_threshold;
};
VkResult anv_init_wsi(struct anv_physical_device *physical_device);
@ -1241,6 +1251,15 @@ struct anv_device {
enum anv_rt_bvh_build_method bvh_build_method;
/** Draw generation shader
*
* Generates direct draw calls out of indirect parameters. Used to
* workaround slowness with indirect draw calls.
*/
struct anv_shader_bin *generated_draw_kernel;
struct anv_shader_bin *generated_draw_count_kernel;
const struct intel_l3_config *generated_draw_l3_config;
pthread_mutex_t mutex;
pthread_cond_t queue_submit;
@ -1462,6 +1481,7 @@ struct anv_batch {
void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords);
VkResult anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size);
void anv_batch_advance(struct anv_batch *batch, uint32_t size);
void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other);
struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location);
@ -2887,6 +2907,13 @@ void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer);
void anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer);
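/* Number of views enabled in the current view mask, or 1 when multiview is
* not in use.
*/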
static inline unsigned
anv_cmd_buffer_get_view_count(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
return MAX2(1, util_bitcount(gfx->view_mask));
}
enum anv_bo_sync_state {
/** Indicates that this is a new (or newly reset fence) */
ANV_BO_SYNC_STATE_RESET,
@ -4139,6 +4166,18 @@ struct anv_memcpy_state {
struct anv_vb_cache_range vb_dirty;
};
VkResult
anv_device_init_generated_indirect_draws(struct anv_device *device);
void
anv_device_finish_generated_indirect_draws(struct anv_device *device);
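/* The generated draw path is used only when supported by the physical device
* (Gfx11+ and not disabled with ANV_ENABLE_GENERATED_INDIRECT_DRAWS) and when
* the draw count reaches the generated_indirect_threshold drirc option
* (4 by default).
*/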
static inline bool anv_use_generated_draws(const struct anv_device *device,
uint32_t count)
{
return device->physical->generated_indirect_draws &&
count >= device->physical->instance->generated_indirect_threshold;
}
struct anv_utrace_flush_copy {
/* Needs to be the first field */
struct intel_ds_flush_data ds;

View file

@ -3413,6 +3413,11 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
}
#define GFX_HAS_GENERATED_CMDS (GFX_VER >= 11)
#if GFX_VER >= 11
#include "genX_cmd_draw_generated_indirect.h"
#endif
VkResult
genX(BeginCommandBuffer)(
VkCommandBuffer commandBuffer,
@ -3618,6 +3623,10 @@ genX(EndCommandBuffer)(
anv_measure_endcommandbuffer(cmd_buffer);
#if GFX_HAS_GENERATED_CMDS
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
#endif
/* We want every command buffer to start with the PMA fix in a known state,
* so we disable it at the end of the command buffer.
*/
@ -3657,6 +3666,10 @@ genX(CmdExecuteCommands)(
*/
genX(cmd_buffer_apply_pipe_flushes)(primary);
#if GFX_HAS_GENERATED_CMDS
genX(cmd_buffer_flush_generated_draws)(primary);
#endif
for (uint32_t i = 0; i < commandBufferCount; i++) {
ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
@ -3819,6 +3832,11 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
#if GFX_HAS_GENERATED_CMDS
if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
#endif
anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
}
@ -4442,9 +4460,24 @@ void genX(CmdDrawIndirect)(
drawCount);
trace_intel_begin_draw_indirect(&cmd_buffer->trace);
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer->device, drawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
anv_address_add(buffer->address, offset),
MAX2(stride, sizeof(VkDrawIndirectCommand)),
drawCount,
false /* indexed */);
} else {
emit_indirect_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
stride, drawCount, false /* indexed */);
}
#else
emit_indirect_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
stride, drawCount, false /* indexed */);
#endif
trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
}
@ -4468,9 +4501,24 @@ void genX(CmdDrawIndexedIndirect)(
drawCount);
trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer->device, drawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
anv_address_add(buffer->address, offset),
MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)),
drawCount,
true /* indexed */);
} else {
emit_indirect_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
stride, drawCount, true /* indexed */);
}
#else
emit_indirect_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
stride, drawCount, true /* indexed */);
#endif
trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
}
@ -4643,12 +4691,37 @@ void genX(CmdDrawIndirectCount)(
0);
trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
emit_indirect_count_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
MAX2(stride, sizeof(VkDrawIndirectCommand)),
anv_address_add(count_buffer->address, countBufferOffset),
struct anv_address indirect_data_address =
anv_address_add(buffer->address, offset);
struct anv_address count_address =
anv_address_add(count_buffer->address, countBufferOffset);
stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer->device, maxDrawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws_count)(
cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
false /* indexed */);
} else {
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
false /* indexed */);
}
#else
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
false /* indexed */);
#endif
trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
}
@ -4675,12 +4748,37 @@ void genX(CmdDrawIndexedIndirectCount)(
0);
trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
emit_indirect_count_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
MAX2(stride, sizeof(VkDrawIndirectCommand)),
anv_address_add(count_buffer->address, countBufferOffset),
struct anv_address indirect_data_address =
anv_address_add(buffer->address, offset);
struct anv_address count_address =
anv_address_add(count_buffer->address, countBufferOffset);
stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer->device, maxDrawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws_count)(
cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
true /* indexed */);
} else {
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
true /* indexed */);
}
#else
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
true /* indexed */);
#endif
trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);

View file

@ -0,0 +1,704 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef GENX_CMD_GENERATED_INDIRECT_DRAW_H
#define GENX_CMD_GENERATED_INDIRECT_DRAW_H
#include <assert.h>
#include <stdbool.h>
#include "util/macros.h"
#include "anv_private.h"
#include "anv_generated_indirect_draws.h"
#if GFX_VER < 11
#error "Generated draws optimization not supported prior to Gfx11"
#endif
/* This is the maximum number of items a fragment shader can generate, due to
* the viewport size.
*/
#define MAX_GENERATED_DRAW_COUNT (8192 * 8192)
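/* The generation shader is dispatched as a RECTLIST covering
* MIN2(item_count, 8192) x DIV_ROUND_UP(item_count, 8192) pixels. Each
* fragment derives its item from gl_FragCoord (item_idx = y * 8192 + x) and
* writes one 3DPRIMITIVE. For example, 20000 draws use an 8192x3 rectangle.
*/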
static void
genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_batch *batch = &cmd_buffer->generation_batch;
struct anv_device *device = cmd_buffer->device;
const struct anv_shader_bin *draw_kernel = device->generated_draw_kernel;
const struct brw_wm_prog_data *prog_data =
brw_wm_prog_data_const(draw_kernel->prog_data);
uint32_t *dw = anv_batch_emitn(batch,
1 + 2 * GENX(VERTEX_ELEMENT_STATE_length),
GENX(3DSTATE_VERTEX_ELEMENTS));
/* You might think there is some shady stuff going on here and you would be
* right. We're setting up 2 VERTEX_ELEMENT_STATE yet we're only providing
* 1 (positions) VERTEX_BUFFER_STATE later.
*
* You can find more about how to set up a 3D pipeline with a fragment shader
* but without a vertex shader in blorp_emit_vertex_elements() in
* blorp_genX_exec.h.
*/
GENX(VERTEX_ELEMENT_STATE_pack)(
batch, dw + 1, &(struct GENX(VERTEX_ELEMENT_STATE)) {
.VertexBufferIndex = 1,
.Valid = true,
.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
.SourceElementOffset = 0,
.Component0Control = VFCOMP_STORE_SRC,
.Component1Control = VFCOMP_STORE_0,
.Component2Control = VFCOMP_STORE_0,
.Component3Control = VFCOMP_STORE_0,
});
GENX(VERTEX_ELEMENT_STATE_pack)(
batch, dw + 3, &(struct GENX(VERTEX_ELEMENT_STATE)) {
.VertexBufferIndex = 0,
.Valid = true,
.SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
.SourceElementOffset = 0,
.Component0Control = VFCOMP_STORE_SRC,
.Component1Control = VFCOMP_STORE_SRC,
.Component2Control = VFCOMP_STORE_SRC,
.Component3Control = VFCOMP_STORE_1_FP,
});
anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf);
anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
sgvs.InstanceIDEnable = true;
sgvs.InstanceIDComponentNumber = COMP_1;
sgvs.InstanceIDElementOffset = 0;
}
anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
vfi.InstancingEnable = false;
vfi.VertexElementIndex = 0;
}
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
vfi.InstancingEnable = false;
vfi.VertexElementIndex = 1;
}
anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
}
/* Emit URB setup. We tell it that the VS is active because we want it to
* allocate space for the VS. Even though one isn't run, we need VUEs to
* store the data that VF is going to pass to SOL.
*/
const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
genX(emit_l3_config)(batch, device, device->generated_draw_l3_config);
cmd_buffer->state.current_l3_config = device->generated_draw_l3_config;
genX(emit_urb_setup)(device, batch, device->generated_draw_l3_config,
VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
entry_size, NULL);
anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
ps_blend.HasWriteableRT = true;
}
anv_batch_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm);
#if GFX_VER >= 12
anv_batch_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
db.DepthBoundsTestEnable = false;
db.DepthBoundsTestMinValue = 0.0;
db.DepthBoundsTestMaxValue = 1.0;
}
#endif
anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
sm.SampleMask = 0x1;
}
anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
anv_batch_emit(batch, GENX(3DSTATE_TE), te);
anv_batch_emit(batch, GENX(3DSTATE_DS), DS);
anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
anv_batch_emit(batch, GENX(3DSTATE_CLIP), clip) {
clip.PerspectiveDivideDisable = true;
}
anv_batch_emit(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
sf.DerefBlockSize = INTEL_URB_DEREF_BLOCK_SIZE_32; // TODO
#endif
}
anv_batch_emit(batch, GENX(3DSTATE_RASTER), raster) {
raster.CullMode = CULLMODE_NONE;
}
anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
sbe.VertexURBEntryReadOffset = 1;
sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
sbe.VertexURBEntryReadLength = MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
sbe.ForceVertexURBEntryReadLength = true;
sbe.ForceVertexURBEntryReadOffset = true;
for (unsigned i = 0; i < 32; i++)
sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
}
anv_batch_emit(batch, GENX(3DSTATE_WM), wm) {
//wm.ForceThreadDispatchEnable = ForceON;
}
anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
psx.PixelShaderValid = true;
psx.AttributeEnable = prog_data->num_varying_inputs > 0;
psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
psx.PixelShaderComputesStencil = prog_data->computed_stencil;
}
anv_batch_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
struct anv_state cc_state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * GENX(CC_VIEWPORT_length), 32);
struct GENX(CC_VIEWPORT) cc_viewport = {
.MinimumDepth = 0.0f,
.MaximumDepth = 1.0f,
};
GENX(CC_VIEWPORT_pack)(NULL, cc_state.map, &cc_viewport);
cc.CCViewportPointer = cc_state.offset;
}
#if GFX_VER >= 12
/* Disable Primitive Replication. */
anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif
anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc);
anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_HS), alloc);
anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_DS), alloc);
anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_GS), alloc);
anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
alloc.ConstantBufferOffset = 0;
alloc.ConstantBufferSize = cmd_buffer->device->info->max_constant_urb_size_kb;
}
#if GFX_VERx10 == 125
/* DG2: Wa_22011440098
* MTL: Wa_18022330953
*
* In 3D mode, after programming push constant alloc command immediately
* program push constant command(ZERO length) without any commit between
* them.
*/
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
/* Update empty push constants for all stages (bitmask = 11111b) */
c.ShaderUpdateEnable = 0x1f;
c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
}
#endif
cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0) | BITFIELD_BIT(1);
cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER |
ANV_CMD_DIRTY_XFB_ENABLE);
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
cmd_buffer->state.gfx.push_constant_stages = VK_SHADER_STAGE_FRAGMENT_BIT;
vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state);
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"after generation batch BTI change");
}
static void
genX(cmd_buffer_emit_generate_draws_vertex)(struct anv_cmd_buffer *cmd_buffer,
uint32_t draw_count)
{
struct anv_batch *batch = &cmd_buffer->generation_batch;
struct anv_state vs_data_state =
anv_cmd_buffer_alloc_dynamic_state(
cmd_buffer, 9 * sizeof(uint32_t), 32);
float x0 = 0.0f, x1 = MIN2(draw_count, 8192);
float y0 = 0.0f, y1 = DIV_ROUND_UP(draw_count, 8192);
float z = 0.0f;
float *vertices = vs_data_state.map;
vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */
vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */
vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */
uint32_t *dw = anv_batch_emitn(batch,
1 + GENX(VERTEX_BUFFER_STATE_length),
GENX(3DSTATE_VERTEX_BUFFERS));
GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
&(struct GENX(VERTEX_BUFFER_STATE)) {
.VertexBufferIndex = 0,
.AddressModifyEnable = true,
.BufferStartingAddress = (struct anv_address) {
.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
.offset = vs_data_state.offset,
},
.BufferPitch = 3 * sizeof(float),
.BufferSize = 9 * sizeof(float),
.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
#if GFX_VER >= 12
.L3BypassDisable = true,
#endif
});
}
static struct anv_state
genX(cmd_buffer_alloc_generated_push_data)(struct anv_cmd_buffer *cmd_buffer)
{
return anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
sizeof(struct anv_generate_indirect_params),
ANV_UBO_ALIGNMENT);
}
static struct anv_state
genX(cmd_buffer_emit_generated_push_data)(struct anv_cmd_buffer *cmd_buffer,
struct anv_state push_data_state)
{
struct anv_batch *batch = &cmd_buffer->generation_batch;
struct anv_address push_data_addr = anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool, push_data_state);
#if GFX_VER >= 12
const uint32_t num_dwords = GENX(3DSTATE_CONSTANT_ALL_length) +
GENX(3DSTATE_CONSTANT_ALL_DATA_length);
uint32_t *dw =
anv_batch_emitn(batch, num_dwords,
GENX(3DSTATE_CONSTANT_ALL),
.ShaderUpdateEnable = BITFIELD_BIT(MESA_SHADER_FRAGMENT),
.PointerBufferMask = 0x1,
.MOCS = anv_mocs(cmd_buffer->device, NULL, 0));
GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
batch, dw + GENX(3DSTATE_CONSTANT_ALL_length),
&(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
.PointerToConstantBuffer = push_data_addr,
.ConstantBufferReadLength = DIV_ROUND_UP(push_data_state.alloc_size, 32),
});
#else
anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) {
c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
c.ConstantBody.ReadLength[0] = DIV_ROUND_UP(push_data_state.alloc_size, 32);
c.ConstantBody.Buffer[0] = push_data_addr;
}
#endif
return push_data_state;
}
static void
genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address generated_cmds_addr,
uint32_t generated_cmds_size,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
uint32_t item_base,
uint32_t item_count,
bool indexed)
{
struct anv_device *device = cmd_buffer->device;
struct anv_batch *batch = &cmd_buffer->generation_batch;
const struct anv_shader_bin *draw_kernel = device->generated_draw_kernel;
const struct brw_wm_prog_data *prog_data =
brw_wm_prog_data_const(draw_kernel->prog_data);
anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
ps.BindingTableEntryCount = 2;
ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
prog_data->base.ubo_ranges[0].length;
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
ps._32PixelDispatchEnable = prog_data->dispatch_32;
ps.DispatchGRFStartRegisterForConstantSetupData0 =
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
ps.DispatchGRFStartRegisterForConstantSetupData1 =
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
ps.DispatchGRFStartRegisterForConstantSetupData2 =
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
ps.KernelStartPointer0 = draw_kernel->kernel.offset +
brw_wm_prog_data_prog_offset(prog_data, ps, 0);
ps.KernelStartPointer1 = draw_kernel->kernel.offset +
brw_wm_prog_data_prog_offset(prog_data, ps, 1);
ps.KernelStartPointer2 = draw_kernel->kernel.offset +
brw_wm_prog_data_prog_offset(prog_data, ps, 2);
ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1;
}
genX(cmd_buffer_emit_generate_draws_vertex)(cmd_buffer, item_count);
struct anv_state push_data_state =
genX(cmd_buffer_alloc_generated_push_data)(cmd_buffer);
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct anv_generate_indirect_params *push_data = push_data_state.map;
*push_data = (struct anv_generate_indirect_params) {
.draw = {
.is_indexed = indexed,
.is_predicated = cmd_buffer->state.conditional_render_enabled,
.draw_base = item_base,
.draw_count = item_count,
.instance_multiplier = pipeline->instance_multiplier,
.indirect_data_stride = indirect_data_stride,
},
.indirect_data_addr = anv_address_physical(indirect_data_addr),
.generated_cmds_addr = anv_address_physical(generated_cmds_addr),
};
genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state);
anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
prim.VertexCountPerInstance = 3;
prim.InstanceCount = 1;
}
}
static void
genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VER >= 12
anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
arb.PreParserDisableMask = true;
arb.PreParserDisable = true;
}
#endif
anv_batch_emit_ensure_space(&cmd_buffer->generation_batch, 4);
trace_intel_begin_generate_draws(&cmd_buffer->trace);
anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
bbs.AddressSpaceIndicator = ASI_PPGTT;
bbs.BatchBufferStartAddress =
anv_batch_current_address(&cmd_buffer->generation_batch);
}
cmd_buffer->generation_return_addr = anv_batch_current_address(&cmd_buffer->batch);
trace_intel_end_generate_draws(&cmd_buffer->trace);
genX(cmd_buffer_emit_generate_draws_pipeline)(cmd_buffer);
}
static void
genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
uint32_t draw_count,
bool indexed)
{
genX(flush_pipeline_select_3d)(cmd_buffer);
/* Apply the pipeline flush here so the indirect data is available for the
* generation shader.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
if (anv_address_is_null(cmd_buffer->generation_return_addr))
genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);
/* In order to have the vertex fetch gather the data we need to have a
* non-zero stride. It's possible for the application to provide a 0 stride
* when draw_count is 1, but we need a correct value for
* VERTEX_BUFFER_STATE::BufferPitch, so ensure the caller sets this
* correctly:
*
* Vulkan spec, vkCmdDrawIndirect:
*
* "If drawCount is less than or equal to one, stride is ignored."
*/
assert(indirect_data_stride > 0);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
/* Emit the 3D state in the main batch. */
genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
const uint32_t draw_cmd_stride = 4 * GENX(3DPRIMITIVE_EXTENDED_length);
uint32_t item_base = 0;
while (item_base < draw_count) {
const uint32_t item_count = MIN2(draw_count - item_base,
MAX_GENERATED_DRAW_COUNT);
const uint32_t draw_cmd_size = item_count * draw_cmd_stride;
/* Ensure we have enough contiguous space for all the draws so that the
* generation shader can edit all the 3DPRIMITIVEs from a single base
* address.
*
* TODO: we might have to split that if the amount of space is too large
* (at 1MB?).
*/
VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch,
draw_cmd_size);
if (result != VK_SUCCESS)
return;
genX(cmd_buffer_emit_generate_draws)(
cmd_buffer,
anv_batch_current_address(&cmd_buffer->batch),
draw_cmd_size,
indirect_data_addr,
indirect_data_stride,
item_base,
item_count,
indexed);
anv_batch_advance(&cmd_buffer->batch, draw_cmd_size);
item_base += item_count;
}
}
static void
genX(cmd_buffer_emit_generate_draws_count)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address generated_cmds_addr,
uint32_t generated_cmds_size,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
uint32_t item_base,
uint32_t item_count,
struct anv_address count_addr,
bool indexed)
{
struct anv_device *device = cmd_buffer->device;
struct anv_batch *batch = &cmd_buffer->generation_batch;
const struct anv_shader_bin *draw_kernel =
device->generated_draw_count_kernel;
const struct brw_wm_prog_data *prog_data =
brw_wm_prog_data_const(draw_kernel->prog_data);
anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
ps.BindingTableEntryCount = 2;
ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
prog_data->base.ubo_ranges[0].length;
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
ps._32PixelDispatchEnable = prog_data->dispatch_32;
ps.DispatchGRFStartRegisterForConstantSetupData0 =
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
ps.DispatchGRFStartRegisterForConstantSetupData1 =
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
ps.DispatchGRFStartRegisterForConstantSetupData2 =
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
ps.KernelStartPointer0 = draw_kernel->kernel.offset +
brw_wm_prog_data_prog_offset(prog_data, ps, 0);
ps.KernelStartPointer1 = draw_kernel->kernel.offset +
brw_wm_prog_data_prog_offset(prog_data, ps, 1);
ps.KernelStartPointer2 = draw_kernel->kernel.offset +
brw_wm_prog_data_prog_offset(prog_data, ps, 2);
ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1;
}
genX(cmd_buffer_emit_generate_draws_vertex)(cmd_buffer, item_count);
struct anv_state push_data_state =
genX(cmd_buffer_alloc_generated_push_data)(cmd_buffer);
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
uint64_t end_cmd_addr =
anv_address_physical(
anv_address_add(generated_cmds_addr, generated_cmds_size));
struct anv_generate_indirect_params *push_data = push_data_state.map;
*push_data = (struct anv_generate_indirect_params) {
.draw_count = {
.is_indexed = indexed,
.is_predicated = cmd_buffer->state.conditional_render_enabled,
.draw_base = item_base,
.item_count = item_count,
.draw_count = 0, // Edited below by the command streamer (mi_memcpy)
.instance_multiplier = pipeline->instance_multiplier,
.indirect_data_stride = indirect_data_stride,
.end_addr_ldw = end_cmd_addr & 0xffffffff,
.end_addr_udw = end_cmd_addr >> 32,
},
.indirect_data_addr = anv_address_physical(indirect_data_addr),
.generated_cmds_addr = anv_address_physical(generated_cmds_addr),
};
/* Copy the draw count into the push constants so that the generation shader
* gets the value straight away and doesn't even need to access memory.
*/
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, batch);
mi_memcpy(&b,
anv_address_add((struct anv_address) {
.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
.offset = push_data_state.offset,
},
offsetof(struct anv_generate_indirect_params, draw_count.draw_count)),
count_addr, 4);
/* Only emit the data after the memcpy above. */
genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state);
anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
prim.VertexCountPerInstance = 3;
prim.InstanceCount = 1;
}
}
static void
genX(cmd_buffer_emit_indirect_generated_draws_count)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
struct anv_address count_addr,
uint32_t max_draw_count,
bool indexed)
{
genX(flush_pipeline_select_3d)(cmd_buffer);
/* Apply the pipeline flush here so the indirect data is available for the
* generation shader.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
if (anv_address_is_null(cmd_buffer->generation_return_addr))
genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);
/* In order to have the vertex fetch gather the data we need to have a
* non-zero stride. It's possible for the application to provide a 0 stride
* when draw_count is 1, but we need a correct value for
* VERTEX_BUFFER_STATE::BufferPitch, so ensure the caller sets this
* correctly:
*
* Vulkan spec, vkCmdDrawIndirect:
*
* "If drawCount is less than or equal to one, stride is ignored."
*/
assert(indirect_data_stride > 0);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
/* Emit the 3D state in the main batch. */
genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
const uint32_t draw_cmd_stride = 4 * GENX(3DPRIMITIVE_EXTENDED_length);
uint32_t item_base = 0;
while (item_base < max_draw_count) {
const uint32_t item_count = MIN2(max_draw_count - item_base,
MAX_GENERATED_DRAW_COUNT);
const uint32_t draw_cmd_size = item_count * draw_cmd_stride;
/* Ensure we have enough contiguous space for all the draws so that the
* generation shader can edit all the 3DPRIMITIVEs from a single base
* address.
*
* TODO: we might have to split that if the amount of space is too large
* (at 1MB?).
*/
VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch,
draw_cmd_size);
if (result != VK_SUCCESS)
return;
genX(cmd_buffer_emit_generate_draws_count)(
cmd_buffer,
anv_batch_current_address(&cmd_buffer->batch),
draw_cmd_size,
anv_address_add(indirect_data_addr,
item_base * indirect_data_stride),
indirect_data_stride,
item_base,
item_count,
count_addr,
indexed);
anv_batch_advance(&cmd_buffer->batch, draw_cmd_size);
item_base += item_count;
}
}
static void
genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
{
/* No return address setup means we don't have to do anything */
if (anv_address_is_null(cmd_buffer->generation_return_addr))
return;
struct anv_batch *batch = &cmd_buffer->generation_batch;
/* Wait for all the generation shaders to finish writing the commands. */
genX(emit_apply_pipe_flushes)(batch,
cmd_buffer->device,
_3D,
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_CS_STALL_BIT);
#if GFX_VER >= 12
anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
arb.PreParserDisableMask = true;
arb.PreParserDisable = false;
}
#endif
#if GFX_VER < 12
/* Prior to Gfx12 we cannot disable the CS prefetch, so we have to emit a
* bunch of NOOPs to ensure we do not have generated commands loaded into
* the CS cache prior to them having been generated.
*/
const struct intel_device_info *devinfo = cmd_buffer->device->info;
const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class;
for (uint32_t i = 0; i < devinfo->engine_class_prefetch[engine_class] / 4; i++)
anv_batch_emit(batch, GENX(MI_NOOP), noop);
#endif
/* Return to the main batch. */
anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
bbs.AddressSpaceIndicator = ASI_PPGTT;
bbs.BatchBufferStartAddress = cmd_buffer->generation_return_addr;
}
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
}
#endif /* GENX_CMD_GENERATED_INDIRECT_DRAW_H */

View file

@ -18,6 +18,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
subdir('shaders')
inc_anv = include_directories('.')
anv_flags = [
@ -43,20 +45,6 @@ anv_entrypoints = custom_target(
depend_files : vk_entrypoints_gen_depend_files,
)
float64_spv_h = custom_target(
'float64_spv.h',
input : [glsl2spirv, float64_glsl_file],
output : 'float64_spv.h',
command : [
prog_python, '@INPUT@', '@OUTPUT@',
prog_glslang,
'--create-entry', 'main',
'--vn', 'float64_spv_source',
'--glsl-version', '450',
'-Olib',
]
)
idep_anv_headers = declare_dependency(
sources : [anv_entrypoints[0]],
include_directories : inc_anv,
@ -126,7 +114,8 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']],
_gfx_ver = g[0]
libanv_per_hw_ver_libs += static_library(
'anv_per_hw_ver@0@'.format(_gfx_ver),
[anv_per_hw_ver_files, g[1], anv_entrypoints[0]],
[anv_per_hw_ver_files, g[1], anv_entrypoints[0],
generated_draws_spv_h, generated_draws_count_spv_h],
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel,
],
@ -152,6 +141,7 @@ libanv_files = files(
'anv_descriptor_set.c',
'anv_device.c',
'anv_formats.c',
'anv_generated_indirect_draws.c',
'anv_genX.h',
'anv_image.c',
'anv_measure.c',
@ -216,7 +206,7 @@ libanv_common = static_library(
c_args : anv_flags,
cpp_args : anv_cpp_flags,
gnu_symbol_visibility : 'hidden',
dependencies : anv_deps,
dependencies : anv_deps
)
libvulkan_intel = shared_library(

View file

@ -0,0 +1,101 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#version 450
/* These 2 bindings will be accessed through A64 messages */
layout(set = 0, binding = 0, std430) buffer Storage0 {
uint indirect_data[];
};
layout(set = 0, binding = 1, std430) buffer Storage1 {
uint commands[];
};
/* This data will be provided through push constants. */
layout(set = 0, binding = 2) uniform block {
uint is_indexed;
uint is_predicated;
uint draw_base;
uint draw_count;
uint instance_multiplier;
uint indirect_data_stride;
};
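/* One fragment per potential draw: gl_FragCoord is turned into a draw index
* and, for every index below draw_count, a 10 dword 3DPRIMITIVE (with
* extended parameters enabled) is written into the commands buffer.
*/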
void main()
{
uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x);
uint indirect_data_offset = item_idx * indirect_data_stride / 4;
uint _3dprim_dw_size = 10;
uint cmd_idx = uint(item_idx) * _3dprim_dw_size;
uint draw_id = draw_base + item_idx;
if (draw_id < draw_count) {
if (is_indexed != 0) {
/* Loading a VkDrawIndexedIndirectCommand */
uint index_count = indirect_data[indirect_data_offset + 0];
uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
uint first_index = indirect_data[indirect_data_offset + 2];
uint vertex_offset = indirect_data[indirect_data_offset + 3];
uint first_instance = indirect_data[indirect_data_offset + 4];
commands[cmd_idx + 0] = (3 << 29 | /* Command Type */
3 << 27 | /* Command SubType */
3 << 24 | /* 3D Command Opcode */
1 << 11 | /* Extended Parameter Enable */
is_predicated << 8 |
8 << 0); /* DWord Length */
commands[cmd_idx + 1] = 1 << 8; /* Indexed */
commands[cmd_idx + 2] = index_count; /* Vertex Count Per Instance */
commands[cmd_idx + 3] = first_index; /* Start Vertex Location */
commands[cmd_idx + 4] = instance_count; /* Instance Count */
commands[cmd_idx + 5] = first_instance; /* Start Instance Location */
commands[cmd_idx + 6] = vertex_offset; /* Base Vertex Location */
commands[cmd_idx + 7] = vertex_offset; /* gl_BaseVertex */
commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */
commands[cmd_idx + 9] = draw_id; /* gl_DrawID */
} else {
/* Loading a VkDrawIndirectCommand structure */
uint vertex_count = indirect_data[indirect_data_offset + 0];
uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
uint first_vertex = indirect_data[indirect_data_offset + 2];
uint first_instance = indirect_data[indirect_data_offset + 3];
commands[cmd_idx + 0] = (3 << 29 | /* Command Type */
3 << 27 | /* Command SubType */
3 << 24 | /* 3D Command Opcode */
1 << 11 | /* Extended Parameter Enable */
is_predicated << 8 |
8 << 0); /* DWord Length */
commands[cmd_idx + 1] = 0;
commands[cmd_idx + 2] = vertex_count; /* Vertex Count Per Instance */
commands[cmd_idx + 3] = first_vertex; /* Start Vertex Location */
commands[cmd_idx + 4] = instance_count; /* Instance Count */
commands[cmd_idx + 5] = first_instance; /* Start Instance Location */
commands[cmd_idx + 6] = 0; /* Base Vertex Location */
commands[cmd_idx + 7] = first_vertex; /* gl_BaseVertex */
commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */
commands[cmd_idx + 9] = draw_id; /* gl_DrawID */
}
}
}

View file

@ -0,0 +1,118 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#version 450
/* These 2 bindings will be accessed through A64 messages */
layout(set = 0, binding = 0, std430) buffer Storage0 {
uint indirect_data[];
};
layout(set = 0, binding = 1, std430) buffer Storage1 {
uint commands[];
};
/* This data will be provided through push constants. */
layout(set = 0, binding = 2) uniform block {
uint is_indexed;
uint is_predicated;
uint draw_base;
uint item_count;
uint draw_count;
uint instance_multiplier;
uint indirect_data_stride;
uint end_addr_ldw;
uint end_addr_udw;
};
void main()
{
uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x);
uint indirect_data_offset = item_idx * indirect_data_stride / 4;
uint _3dprim_dw_size = 10;
uint cmd_idx = item_idx * _3dprim_dw_size;
/* Loading a VkDrawIndexedIndirectCommand */
uint index_count = indirect_data[indirect_data_offset + 0];
uint instance_count = indirect_data[indirect_data_offset + 1];
uint first_index = indirect_data[indirect_data_offset + 2];
uint vertex_offset = indirect_data[indirect_data_offset + 3];
uint first_instance = indirect_data[indirect_data_offset + 4];
uint draw_id = draw_base + item_idx;
if (draw_id < draw_count) {
if (is_indexed != 0) {
/* Loading a VkDrawIndexedIndirectCommand */
uint index_count = indirect_data[indirect_data_offset + 0];
uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
uint first_index = indirect_data[indirect_data_offset + 2];
uint vertex_offset = indirect_data[indirect_data_offset + 3];
uint first_instance = indirect_data[indirect_data_offset + 4];
commands[cmd_idx + 0] = (3 << 29 | /* Command Type */
3 << 27 | /* Command SubType */
3 << 24 | /* 3D Command Opcode */
1 << 11 | /* Extended Parameter Enable */
is_predicated << 8 |
8 << 0); /* DWord Length */
commands[cmd_idx + 1] = 1 << 8; /* Indexed */
commands[cmd_idx + 2] = index_count; /* Vertex Count Per Instance */
commands[cmd_idx + 3] = first_index; /* Start Vertex Location */
commands[cmd_idx + 4] = instance_count; /* Instance Count */
commands[cmd_idx + 5] = first_instance; /* Start Instance Location */
commands[cmd_idx + 6] = vertex_offset; /* Base Vertex Location */
commands[cmd_idx + 7] = vertex_offset; /* gl_BaseVertex */
commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */
commands[cmd_idx + 9] = draw_id; /* gl_DrawID */
} else {
/* Loading a VkDrawIndirectCommand structure */
uint vertex_count = indirect_data[indirect_data_offset + 0];
uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
uint first_vertex = indirect_data[indirect_data_offset + 2];
uint first_instance = indirect_data[indirect_data_offset + 3];
commands[cmd_idx + 0] = (3 << 29 | /* Command Type */
3 << 27 | /* Command SubType */
3 << 24 | /* 3D Command Opcode */
1 << 11 | /* Extended Parameter Enable */
is_predicated << 8 |
8 << 0); /* DWord Length */
commands[cmd_idx + 1] = 0;
commands[cmd_idx + 2] = vertex_count; /* Vertex Count Per Instance */
commands[cmd_idx + 3] = first_vertex; /* Start Vertex Location */
commands[cmd_idx + 4] = instance_count; /* Instance Count */
commands[cmd_idx + 5] = first_instance; /* Start Instance Location */
commands[cmd_idx + 6] = 0; /* Base Vertex Location */
commands[cmd_idx + 7] = first_vertex; /* gl_BaseVertex */
commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */
commands[cmd_idx + 9] = draw_id; /* gl_DrawID */
}
} else if (draw_id == draw_count) {
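/* First item past the actual draw count: write an MI_BATCH_BUFFER_START
* jumping to the end of the reserved command space so that the remaining,
* unwritten 3DPRIMITIVE slots are skipped at execution.
*/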
commands[cmd_idx + 0] = (0 << 29 | /* Command Type */
49 << 23 | /* MI Command Opcode */
1 << 8 | /* Address Space Indicator (PPGTT) */
1 << 0); /* DWord Length */
commands[cmd_idx + 1] = end_addr_ldw;
commands[cmd_idx + 2] = end_addr_udw;
}
}

View file

@ -0,0 +1,59 @@
# Copyright © 2022 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
float64_spv_h = custom_target(
'float64_spv.h',
input : [glsl2spirv, float64_glsl_file],
output : 'float64_spv.h',
command : [
prog_python, '@INPUT@', '@OUTPUT@',
prog_glslang,
'--create-entry', 'main',
'--vn', 'float64_spv_source',
'--glsl-version', '450',
'-Olib',
]
)
generated_draws_spv_h = custom_target(
'generated_draws_spv.h',
input : [glsl2spirv, 'generated_draws.glsl'],
output : 'generated_draws_spv.h',
command : [
prog_python, '@INPUT@', '@OUTPUT@',
prog_glslang,
'--vn', 'generated_draws_spv_source',
'--glsl-version', '450',
'--stage', 'frag',
]
)
generated_draws_count_spv_h = custom_target(
'generated_draws_count_spv.h',
input : [glsl2spirv, 'generated_draws_count.glsl'],
output : 'generated_draws_count_spv.h',
command : [
prog_python, '@INPUT@', '@OUTPUT@',
prog_glslang,
'--vn', 'generated_draws_count_spv_source',
'--glsl-version', '450',
'--stage', 'frag',
]
)

View file

@ -615,4 +615,8 @@
DRI_CONF_OPT_B(fp64_workaround_enabled, def, \
"Use softpf64 when the shader uses float64, but the device doesn't support that type")
#define DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(def) \
DRI_CONF_OPT_I(generated_indirect_threshold, def, 0, INT32_MAX, \
"Indirect threshold count above which we start generating commands")
#endif