diff --git a/src/intel/common/intel_batch_decoder.c b/src/intel/common/intel_batch_decoder.c
index afe0f39f887..8ec3be21c37 100644
--- a/src/intel/common/intel_batch_decoder.c
+++ b/src/intel/common/intel_batch_decoder.c
@@ -25,6 +25,7 @@
 #include "intel_disasm.h"
 #include "util/macros.h"
 #include "util/u_debug.h"
+#include "util/u_dynarray.h"
 #include "util/u_math.h" /* Needed for ROUND_DOWN_TO */
 
 #include <string.h>
@@ -67,11 +68,18 @@ intel_batch_decode_ctx_init(struct intel_batch_decode_ctx *ctx,
       ctx->spec = intel_spec_load(devinfo);
    else
       ctx->spec = intel_spec_load_from_path(devinfo, xml_path);
+
+   ctx->commands =
+      _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   ctx->stats =
+      _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal);
 }
 
 void
 intel_batch_decode_ctx_finish(struct intel_batch_decode_ctx *ctx)
 {
+   _mesa_hash_table_destroy(ctx->commands, NULL); /* NULL: no entry callback */
+   _mesa_hash_table_destroy(ctx->stats, NULL); /* keys are borrowed names */
    intel_spec_destroy(ctx->spec);
 }
 
@@ -1602,3 +1610,133 @@ intel_print_batch(struct intel_batch_decode_ctx *ctx,
 
    ctx->n_batch_buffer_start--;
 }
+
+void
+intel_batch_stats_reset(struct intel_batch_decode_ctx *ctx)
+{
+   _mesa_hash_table_clear(ctx->stats, NULL); /* forget every counter */
+}
+
+/**
+ * Count every command in the batch_size bytes at batch into ctx->stats,
+ * keyed by instruction name ("unknown" when nothing matches), descending
+ * into buffers reached via MI_BATCH_BUFFER_START.  batch_addr is unused.
+ */
+void
+intel_batch_stats(struct intel_batch_decode_ctx *ctx,
+                  const uint32_t *batch, uint32_t batch_size,
+                  uint64_t batch_addr, bool from_ring)
+{
+   const uint32_t *const limit = batch + batch_size / sizeof(uint32_t);
+   int dwords;
+
+   /* n_batch_buffer_start is the current nesting depth of this walk; give
+    * up at 100 levels rather than keep following jumps.
+    */
+   if (ctx->n_batch_buffer_start >= 100) {
+      fprintf(stderr, "Max batch buffer jumps exceeded\n");
+      return;
+   }
+   ctx->n_batch_buffer_start++;
+
+   for (const uint32_t *cur = batch; cur < limit; cur += dwords) {
+      struct intel_group *group = intel_ctx_find_instruction(ctx, cur);
+
+      dwords = intel_group_get_length(group, cur);
+      assert(group == NULL || dwords > 0);
+      /* Step by at least one dword so the loop always makes progress. */
+      dwords = MAX2(1, dwords);
+
+      /* The counter is stored directly in the entry's data pointer. */
+      const char *key = group == NULL ? "unknown" : group->name;
+      struct hash_entry *slot = _mesa_hash_table_search(ctx->stats, key);
+      if (slot == NULL)
+         _mesa_hash_table_insert(ctx->stats, key, (void *)(uintptr_t)1);
+      else
+         slot->data = (void *)((uintptr_t)slot->data + 1);
+
+      if (group == NULL)
+         continue;
+      if (strcmp(group->name, "MI_BATCH_BUFFER_END") == 0)
+         break;
+      if (strcmp(group->name, "MI_BATCH_BUFFER_START") != 0)
+         continue;
+
+      /* Decode the fields of the jump that decide how to follow it. */
+      uint64_t target_addr = 0;
+      bool is_ppgtt = false;
+      bool is_second_level = false;
+      bool is_predicated = false;
+      struct intel_field_iterator fields;
+
+      intel_field_iterator_init(&fields, group, cur, 0, false);
+      while (intel_field_iterator_next(&fields)) {
+         if (strcmp(fields.name, "Batch Buffer Start Address") == 0)
+            target_addr = fields.raw_value;
+         else if (strcmp(fields.name, "Second Level Batch Buffer") == 0)
+            is_second_level = fields.raw_value;
+         else if (strcmp(fields.name, "Address Space Indicator") == 0)
+            is_ppgtt = fields.raw_value;
+         else if (strcmp(fields.name, "Predication Enable") == 0)
+            is_predicated = fields.raw_value;
+      }
+
+      /* Predicated jumps are not followed; carry on with the next command. */
+      if (is_predicated)
+         continue;
+
+      struct intel_batch_decode_bo target =
+         ctx_get_bo(ctx, is_ppgtt, target_addr);
+      if (target.map != NULL) {
+         intel_batch_stats(ctx, target.map, target.size, target.addr, false);
+      } else {
+         fprintf(stderr, "Secondary batch at 0x%08"PRIx64" unavailable\n",
+                 target_addr);
+      }
+
+      /* With "Second Level Batch Buffer" set the jump acts like a
+       * subroutine call, so the commands after it still run and must be
+       * counted.  Without it the jump acts like a goto: nothing after it
+       * is processed, so stop here unless walking from the ring.
+       */
+      if (!is_second_level && !from_ring)
+         break;
+   }
+
+   ctx->n_batch_buffer_start--;
+}
+
+struct inst_stat { /* one printable row built from a ctx->stats entry */
+   const char *name;  /* borrowed from the hash key, never freed here */
+   uint32_t    count; /* number of times the instruction was counted */
+};
+
+static int
+compare_inst_stat(const void *v1, const void *v2)
+{
+   const struct inst_stat *lhs = v1, *rhs = v2; /* rows handed over by qsort() */
+   return strcmp(lhs->name, rhs->name); /* ascending strcmp() order of names */
+}
+
+/* Print one "name: count" line per entry of ctx->stats, sorted by name. */
+void
+intel_batch_print_stats(struct intel_batch_decode_ctx *ctx)
+{
+   struct util_dynarray rows;
+   util_dynarray_init(&rows, NULL);
+
+   hash_table_foreach(ctx->stats, entry) {
+      struct inst_stat row = { .name = (const char *)entry->key,
+                               .count = (uintptr_t)entry->data };
+      util_dynarray_append(&rows, struct inst_stat, row);
+   }
+
+   /* qsort() needs a valid base pointer even for n == 0, so skip it then. */
+   const size_t n = util_dynarray_num_elements(&rows, struct inst_stat);
+   if (n > 0)
+      qsort(util_dynarray_begin(&rows), n, sizeof(struct inst_stat),
+            compare_inst_stat);
+   util_dynarray_foreach(&rows, struct inst_stat, it)
+      fprintf(ctx->fp, "%-40s: %u\n", it->name, it->count);
+   util_dynarray_fini(&rows);
+}
diff --git a/src/intel/common/intel_batch_decoder_stub.c b/src/intel/common/intel_batch_decoder_stub.c
index 27bfeb42946..808755d7f63 100644
--- a/src/intel/common/intel_batch_decoder_stub.c
+++ b/src/intel/common/intel_batch_decoder_stub.c
@@ -53,3 +53,21 @@ intel_print_batch(struct intel_batch_decode_ctx *ctx,
 {
    mesa_logw("Batch logging not supported on Android.");
 }
+
+void
+intel_batch_stats_reset(struct intel_batch_decode_ctx *ctx)
+{ /* Stub: this build keeps no batch statistics, so nothing to clear. */
+}
+
+void
+intel_batch_stats(struct intel_batch_decode_ctx *ctx,
+                  const uint32_t *batch, uint32_t batch_size,
+                  uint64_t batch_addr, bool from_ring)
+{ /* Stub: the batch is not walked and every argument is ignored. */
+}
+
+void
+intel_batch_print_stats(struct intel_batch_decode_ctx *ctx)
+{ /* Stub: no statistics exist to print; only a warning is logged. */
+   mesa_logw("Batch logging not supported on Android.");
+}
diff --git a/src/intel/common/intel_decoder.h b/src/intel/common/intel_decoder.h
index 09c38a73133..9025c8a384a 100644
--- a/src/intel/common/intel_decoder.h
+++ b/src/intel/common/intel_decoder.h
@@ -273,6 +273,9 @@ struct intel_batch_decode_ctx {
 
    int n_batch_buffer_start;
    uint64_t acthd;
+
+   struct hash_table *commands;
+   struct hash_table *stats;
 };
 
 void intel_batch_decode_ctx_init(struct intel_batch_decode_ctx *ctx,
@@ -293,6 +296,14 @@ void intel_print_batch(struct intel_batch_decode_ctx *ctx,
                        const uint32_t *batch, uint32_t batch_size,
                        uint64_t batch_addr, bool from_ring);
 
+/* Per-instruction batch statistics kept in ctx: _reset() clears them,
+ * intel_batch_stats() accumulates one batch, _print_stats() reports them. */
+void intel_batch_stats_reset(struct intel_batch_decode_ctx *ctx);
+void intel_batch_stats(struct intel_batch_decode_ctx *ctx,
+                       const uint32_t *batch, uint32_t batch_size,
+                       uint64_t batch_addr, bool from_ring);
+void intel_batch_print_stats(struct intel_batch_decode_ctx *ctx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/intel/dev/intel_debug.c b/src/intel/dev/intel_debug.c
index 70f340eb487..e7c643131c9 100644
--- a/src/intel/dev/intel_debug.c
+++ b/src/intel/dev/intel_debug.c
@@ -105,6 +105,7 @@ static const struct debug_control debug_control[] = {
    { "isl",         DEBUG_ISL },
    { "sparse",      DEBUG_SPARSE },
    { "draw_bkp",    DEBUG_DRAW_BKP },
+   { "bat-stats",   DEBUG_BATCH_STATS },
    { NULL,    0 }
 };
 
diff --git a/src/intel/dev/intel_debug.h b/src/intel/dev/intel_debug.h
index dde6b15f8d0..727d77d04a6 100644
--- a/src/intel/dev/intel_debug.h
+++ b/src/intel/dev/intel_debug.h
@@ -95,6 +95,7 @@ extern uint64_t intel_debug;
 #define DEBUG_ISL                 (1ull << 47)
 #define DEBUG_SPARSE              (1ull << 48)
 #define DEBUG_DRAW_BKP            (1ull << 49)
+#define DEBUG_BATCH_STATS         (1ull << 50)
 
 #define DEBUG_ANY                 (~0ull)
 
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index b3afc5ebd93..2f8039e0bc0 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -1190,7 +1190,7 @@ anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
                                 struct anv_query_pool *perf_query_pool,
                                 uint32_t perf_query_pass)
 {
-   if (!INTEL_DEBUG(DEBUG_BATCH))
+   if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
       return;
 
    struct anv_device *device = queue->device;
@@ -1209,19 +1209,28 @@ anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
          uint64_t pass_batch_offset =
             khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
 
-         intel_print_batch(queue->decoder,
-                           pass_batch_bo->map + pass_batch_offset, 64,
-                           pass_batch_bo->offset + pass_batch_offset, false);
+         if (INTEL_DEBUG(DEBUG_BATCH)) {
+            intel_print_batch(queue->decoder,
+                              pass_batch_bo->map + pass_batch_offset, 64,
+                              pass_batch_bo->offset + pass_batch_offset, false);
+         }
       }
 
       for (uint32_t i = 0; i < cmd_buffer_count; i++) {
-         struct anv_batch_bo **bo = u_vector_tail(&cmd_buffers[i]->seen_bbos);
+         struct anv_batch_bo *bbo =
+            list_first_entry(&cmd_buffers[i]->batch_bos, struct anv_batch_bo, link);
          device->cmd_buffer_being_decoded = cmd_buffers[i];
-         intel_print_batch(queue->decoder, (*bo)->bo->map,
-                           (*bo)->bo->size, (*bo)->bo->offset, false);
+         if (INTEL_DEBUG(DEBUG_BATCH)) {
+            intel_print_batch(queue->decoder, bbo->bo->map,
+                              bbo->bo->size, bbo->bo->offset, false);
+         }
+         if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
+            intel_batch_stats(queue->decoder, bbo->bo->map,
+                              bbo->bo->size, bbo->bo->offset, false);
+         }
          device->cmd_buffer_being_decoded = NULL;
       }
-   } else {
+   } else if (INTEL_DEBUG(DEBUG_BATCH)) {
       intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
                         device->trivial_batch_bo->size,
                         device->trivial_batch_bo->offset, false);
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 55e95569b18..0ae707e6cec 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -3071,7 +3071,7 @@ VkResult anv_CreateDevice(
    if (result != VK_SUCCESS)
       goto fail_alloc;
 
-   if (INTEL_DEBUG(DEBUG_BATCH)) {
+   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
       for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
          struct intel_batch_decode_ctx *decoder = &device->decoder[i];
 
@@ -3082,6 +3082,7 @@ VkResult anv_CreateDevice(
                                      &physical_device->info,
                                      stderr, decode_flags, NULL,
                                      decode_get_bo, NULL, device);
+         intel_batch_stats_reset(decoder);
 
          decoder->engine = physical_device->queue.families[i].engine_class;
          decoder->dynamic_base = physical_device->va.dynamic_state_pool.addr;
@@ -3644,9 +3645,12 @@ void anv_DestroyDevice(
 
    anv_device_destroy_context_or_vm(device);
 
-   if (INTEL_DEBUG(DEBUG_BATCH)) {
-      for (unsigned i = 0; i < pdevice->queue.family_count; i++)
+   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
+      for (unsigned i = 0; i < pdevice->queue.family_count; i++) {
+         if (INTEL_DEBUG(DEBUG_BATCH_STATS))
+            intel_batch_print_stats(&device->decoder[i]);
          intel_batch_decode_ctx_finish(&device->decoder[i]);
+      }
    }
 
    close(device->fd);