From 8a5ac96a6719469112aacb474ba46495b87ad931 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 26 Jan 2026 13:03:49 +0200 Subject: [PATCH] anv: predicate BTP emissions The previous commit enabled different command buffers to program the same 3DSTATE_BINDING_TABLE_POOL_ALLOC instruction even though they allocated different chunks of binding tables. Now we can just predicate this programming and skip the stalling, flushing & invalidation. Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/ds/intel_driver_ds.cc | 1 + src/intel/ds/intel_tracepoints.py | 5 ++++ src/intel/vulkan/anv_private.h | 10 +++++++ src/intel/vulkan/genX_cmd_buffer.c | 45 +++++++++++++++++++++++++----- src/intel/vulkan/genX_mi_builder.h | 9 +++++- 5 files changed, 62 insertions(+), 8 deletions(-) diff --git a/src/intel/ds/intel_driver_ds.cc b/src/intel/ds/intel_driver_ds.cc index 3f79d7df1fd..50d7135a560 100644 --- a/src/intel/ds/intel_driver_ds.cc +++ b/src/intel/ds/intel_driver_ds.cc @@ -466,6 +466,7 @@ CREATE_DUAL_EVENT_CALLBACK(frame, INTEL_DS_QUEUE_STAGE_FRAME) CREATE_DUAL_EVENT_CALLBACK(batch, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) CREATE_DUAL_EVENT_CALLBACK(cmd_buffer, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) CREATE_DUAL_EVENT_CALLBACK(sba, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) +CREATE_DUAL_EVENT_CALLBACK(btp, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) CREATE_DUAL_EVENT_CALLBACK(render_pass, INTEL_DS_QUEUE_STAGE_RENDER_PASS) CREATE_DUAL_EVENT_CALLBACK(blorp, INTEL_DS_QUEUE_STAGE_BLORP) CREATE_DUAL_EVENT_CALLBACK(draw, INTEL_DS_QUEUE_STAGE_DRAW) diff --git a/src/intel/ds/intel_tracepoints.py b/src/intel/ds/intel_tracepoints.py index e4ccad65f82..696d666f4bb 100644 --- a/src/intel/ds/intel_tracepoints.py +++ b/src/intel/ds/intel_tracepoints.py @@ -131,6 +131,11 @@ def define_tracepoints(args): tp_args=[Arg(type='uint8_t', var='mode', c_format='%hhu'),], end_pipelined=False) + # 3DSTATE_BINDING_TABLE_POOL_ALLOC emission, only for Anv + begin_end_tp('btp', 
tp_args=[Arg(type='uint64_t', var='addr', c_format='0x%" PRIx64 "'),], + end_pipelined=False) + # Dynamic rendering tracepoints, only for Anv begin_end_tp('render_pass', tp_args=[Arg(type='uint64_t', var='command_buffer_handle', c_format='%" PRIu64 "', perfetto_field=True), diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index e1195fcda6f..a230f031587 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -636,6 +636,13 @@ anv_address_physical(struct anv_address addr) return intel_canonical_address(address); } +static inline bool +anv_address_equals(struct anv_address addr1, + struct anv_address addr2) +{ + return anv_address_physical(addr1) == anv_address_physical(addr2); +} + static inline struct u_trace_address anv_address_utrace(struct anv_address addr) { @@ -4743,6 +4750,9 @@ struct anv_cmd_state { uint64_t address[MAX_SETS]; } descriptor_buffers; + /* Last programmed 3DSTATE_BINDING_TABLE_POOL_ALLOC address */ + struct anv_address btp; + /* For Gen 9, this allocation is 2 greater than the maximum allowed * number of vertex buffers; see comment on get_max_vbs definition. * Specializing this allocation seems needlessly complicated when we can diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index efb721645e5..195d17cde5f 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -418,15 +418,32 @@ genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer) if (!anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer)) return; - /* If we are emitting a new state base address we probably need to re-emit - * binding tables. 
- */ - cmd_buffer->state.descriptors_dirty |= ~0; - #if GFX_VERx10 >= 125 + struct anv_address btp = anv_cmd_buffer_surface_base_address(cmd_buffer); + if (anv_address_equals(cmd_buffer->state.btp, btp)) + return; + struct anv_device *device = cmd_buffer->device; const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false); + trace_intel_begin_btp(cmd_buffer->batch.trace); + + /* Disable stall tracing to avoid leaving a tracepoint with random + * timestamp if the STATE_BASE_ADDRESS instruction sequence is skipped + * over. + */ + struct u_trace *tmp_trace = cmd_buffer->batch.trace; + cmd_buffer->batch.trace = NULL; + + struct mi_builder b; + mi_builder_init(&b, device->info, &cmd_buffer->batch); + mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false)); + struct mi_goto_target t = MI_GOTO_TARGET_INIT; + mi_goto_if(&b, + mi_ieq(&b, mi_reg64(ANV_BTP_ADDR_REG), + mi_imm(anv_address_physical(btp))), + &t); + /* We're changing base location of binding tables which affects the state * cache. 
We're adding texture cache invalidation following a * recommendation from the ICL PRMs, Volume 9: Render Engine, Coherency @@ -445,8 +462,7 @@ genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer) "pre BINDING_TABLE_POOL_ALLOC stall"); anv_batch_emit( &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) { - btpa.BindingTablePoolBaseAddress = - anv_cmd_buffer_surface_base_address(cmd_buffer); + btpa.BindingTablePoolBaseAddress = btp; btpa.BindingTablePoolBufferSize = BINDING_TABLE_VIEW_SIZE / 4096; btpa.MOCS = mocs; } @@ -457,9 +473,24 @@ genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer) ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, "post BINDING_TABLE_POOL_ALLOC invalidate"); + mi_store(&b, mi_reg64(ANV_BTP_ADDR_REG), + mi_imm(anv_address_physical(btp))); + + mi_goto_target(&b, &t); + + cmd_buffer->batch.trace = tmp_trace; + cmd_buffer->state.btp = btp; + + trace_intel_end_btp(cmd_buffer->batch.trace, anv_address_physical(btp)); + #else /* GFX_VERx10 < 125 */ genX(cmd_buffer_emit_state_base_address)(cmd_buffer); #endif + + /* If we are emitting a new state base address we probably need to re-emit + * binding tables. + */ + cmd_buffer->state.descriptors_dirty |= ~0; } static void diff --git a/src/intel/vulkan/genX_mi_builder.h b/src/intel/vulkan/genX_mi_builder.h index 4e819e77e37..79ac749192d 100644 --- a/src/intel/vulkan/genX_mi_builder.h +++ b/src/intel/vulkan/genX_mi_builder.h @@ -5,11 +5,12 @@ #pragma once /* We reserve : + * - GPR 12 for 3DSTATE_BINDING_TABLE_POOL_ALLOC address * - GPR 13 for STATE_BASE_ADDRESS bindless surface base address * - GPR 14 for perf queries * - GPR 15 for conditional rendering */ -#define MI_BUILDER_NUM_ALLOC_GPRS 13 +#define MI_BUILDER_NUM_ALLOC_GPRS 12 #ifndef MI_BUILDER_CAN_WRITE_BATCH #define MI_BUILDER_CAN_WRITE_BATCH true #endif @@ -38,3 +39,9 @@ * emissions if the address doesn't change. 
*/ #define ANV_BINDLESS_SURFACE_BASE_ADDR_REG 0x2668 /* MI_ALU_REG13 */ + +/* We reserve this MI ALU register to hold the last programmed + * 3DSTATE_BINDING_TABLE_POOL_ALLOC address so that we can predicate + * 3DSTATE_BINDING_TABLE_POOL_ALLOC emissions if the address doesn't change. + */ +#define ANV_BTP_ADDR_REG 0x2660 /* MI_ALU_REG12 */