From eda83bc2b6ba769718ca1e5288a319e4132dae1d Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 16 Apr 2026 23:15:21 +0300 Subject: [PATCH] anv: add a pass to realign global loads on DX CBV resources CBV resources are supposed to be 256B aligned (D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT). vkd3d-proton will put CBV addresses in the push constant data and do global loads on them. Unfortunately those loads don't have a 256B alignment value on them. So when looking at what we can promote to HW push buffers, we can't consider them. This change introduces a detection pass for CBV resources (according to vkd3d-proton devs those are 64KiB in size) and realigns the loads to be 256B aligned. This is only enabled on DX emulation. Signed-off-by: Lionel Landwerlin Reviewed-by: Felix DeGrood Part-of: --- src/intel/vulkan/anv_instance.c | 3 + src/intel/vulkan/anv_nir.h | 2 + src/intel/vulkan/anv_nir_realign_cbv.c | 79 ++++++++++++++++++++++++++ src/intel/vulkan/anv_private.h | 1 + src/intel/vulkan/anv_shader_compile.c | 20 ++++++- src/intel/vulkan/meson.build | 1 + src/util/00-mesa-defaults.conf | 1 + src/util/driconf.h | 4 ++ 8 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 src/intel/vulkan/anv_nir_realign_cbv.c diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c index a8c667d24ba..8002a5cc5f7 100644 --- a/src/intel/vulkan/anv_instance.c +++ b/src/intel/vulkan/anv_instance.c @@ -29,6 +29,7 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false) DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4) DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100) + DRI_CONF_ANV_PROMOTE_CBV_TO_PUSH_BUFFERS(false) DRI_CONF_ANV_STATE_CACHE_PERF_FIX(false) DRI_CONF_NO_16BIT(false) DRI_CONF_INTEL_BINDING_TABLE_BLOCK_SIZE(BINDING_TABLE_POOL_DEFAULT_BLOCK_SIZE, @@ -198,6 +199,8 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptionb(&instance->dri_options, 
"anv_sample_mask_out_opengl_behaviour"); instance->force_filter_addr_rounding = driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding"); + instance->promote_cbv_to_push_buffers = + driQueryOptionb(&instance->dri_options, "anv_promote_cbv_to_push_buffers"); instance->state_cache_perf_fix = driQueryOptionb(&instance->dri_options, "anv_state_cache_perf_fix"); instance->lower_depth_range_rate = diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index f7664e5afb4..637d3bdc810 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -125,6 +125,8 @@ struct anv_nir_push_layout_info { bool anv_nir_shrink_push_constant_ranges(nir_shader *nir); +bool anv_nir_realign_cbv(nir_shader *shader); + bool anv_nir_compute_push_layout(nir_shader *nir, const struct anv_physical_device *pdevice, enum brw_robustness_flags robust_flags, diff --git a/src/intel/vulkan/anv_nir_realign_cbv.c b/src/intel/vulkan/anv_nir_realign_cbv.c new file mode 100644 index 00000000000..85a437bfbd9 --- /dev/null +++ b/src/intel/vulkan/anv_nir_realign_cbv.c @@ -0,0 +1,79 @@ +/* Copyright © 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "anv_nir.h" +#include "nir/nir_builder.h" + +/** + * This file implements a pass that looks for global read-only loads, from a + * pointer in the push constant data and based on the block size (64KiB + * indicating a CBV resource), align the load to 256B which is the alignment + * guarantee the applications should make. This alignment guarantee can later + * be used to promote those 64bit pointers to push buffers (HW needs 32B + * alignment). + */ + +static bool +realign_cbv(nir_builder *b, nir_intrinsic_instr *intrin, void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_deref) + return false; + + /* If writable, it's not CBV. 
*/ + if ((nir_intrinsic_access(intrin) & ACCESS_NON_WRITEABLE) == 0) + return false; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + /* Find the root of the deref to see if it's a pointer in the push constant + * data. + */ + while (true) { + if (deref->deref_type == nir_deref_type_var) + return false; + + nir_deref_instr *parent = nir_src_as_deref(deref->parent); + if (!parent) + break; + + deref = parent; + } + assert(deref->deref_type == nir_deref_type_cast); + + /* This is the magic value vkd3d-proton puts allowing us to recognize a + * CBV. + */ + if (glsl_get_explicit_size(deref->type, true) != 64 * 1024) + return false; + + nir_scalar val = { deref->parent.ssa, 0 }; + + if (nir_scalar_is_alu(val)) { + nir_alu_instr *pack_alu = nir_def_as_alu(val.def); + if (pack_alu->op != nir_op_pack_64_2x32_split) + return false; + + val = (nir_scalar){ pack_alu->src[0].src.ssa, pack_alu->src[0].swizzle[0] }; + } + + if (!nir_scalar_is_intrinsic(val)) + return false; + + /* If it's not a value coming from the push constant data, give up. */ + nir_intrinsic_instr *push_intrin = nir_def_as_intrinsic(val.def); + if (push_intrin->intrinsic != nir_intrinsic_load_push_constant) + return false; + + /* Realign the root cast (the CBV base pointer), not the load's deref */ + deref->cast.align_mul = 256; + deref->cast.align_offset = 0; + + return true; +} + +bool +anv_nir_realign_cbv(nir_shader *shader) +{ + return nir_shader_intrinsics_pass(shader, realign_cbv, nir_metadata_all, NULL); +} diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index f9e812ab8c5..44c2d1c7a82 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1820,6 +1820,7 @@ struct anv_instance { bool external_memory_implicit_sync; bool force_guc_low_latency; bool emulate_read_without_format; + bool promote_cbv_to_push_buffers; /** * Workarounds for game bugs. 
diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c index 1315df9d27f..f8c27ac217e 100644 --- a/src/intel/vulkan/anv_shader_compile.c +++ b/src/intel/vulkan/anv_shader_compile.c @@ -186,6 +186,9 @@ anv_shader_init_uuid(struct anv_physical_device *device) const bool btp_bti_rcc = device->rt_change_needs_flush; _mesa_blake3_update(&ctx, &btp_bti_rcc, sizeof(btp_bti_rcc)); + const bool cbv_push_buffer = device->instance->promote_cbv_to_push_buffers; + _mesa_blake3_update(&ctx, &cbv_push_buffer, sizeof(cbv_push_buffer)); + uint8_t blake3[BLAKE3_KEY_LEN]; _mesa_blake3_final(&ctx, blake3); memcpy(device->shader_binary_uuid, blake3, sizeof(device->shader_binary_uuid)); @@ -1530,11 +1533,24 @@ anv_shader_lower_nir(struct anv_device *device, pdevice->isl_dev.shader_tiling); } - NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, - nir_address_format_64bit_global); + /* Lower push constants variables prior to global realignment for CBV + * resources, it makes identifying a 64bit pointer from the push constants + * easier. + */ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, nir_address_format_32bit_offset); + /* Realign pointers to CBV on stages that can promote to push buffers. 
*/ + if (pdevice->instance->promote_cbv_to_push_buffers && + nir->info.stage <= MESA_SHADER_FRAGMENT) { + /* Cleanup for the analysis, we don't want any ALU */ + cleanup_nir(nir); + NIR_PASS(_, nir, anv_nir_realign_cbv); + } + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, + nir_address_format_64bit_global); + NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info); shader_data->push_desc_info.used_descriptors = diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 22ed0712599..5894e535e9d 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -185,6 +185,7 @@ libanv_files = files( 'anv_nir_lower_unaligned_dispatch.c', 'anv_nir_push_constants_analysis.c', 'anv_nir_push_descriptor_analysis.c', + 'anv_nir_realign_cbv.c', 'anv_perf.c', 'anv_physical_device.c', 'anv_pipeline_cache.c', diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 9b5dcfe8728..58f1db2bc9b 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -1056,6 +1056,7 @@ TODO: document the other workarounds. diff --git a/src/util/driconf.h b/src/util/driconf.h index efd756fe765..348425a00c6 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -937,6 +937,10 @@ #define DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(def) \ DRI_CONF_OPT_B(anv_external_memory_implicit_sync, def, "Implicit sync on external BOs") +#define DRI_CONF_ANV_PROMOTE_CBV_TO_PUSH_BUFFERS(def) \ + DRI_CONF_OPT_B(anv_promote_cbv_to_push_buffers, def, \ + "Promote CBV 64bit pointers in push constant data to push buffers") + #define DRI_CONF_ANV_STATE_CACHE_PERF_FIX(def) \ DRI_CONF_OPT_B(anv_state_cache_perf_fix, def, \ "Whether COMMON_SLICE_CHICKEN3 bit13 should be programmed to enable BTP+BTI RCC keying")