anv: add a pass to realign global loads on DX CBV resources

CBV resources are supposed to be 256B aligned
(D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT).

vkd3d-proton puts CBV addresses in the push constant data and does
global loads on them. Unfortunately those loads don't carry a 256B
alignment value. So when looking at what we can promote to HW push
buffers, we can't consider them.

This change introduces a detection pass for CBV resources (according
to vkd3d-proton devs those are 64KiB in size) and realigns the loads to
be 256B aligned.

This is only enabled on DX emulation.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Felix DeGrood <felix.j.degrood@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39451>
This commit is contained in:
Lionel Landwerlin 2026-04-16 23:15:21 +03:00 committed by Marge Bot
parent bba428ce3f
commit eda83bc2b6
8 changed files with 109 additions and 2 deletions

View file

@ -29,6 +29,7 @@ static const driOptionDescription anv_dri_options[] = {
DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100)
DRI_CONF_ANV_PROMOTE_CBV_TO_PUSH_BUFFERS(false)
DRI_CONF_ANV_STATE_CACHE_PERF_FIX(false)
DRI_CONF_NO_16BIT(false)
DRI_CONF_INTEL_BINDING_TABLE_BLOCK_SIZE(BINDING_TABLE_POOL_DEFAULT_BLOCK_SIZE,
@ -198,6 +199,8 @@ anv_init_dri_options(struct anv_instance *instance)
driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour");
instance->force_filter_addr_rounding =
driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding");
instance->promote_cbv_to_push_buffers =
driQueryOptionb(&instance->dri_options, "anv_promote_cbv_to_push_buffers");
instance->state_cache_perf_fix =
driQueryOptionb(&instance->dri_options, "anv_state_cache_perf_fix");
instance->lower_depth_range_rate =

View file

@ -125,6 +125,8 @@ struct anv_nir_push_layout_info {
bool anv_nir_shrink_push_constant_ranges(nir_shader *nir);
bool anv_nir_realign_cbv(nir_shader *shader);
bool anv_nir_compute_push_layout(nir_shader *nir,
const struct anv_physical_device *pdevice,
enum brw_robustness_flags robust_flags,

View file

@ -0,0 +1,79 @@
/* Copyright © 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "anv_nir.h"
#include "nir/nir_builder.h"
/**
* This file implements a pass that looks for global read-only loads whose
* pointer comes from the push constant data. When the pointed-to block is
* 64KiB in size (indicating a CBV resource), the load is realigned to 256B,
* which is the alignment applications are required to guarantee for CBVs.
* This alignment guarantee can later be used to promote those 64bit pointers
* to push buffers (HW needs 32B alignment).
*/
/**
 * Per-intrinsic callback used by anv_nir_realign_cbv().
 *
 * Matches a load_deref that:
 *   - is flagged ACCESS_NON_WRITEABLE,
 *   - has a deref chain rooted at a cast (not a variable) whose type has an
 *     explicit size of 64KiB, and
 *   - has a root pointer value that is a load_push_constant result, either
 *     directly or as the first source of a pack_64_2x32_split.
 *
 * On a match, the align_mul of the cast deref consumed by the load is set to
 * 256.
 *
 * \param b       builder supplied by the intrinsics pass (unused)
 * \param intrin  intrinsic instruction being visited
 * \param data    pass callback data (unused)
 * \return true when the deref alignment was rewritten, false otherwise
 */
static bool
realign_cbv(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_deref)
      return false;

   /* If writable, it's not CBV. */
   if ((nir_intrinsic_access(intrin) & ACCESS_NON_WRITEABLE) == 0)
      return false;

   /* Deref directly consumed by the load; this is the one that gets
    * realigned once the chain has been identified as a CBV access.
    */
   nir_deref_instr *leaf = nir_src_as_deref(intrin->src[0]);

   /* Find the root of the deref to see if it's a pointer in the push constant
    * data.
    */
   nir_deref_instr *root = leaf;
   while (true) {
      /* Chains rooted at a variable are not raw pointer accesses. */
      if (root->deref_type == nir_deref_type_var)
         return false;

      nir_deref_instr *parent = nir_src_as_deref(root->parent);
      if (!parent)
         break;

      root = parent;
   }
   assert(root->deref_type == nir_deref_type_cast);

   /* This is the magic value vkd3d-proton puts allowing us to recognize a
    * CBV.
    */
   if (glsl_get_explicit_size(root->type, true) != 64 * 1024)
      return false;

   /* Look through a 64bit pack to reach the value feeding the pointer. Only
    * the first source of the pack is inspected.
    */
   nir_scalar val = { root->parent.ssa, 0 };
   if (nir_scalar_is_alu(val)) {
      nir_alu_instr *pack_alu = nir_def_as_alu(val.def);
      if (pack_alu->op != nir_op_pack_64_2x32_split)
         return false;

      val = (nir_scalar){ pack_alu->src[0].src.ssa, pack_alu->src[0].swizzle[0] };
   }

   if (!nir_scalar_is_intrinsic(val))
      return false;

   /* If it's not a value coming from the push constant data, give up. */
   nir_intrinsic_instr *push_intrin = nir_def_as_intrinsic(val.def);
   if (push_intrin->intrinsic != nir_intrinsic_load_push_constant)
      return false;

   /* The alignment lives in the cast-specific fields of the deref. Those
    * fields share storage with the array/struct specific ones, so writing
    * them on a non-cast deref would corrupt it. Leave such loads untouched.
    */
   if (leaf->deref_type != nir_deref_type_cast)
      return false;

   /* Realign to the CBV requirement */
   leaf->cast.align_mul = 256;

   return true;
}
/**
 * Run the CBV realignment callback over every intrinsic of the shader.
 *
 * \param shader  shader to process
 * \return the progress value reported by nir_shader_intrinsics_pass()
 */
bool
anv_nir_realign_cbv(nir_shader *shader)
{
   const bool progress =
      nir_shader_intrinsics_pass(shader, realign_cbv, nir_metadata_all, NULL);

   return progress;
}

View file

@ -1820,6 +1820,7 @@ struct anv_instance {
bool external_memory_implicit_sync;
bool force_guc_low_latency;
bool emulate_read_without_format;
bool promote_cbv_to_push_buffers;
/**
* Workarounds for game bugs.

View file

@ -186,6 +186,9 @@ anv_shader_init_uuid(struct anv_physical_device *device)
const bool btp_bti_rcc = device->rt_change_needs_flush;
_mesa_blake3_update(&ctx, &btp_bti_rcc, sizeof(btp_bti_rcc));
const bool cbv_push_buffer = device->instance->promote_cbv_to_push_buffers;
_mesa_blake3_update(&ctx, &cbv_push_buffer, sizeof(cbv_push_buffer));
uint8_t blake3[BLAKE3_KEY_LEN];
_mesa_blake3_final(&ctx, blake3);
memcpy(device->shader_binary_uuid, blake3, sizeof(device->shader_binary_uuid));
@ -1530,11 +1533,24 @@ anv_shader_lower_nir(struct anv_device *device,
pdevice->isl_dev.shader_tiling);
}
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
nir_address_format_64bit_global);
/* Lower push constants variables prior to global realignment for CBV
* resources, it makes identifying a 64bit pointer from the push constants
* easier.
*/
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
nir_address_format_32bit_offset);
/* Realign pointers to CBV on stages that can promote to push buffers. */
if (pdevice->instance->promote_cbv_to_push_buffers &&
nir->info.stage <= MESA_SHADER_FRAGMENT) {
/* Cleanup for the analysis, we don't want any ALU */
cleanup_nir(nir);
NIR_PASS(_, nir, anv_nir_realign_cbv);
}
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
nir_address_format_64bit_global);
NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info);
shader_data->push_desc_info.used_descriptors =

View file

@ -185,6 +185,7 @@ libanv_files = files(
'anv_nir_lower_unaligned_dispatch.c',
'anv_nir_push_constants_analysis.c',
'anv_nir_push_descriptor_analysis.c',
'anv_nir_realign_cbv.c',
'anv_perf.c',
'anv_physical_device.c',
'anv_pipeline_cache.c',

View file

@ -1056,6 +1056,7 @@ TODO: document the other workarounds.
</application>
<engine engine_name_match="vkd3d|DXVK">
<option name="anv_force_filter_addr_rounding" value="true" />
<option name="anv_promote_cbv_to_push_buffers" value="true" />
</engine>
<!-- Needed to avoid XeSS code paths. -->
<application name="Marvel's Spider-Man Remastered" executable="Spider-Man.exe">

View file

@ -937,6 +937,10 @@
#define DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(def) \
DRI_CONF_OPT_B(anv_external_memory_implicit_sync, def, "Implicit sync on external BOs")
/* Boolean driconf option "anv_promote_cbv_to_push_buffers"; `def` is the
 * default value of the option.
 */
#define DRI_CONF_ANV_PROMOTE_CBV_TO_PUSH_BUFFERS(def) \
DRI_CONF_OPT_B(anv_promote_cbv_to_push_buffers, def, \
"Promote CBV 64bit pointers in push constant data to push buffers")
#define DRI_CONF_ANV_STATE_CACHE_PERF_FIX(def) \
DRI_CONF_OPT_B(anv_state_cache_perf_fix, def, \
"Whether COMMON_SLICE_CHICKEN3 bit13 should be programmed to enable BTP+BTI RCC keying")