From 5f119bb7afebcf997a0b286330ed0a8e5bb8a71e Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 6 Jan 2026 16:57:46 +0200 Subject: [PATCH] anv/brw: prep work for SIMD32 ray queries Signed-off-by: Lionel Landwerlin Cc: mesa-stable Reviewed-by: Sagar Ghuge (cherry picked from commit 6d19b898e730b6d7abeb6b0c6cf217456d4b680f) Conflicts: src/intel/compiler/brw/brw_lower_logical_sends.cpp Part-of: --- .pick_status.json | 2 +- src/compiler/nir/nir_intrinsics.py | 2 + .../compiler/brw/brw_lower_logical_sends.cpp | 16 ++++++- src/intel/compiler/brw/brw_rt.h | 7 ++++ src/intel/vulkan/anv_private.h | 6 ++- src/intel/vulkan/genX_cmd_compute.c | 42 +++++++++++-------- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index 1b7d98d67cb..fb359e9c3e9 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -4144,7 +4144,7 @@ "description": "anv/brw: prep work for SIMD32 ray queries", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 6cacbd2abbf..fdfd9d9a7de 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2520,6 +2520,8 @@ system_value("leaf_procedural_intel", 1, bit_sizes=[1]) # 2: Miss # 3: Intersection system_value("btd_shader_type_intel", 1) +# 64bit pointer to a pair of RT_DISPATCH_GLOBALS structures, each aligned to +# 64B; the pointer itself needs to be 256B aligned. 
system_value("ray_query_global_intel", 1, bit_sizes=[64]) # Source 0: Accumulator matrix (type specified by DEST_TYPE) diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index 567ab45431a..74e228a5e45 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -28,6 +28,7 @@ #include "brw_eu.h" #include "brw_shader.h" #include "brw_builder.h" +#include "brw_rt.h" #include "util/bitpack_helpers.h" @@ -2353,11 +2354,16 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst) brw_reg header = ubld.vgrf(BRW_TYPE_UD); ubld.MOV(header, brw_imm_ud(0)); + const uint32_t second_group_offset = + align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64); + const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS]; if (globals_addr.file != UNIFORM) { brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD); addr_ud.stride = 1; ubld.group(2, 0).MOV(header, addr_ud); + if (inst->group == 16) + ubld.group(1, 0).ADD(header, header, brw_imm_ud(second_group_offset)); } else { /* If the globals address comes from a uniform, do not do the SIMD2 * optimization. This occurs in many Vulkan CTS tests. @@ -2367,8 +2373,14 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst) * UNIFORM will be uniform (i.e., <0,1,0>). The clever SIMD2 * optimization violates that assumption. 
*/ - ubld.group(1, 0).MOV(byte_offset(header, 0), - subscript(globals_addr, BRW_TYPE_UD, 0)); + if (inst->group == 16) { + ubld.group(1, 0).ADD(byte_offset(header, 0), + subscript(globals_addr, BRW_TYPE_UD, 0), + brw_imm_ud(second_group_offset)); + } else { + ubld.group(1, 0).MOV(byte_offset(header, 0), + subscript(globals_addr, BRW_TYPE_UD, 0)); + } ubld.group(1, 0).MOV(byte_offset(header, 4), subscript(globals_addr, BRW_TYPE_UD, 1)); } diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h index f71273efb53..f74f6179039 100644 --- a/src/intel/compiler/brw/brw_rt.h +++ b/src/intel/compiler/brw/brw_rt.h @@ -38,6 +38,13 @@ extern "C" { /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */ #define BRW_RT_DISPATCH_GLOBALS_SIZE 80 +/** RT_DISPATCH_GLOBALS alignment + * + * Use 256B to make sure we can access the pair of RT_DISPATCH_GLOBALS without + * 64bit math. + */ +#define BRW_RT_DISPATCH_GLOBALS_ALIGN 256 + /** Offset after the RT dispatch globals at which "push" constants live */ #define BRW_RT_PUSH_CONST_OFFSET 128 diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 20fed27e1ac..8d7d3d6d80d 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -4161,7 +4161,11 @@ struct anv_push_constants { */ uint32_t surfaces_base_offset; - /** Ray query globals (RT_DISPATCH_GLOBALS) */ + /** Ray query globals + * + * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see + * genX(cmd_buffer_ray_query_globals)) + */ uint64_t ray_query_globals; union { diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 03491fdb322..1019b1eb0dd 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -817,7 +817,8 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) struct anv_state state = anv_cmd_buffer_alloc_temporary_state(cmd_buffer, - BRW_RT_DISPATCH_GLOBALS_SIZE, 64); + 2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 
64), + BRW_RT_DISPATCH_GLOBALS_ALIGN); struct brw_rt_scratch_layout layout; uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in * some cases? @@ -827,23 +828,28 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer); - const struct GENX(RT_DISPATCH_GLOBALS) rtdg = { - .MemBaseAddress = (struct anv_address) { - /* The ray query HW computes offsets from the top of the buffer, so - * let the address at the end of the buffer. - */ - .bo = device->ray_query_bo[idx], - .offset = device->ray_query_bo[idx]->size - }, - .AsyncRTStackSize = layout.ray_stack_stride / 64, - .NumDSSRTStacks = layout.stack_ids_per_dss, - .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, - .Flags = RT_DEPTH_TEST_LESS_EQUAL, - .ResumeShaderTable = (struct anv_address) { - .bo = cmd_buffer->state.ray_query_shadow_bo, - }, - }; - GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg); + for (uint32_t i = 0; i < 2; i++) { + const struct GENX(RT_DISPATCH_GLOBALS) rtdg = { + .MemBaseAddress = (struct anv_address) { + /* The ray query HW computes offsets from the top of the buffer, so + * let the address at the end of the buffer. + */ + .bo = device->ray_query_bo[idx], + .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2), + }, + .AsyncRTStackSize = layout.ray_stack_stride / 64, + .NumDSSRTStacks = layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .ResumeShaderTable = (struct anv_address) { + .bo = cmd_buffer->state.ray_query_shadow_bo, + }, + }; + GENX(RT_DISPATCH_GLOBALS_pack)( + NULL, + state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64), + &rtdg); + } return anv_cmd_buffer_temporary_state_address(cmd_buffer, state); #else