anv/brw: prep work for SIMD32 ray queries

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
(cherry picked from commit 6d19b898e7)

Conflicts:
	src/intel/compiler/brw/brw_lower_logical_sends.cpp

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39462>
Lionel Landwerlin 2026-01-06 16:57:46 +02:00 committed by Dylan Baker
parent 4bdf4f3e89
commit 5f119bb7af
6 changed files with 53 additions and 22 deletions


@@ -4144,7 +4144,7 @@
         "description": "anv/brw: prep work for SIMD32 ray queries",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null


@@ -2520,6 +2520,8 @@ system_value("leaf_procedural_intel", 1, bit_sizes=[1])
 # 2: Miss
 # 3: Intersection
 system_value("btd_shader_type_intel", 1)
+# 64-bit pointer to a pair of RT_DISPATCH_GLOBALS structures, each aligned to
+# 64B; the pointer itself must be 256B-aligned.
 system_value("ray_query_global_intel", 1, bit_sizes=[64])
 
 # Source 0: Accumulator matrix (type specified by DEST_TYPE)
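
For reference, the layout this new comment describes: one 256B-aligned base address covers two RT_DISPATCH_GLOBALS copies, each padded out to a 64B boundary, so the second copy sits at a fixed 128B offset. A minimal C sketch of that math (the macro and helper names below are illustrative, not part of the patch):

#include <stdint.h>

/* 80 bytes per RT_DISPATCH_GLOBALS (see gen_rt.xml), padded to the next
 * 64B boundary -> 128B stride between the two copies. */
#define RTDG_SIZE   80u
#define RTDG_STRIDE ((RTDG_SIZE + 63u) & ~63u)   /* == 128 */

/* Address of copy 'half' (0 or 1) given the 256B-aligned base pointer that
 * ray_query_global_intel provides. Hypothetical helper, for illustration. */
static inline uint64_t
rtdg_addr(uint64_t base, unsigned half)
{
   return base + half * RTDG_STRIDE;
}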


@@ -28,6 +28,7 @@
 #include "brw_eu.h"
 #include "brw_shader.h"
 #include "brw_builder.h"
+#include "brw_rt.h"
 #include "util/bitpack_helpers.h"
@@ -2353,11 +2354,16 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst)
    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
    ubld.MOV(header, brw_imm_ud(0));
 
+   const uint32_t second_group_offset =
+      align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+
    const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
    if (globals_addr.file != UNIFORM) {
       brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD);
       addr_ud.stride = 1;
       ubld.group(2, 0).MOV(header, addr_ud);
+      if (inst->group == 16)
+         ubld.group(1, 0).ADD(header, header, brw_imm_ud(second_group_offset));
    } else {
       /* If the globals address comes from a uniform, do not do the SIMD2
        * optimization. This occurs in many Vulkan CTS tests.
@@ -2367,8 +2373,14 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst)
        * UNIFORM will be uniform (i.e., <0,1,0>). The clever SIMD2
        * optimization violates that assumption.
        */
-      ubld.group(1, 0).MOV(byte_offset(header, 0),
-                           subscript(globals_addr, BRW_TYPE_UD, 0));
+      if (inst->group == 16) {
+         ubld.group(1, 0).ADD(byte_offset(header, 0),
+                              subscript(globals_addr, BRW_TYPE_UD, 0),
+                              brw_imm_ud(second_group_offset));
+      } else {
+         ubld.group(1, 0).MOV(byte_offset(header, 0),
+                              subscript(globals_addr, BRW_TYPE_UD, 0));
+      }
       ubld.group(1, 0).MOV(byte_offset(header, 4),
                            subscript(globals_addr, BRW_TYPE_UD, 1));
    }
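
In scalar terms, the lowering above now builds the trace-ray message header like this: the 64-bit globals address is split into two dwords, and for the second SIMD16 half of a SIMD32 dispatch (inst->group == 16) the low dword is bumped by align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64) = 128 so that half uses the second RT_DISPATCH_GLOBALS copy. A rough C sketch of that header math (purely illustrative, not compiler code):

#include <stdint.h>

static inline void
build_rt_header_lo_hi(uint32_t header[2], uint64_t globals_addr, unsigned group)
{
   const uint32_t second_group_offset = (80u + 63u) & ~63u;   /* 128 */

   header[0] = (uint32_t)globals_addr;          /* low dword  */
   header[1] = (uint32_t)(globals_addr >> 32);  /* high dword */

   /* Second SIMD16 half of a SIMD32 dispatch: point at the second copy.
    * Because the base is 256B-aligned, this add can never carry into the
    * high dword, so 32-bit math is enough. */
   if (group == 16)
      header[0] += second_group_offset;
}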


@@ -38,6 +38,13 @@ extern "C" {
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
 #define BRW_RT_DISPATCH_GLOBALS_SIZE 80
 
+/** RT_DISPATCH_GLOBALS alignment
+ *
+ * Use 256B to make sure we can access the pair of RT_DISPATCH_GLOBALS without
+ * 64bit math.
+ */
+#define BRW_RT_DISPATCH_GLOBALS_ALIGN 256
+
 /** Offset after the RT dispatch globals at which "push" constants live */
 #define BRW_RT_PUSH_CONST_OFFSET 128
 
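
The "256B to avoid 64-bit math" argument in the comment above works out as follows: a 256B-aligned base has its low 8 bits clear, so adding the 128B offset of the second copy can never overflow the low dword, and the high dword never needs touching. A small check of that arithmetic (assuming the only offset ever added is align(80, 64) = 128 < 256):

#include <assert.h>
#include <stdint.h>

static void
check_no_carry(uint64_t base)
{
   const uint32_t align_b = 256;
   const uint32_t offset  = 128;   /* align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64) */

   assert((base & (align_b - 1)) == 0);          /* base is 256B-aligned      */
   uint32_t lo = (uint32_t)base;                 /* low 8 bits are zero       */
   assert((uint64_t)lo + offset <= UINT32_MAX);  /* no carry out of low dword */
   assert((base + offset) >> 32 == base >> 32);  /* high dword unchanged      */
}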


@@ -4161,7 +4161,11 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /** Ray query globals (RT_DISPATCH_GLOBALS) */
+   /** Ray query globals
+    *
+    * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
+    * genX(cmd_buffer_ray_query_globals))
+    */
    uint64_t ray_query_globals;
 
    union {


@@ -817,7 +817,8 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
    struct anv_state state =
       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
-                                           BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
+                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
 
    struct brw_rt_scratch_layout layout;
    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                        * some cases?
@@ -827,23 +828,28 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 
    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
-      .MemBaseAddress = (struct anv_address) {
-         /* The ray query HW computes offsets from the top of the buffer, so
-          * let the address at the end of the buffer.
-          */
-         .bo = device->ray_query_bo[idx],
-         .offset = device->ray_query_bo[idx]->size
-      },
-      .AsyncRTStackSize = layout.ray_stack_stride / 64,
-      .NumDSSRTStacks = layout.stack_ids_per_dss,
-      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-      .ResumeShaderTable = (struct anv_address) {
-         .bo = cmd_buffer->state.ray_query_shadow_bo,
-      },
-   };
-   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
+   for (uint32_t i = 0; i < 2; i++) {
+      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+         .MemBaseAddress = (struct anv_address) {
+            /* The ray query HW computes offsets from the top of the buffer, so
+             * let the address at the end of the buffer.
+             */
+            .bo = device->ray_query_bo[idx],
+            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
+         },
+         .AsyncRTStackSize = layout.ray_stack_stride / 64,
+         .NumDSSRTStacks = layout.stack_ids_per_dss,
+         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         .ResumeShaderTable = (struct anv_address) {
+            .bo = cmd_buffer->state.ray_query_shadow_bo,
+         },
+      };
+      GENX(RT_DISPATCH_GLOBALS_pack)(
+         NULL,
+         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+         &rtdg);
+   }
 
    return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
 #else
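
To spell out the new offset math: the temporary state now holds two packed RT_DISPATCH_GLOBALS, copy i at byte offset i * align(4 * RT_DISPATCH_GLOBALS_length, 64), i.e. 0 and 128, inside a 256B allocation aligned to BRW_RT_DISPATCH_GLOBALS_ALIGN. Each copy's MemBaseAddress points at the end of its half of the ray query BO, since the ray query HW computes offsets from the top of the buffer. A rough sketch of those two computations (assumption: RT_DISPATCH_GLOBALS is 80 bytes, i.e. 20 dwords; names below are illustrative):

#include <stdint.h>

#define RTDG_LENGTH_DW 20u                 /* assumed: 80 bytes / 4 */

/* CPU-side offset of copy i inside the temporary state allocation. */
static inline uint32_t
rtdg_pack_offset(uint32_t i)               /* i = 0 or 1 */
{
   uint32_t bytes  = 4u * RTDG_LENGTH_DW;  /* 80  */
   uint32_t stride = (bytes + 63u) & ~63u; /* 128 */
   return i * stride;                      /* 0 or 128 */
}

/* MemBaseAddress offset for copy i: the end of its half of the ray query BO,
 * because the HW works downward from the top of the buffer. */
static inline uint64_t
rtdg_mem_base_offset(uint64_t ray_query_bo_size, uint32_t i)
{
   return (i + 1) * (ray_query_bo_size / 2);
}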