From 9f2c6fdca4e4e7e40c8856fedd83a89af4b1aeb8 Mon Sep 17 00:00:00 2001 From: Calder Young Date: Thu, 28 May 2026 11:17:23 -0700 Subject: [PATCH] brw: Move ray payload bitfield generation to NIR This will save us the trouble of faking constant folding for the BVH level and trace ray control values when we lower this intrinsic in the new backends. Reviewed-by: Alyssa Rosenzweig Reviewed-by: Sagar Ghuge Part-of: --- src/compiler/nir/nir_intrinsics.py | 4 +-- src/intel/compiler/brw/brw_eu_defines.h | 6 ++-- src/intel/compiler/brw/brw_from_nir.cpp | 3 +- .../compiler/brw/brw_lower_logical_sends.cpp | 35 +++++-------------- .../compiler/brw/brw_nir_lower_ray_queries.c | 4 +-- .../compiler/brw/brw_nir_lower_shader_calls.c | 16 ++++----- src/intel/compiler/brw/brw_nir_rt.c | 20 +++++------ src/intel/compiler/brw/brw_nir_rt_builder.h | 16 +++++++++ 8 files changed, 50 insertions(+), 54 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 6a1ff384db9..eb7ef7fc009 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2866,8 +2866,8 @@ intrinsic("btd_stack_push_intel", indices=[STACK_SIZE]) intrinsic("btd_retire_intel") # Intel-specific ray-tracing intrinsic -# src[] = { globals, level, operation } SYNCHRONOUS=synchronous -intrinsic("trace_ray_intel", src_comp=[1, 1, 1], indices=[SYNCHRONOUS]) +# src[] = { globals, payload } SYNCHRONOUS=synchronous +intrinsic("trace_ray_intel", src_comp=[1, 1], indices=[SYNCHRONOUS]) # System values used for ray-tracing on Intel system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64]) diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index d2581a2ae81..d26ba1b1c76 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -679,10 +679,8 @@ enum memory_flags { enum rt_logical_srcs { /** Address of the globals */ RT_LOGICAL_SRC_GLOBALS, - /** Level at which the tracing 
should start */ - RT_LOGICAL_SRC_BVH_LEVEL, - /** Type of tracing operation */ - RT_LOGICAL_SRC_TRACE_RAY_CONTROL, + /** Trace ray payloads */ + RT_LOGICAL_SRC_PAYLOADS, /** Synchronous tracing (ray query) */ RT_LOGICAL_SRC_SYNCHRONOUS, diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 683519745b9..2a01556ef30 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -5746,8 +5746,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, brw_reg globals = get_nir_src(ntb, instr->src[0], -1); srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals); - srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1], 0); - srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2], 0); + srcs[RT_LOGICAL_SRC_PAYLOADS] = get_nir_src(ntb, instr->src[1], 0); srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous); /* Bspec 57508, 47937: Structure_SIMD16TraceRayMessage:: RayQuery Enable diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index 9444028a090..8e3ae239d1c 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -2044,22 +2044,9 @@ static void lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst) { const intel_device_info *devinfo = bld.shader->devinfo; - /* The emit_uniformize() in brw_from_nir.cpp will generate an horizontal - * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q - * types in on Gfx12.5, we need to tweak the stride with a value of 1 dword - * so that the MOV operates on 2 components rather than twice the same - * component. - */ - const brw_reg bvh_level = - inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == IMM ? 
- inst->src[RT_LOGICAL_SRC_BVH_LEVEL] : - bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL], - inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL)); - const brw_reg trace_ray_control = - inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == IMM ? - inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] : - bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL], - inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL)); + const brw_reg payload = + bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_PAYLOADS], + inst->components_read(RT_LOGICAL_SRC_PAYLOADS)); const brw_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS]; assert(synchronous_src.file == IMM); const bool synchronous = synchronous_src.ud; @@ -2075,6 +2062,12 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst) const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS]; if (globals_addr.file != UNIFORM) { + /* The emit_uniformize() in brw_from_nir.cpp will generate a horizontal + * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q + * types on Gfx12.5, we need to tweak the stride with a value of 1 dword + * so that the MOV operates on 2 components rather than twice the same + * component. + */ brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD); addr_ud.stride = 1; ubld.group(2, 0).MOV(header, addr_ud); @@ -2105,16 +2098,6 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst) ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous)); const unsigned ex_mlen = inst->exec_size / 8; - brw_reg payload = bld.vgrf(BRW_TYPE_UD); - if (bvh_level.file == IMM && - trace_ray_control.file == IMM) { - uint32_t high = devinfo->ver >= 20 ? 
10 : 9; - bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, high, 8) | - (bvh_level.ud & 0x7))); - } else { - bld.SHL(payload, trace_ray_control, brw_imm_ud(8)); - bld.OR(payload, payload, bvh_level); - } /* When doing synchronous traversal, the HW implicitly computes the * stack_id using the following formula : diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c index 11faecd593e..495868a9c8a 100644 --- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c +++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c @@ -319,8 +319,8 @@ lower_ray_query_intrinsic(nir_builder *b, /* Do not use state->rq_globals, we want a uniform value for the * tracing call. */ - nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b), - level, ctrl, .synchronous = true); + brw_nir_trace_ray(b, nir_load_ray_query_global_intel(b), + level, ctrl, true); struct brw_nir_rt_mem_hit_defs hit_in = {}; brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false, diff --git a/src/intel/compiler/brw/brw_nir_lower_shader_calls.c b/src/intel/compiler/brw/brw_nir_lower_shader_calls.c index 99ea3f67393..26a52d88c83 100644 --- a/src/intel/compiler/brw/brw_nir_lower_shader_calls.c +++ b/src/intel/compiler/brw/brw_nir_lower_shader_calls.c @@ -232,11 +232,11 @@ lower_shader_trace_ray(nir_builder *b, nir_intrinsic_instr *call, void *data) brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD, devinfo); - nir_trace_ray_intel(b, - nir_load_btd_global_arg_addr_intel(b), - nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD), - nir_imm_int(b, GEN_RT_TRACE_RAY_INITIAL), - .synchronous = false); + brw_nir_trace_ray(b, + nir_load_btd_global_arg_addr_intel(b), + nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD), + nir_imm_int(b, GEN_RT_TRACE_RAY_INITIAL), + false); return true; } @@ -359,8 +359,8 @@ brw_nir_create_null_ahs_shader(const struct brw_compiler *compiler, brw_nir_rt_load_mem_hit(b, &hit_in, false, compiler->devinfo); nir_def *ray_level = 
hit_in.bvh_level; nir_def *ray_op = nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT); - nir_trace_ray_intel(b, - nir_load_btd_global_arg_addr_intel(b), - ray_level, ray_op); + brw_nir_trace_ray(b, + nir_load_btd_global_arg_addr_intel(b), + ray_level, ray_op, false); return nir; } diff --git a/src/intel/compiler/brw/brw_nir_rt.c b/src/intel/compiler/brw/brw_nir_rt.c index 1c94b65ad5b..415d5601fec 100644 --- a/src/intel/compiler/brw/brw_nir_rt.c +++ b/src/intel/compiler/brw/brw_nir_rt.c @@ -271,11 +271,11 @@ lower_ray_walk_intrinsics(nir_shader *shader, * optimization passes. */ nir_push_if(&b, nir_imm_true(&b)); - nir_trace_ray_intel(&b, - nir_load_btd_global_arg_addr_intel(&b), - nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT), - nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE), - .synchronous = false); + brw_nir_trace_ray(&b, + nir_load_btd_global_arg_addr_intel(&b), + nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT), + nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE), + false); nir_jump(&b, nir_jump_halt); nir_pop_if(&b, NULL); progress = true; @@ -293,11 +293,11 @@ lower_ray_walk_intrinsics(nir_shader *shader, } nir_push_else(&b, NULL); { - nir_trace_ray_intel(&b, - nir_load_btd_global_arg_addr_intel(&b), - nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT), - nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT), - .synchronous = false); + brw_nir_trace_ray(&b, + nir_load_btd_global_arg_addr_intel(&b), + nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT), + nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT), + false); nir_jump(&b, nir_jump_halt); } nir_pop_if(&b, NULL); diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h index e4521e99f1b..45436614e2a 100644 --- a/src/intel/compiler/brw/brw_nir_rt_builder.h +++ b/src/intel/compiler/brw/brw_nir_rt_builder.h @@ -126,6 +126,22 @@ brw_nir_btd_return(struct nir_builder *b) brw_nir_btd_spawn(b, resume_addr); } +static inline void +brw_nir_trace_ray(nir_builder *b, + nir_def *globals, + nir_def *bvh_level, + nir_def *trace_ray_control, + bool 
synchronous) +{ + nir_trace_ray_intel(b, + globals, + nir_bfi(b, + nir_imm_int(b, INTEL_MASK(10, 8)), + nir_u2u32(b, trace_ray_control), + nir_u2u32(b, bvh_level)), + .synchronous = synchronous); +} + static inline void assert_def_size(nir_def *def, unsigned num_components, unsigned bit_size) {