mesa/src/intel/compiler/brw_nir_lower_rt_intrinsics.c

/*
 * Copyright (c) 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
static nir_def *
nir_build_vec3_mat_mult_col_major(nir_builder *b, nir_def *vec,
                                  nir_def *matrix[], bool translation)
{
   nir_def *result_components[3] = {
      nir_channel(b, matrix[3], 0),
      nir_channel(b, matrix[3], 1),
      nir_channel(b, matrix[3], 2),
   };
   for (unsigned i = 0; i < 3; ++i) {
      for (unsigned j = 0; j < 3; ++j) {
         nir_def *v = nir_fmul(b, nir_channels(b, vec, 1 << j),
                                  nir_channels(b, matrix[j], 1 << i));
         result_components[i] =
            (translation || j) ? nir_fadd(b, result_components[i], v) : v;
      }
   }
   return nir_vec(b, result_components, 3);
}
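
/* Returns true if the current hit's BVH leaf is procedural (AABB) geometry
 * rather than a triangle.  For any-hit and intersection shaders the answer
 * is known at compile time; other stages check the leaf type from MemHit.
 */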
static nir_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
   switch (b->shader->info.stage) {
   case MESA_SHADER_ANY_HIT:
      /* Any-hit shaders for procedural geometry are always inlined into the
       * corresponding intersection shader, so if we got here in a standalone
       * any-hit shader, it's for triangles.
       */
      return nir_imm_false(b);

   case MESA_SHADER_INTERSECTION:
      return nir_imm_true(b);

   default:
      return nir_ieq_imm(b, hit->leaf_type,
                         BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
   }
}

static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct brw_base_prog_key *key,
                         const struct intel_device_info *devinfo)
{
   bool progress = false;

   nir_builder build = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &build;

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals, devinfo);
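
   /* The SW hotzone is a 16B (four dword) record: dword 0 holds the current
    * SW stack offset and dwords 1-3 hold the ray launch ID (see the
    * load_ray_launch_id lowering below).
    */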
   nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT, devinfo);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT, devinfo);
      FALLTHROUGH;
   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD, devinfo);
      break;
   default:
      break;
   }

   nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         b->cursor = nir_after_instr(&intrin->instr);

         nir_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

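         /* Pushing a BTD stack frame bumps the SW stack offset stored in
          * dword 0 of the hotzone so that the child shader sees a fresh
          * stack region; the rt_resume case below undoes this.
          */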
         case nir_intrinsic_btd_stack_push_intel: {
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;
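
            /* Pop the frame pushed before the shader call: rewind the SW
             * stack offset by stack_size and write it back to the hotzone
             * so later scratch_base_ptr loads see the parent's stack base.
             */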
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_push_constant:
            /* We don't want to lower this in the launch trampoline.
             *
             * Also, if the driver chooses to use an inline push address, we
             * can do all the push-constant loading in assign_curb_setup()
             * (more efficient, since the address calculations can use
             * NoMask instructions).
             */
            if (stage == MESA_SHADER_COMPUTE || key->uses_inline_push_addr)
               break;

            sysval = brw_nir_load_global_const(b, intrin,
                        nir_load_btd_global_arg_addr_intel(b),
                        BRW_RT_PUSH_CONST_OFFSET);
            break;

         case nir_intrinsic_load_ray_launch_id:
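            /* The launch ID lives in dwords 1-3 of the hotzone (the 0xe
             * mask selects channels 1, 2 and 3).
             */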
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            if (stage == MESA_SHADER_CLOSEST_HIT) {
               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf,
                                                 hit_in.inst_leaf_ptr,
                                                 devinfo);
               sysval = nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.orig, leaf.world_to_object, true);
            } else {
               sysval = object_ray_in.orig;
            }
            break;

         case nir_intrinsic_load_ray_object_direction:
            if (stage == MESA_SHADER_CLOSEST_HIT) {
               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf,
                                                 hit_in.inst_leaf_ptr,
                                                 devinfo);
               sysval = nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.dir, leaf.world_to_object, false);
            } else {
               sysval = object_ray_in.dir;
            }
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which ray we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id:
            sysval = brw_nir_rt_load_primitive_id_from_hit(b,
                        build_leaf_is_procedural(b, &hit_in),
                        &hit_in);
            break;

         case nir_intrinsic_load_instance_id: {
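            /* Note the leaf field naming: instance_index backs the
             * InstanceId/InstanceIndex built-in, while instance_id backs
             * InstanceCustomIndex (see load_ray_instance_custom_index
             * below).
             */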
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
                                              devinfo);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
                                              devinfo);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
                                              devinfo);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            nir_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                            nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                            nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                               hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            /* We need to fetch the original ray flags we stored in the
             * leaf pointer field, because the ray flags we get here also
             * include any flags set on the pipeline at creation time, and
             * the spec for IncomingRayFlagsKHR says:
             *
             *    "Setting pipeline flags on the raytracing pipeline must
             *    not cause any corresponding flags to be set in variables
             *    with this decoration."
             */
            sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
            break;

         case nir_intrinsic_load_cull_mask:
            sysval = nir_u2u32(b, world_ray_in.ray_mask);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
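            /* The geometry index lives in the low 29 bits of the second
             * dword of the primitive leaf header.
             */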
            nir_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
                                              devinfo);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT. The
             * shader call lowering/splitting pass should have ensured that
             * this value was spilled from the initial shader and unspilled
             * in any resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_ray_triangle_vertex_positions: {
            struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
            brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos,
                                                         hit_in.prim_leaf_ptr);
            sysval = pos.positions[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
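               /* Otherwise read the opaque flag from bit 30 of the second
                * dword of the primitive leaf header.
                */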
               nir_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4),
                                  4, 1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         progress = true;

         if (sysval) {
            nir_def_replace(&intrin->def, sysval);
         }
      }
   }

   nir_progress(true, impl,
                progress ? nir_metadata_none : nir_metadata_control_flow);
}

/** Lower ray-tracing system values and intrinsics
 *
 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
 * hardware functionality and system values represent magic bits that come
 * into the shader from fixed-function hardware.  Ray-tracing, however,
 * looks a bit more like the OpenGL 1.0 world where the underlying hardware
 * is simple and most of the API implementation is software.
 *
 * In particular, most things that are treated as system values (or
 * built-ins in SPIR-V) don't get magically dropped into registers for us.
 * Instead, we have to fetch them from the relevant data structures shared
 * with the ray-tracing hardware.  Most come from either the
 * RT_DISPATCH_GLOBALS or from one of the MemHit data structures.  Some,
 * such as primitive_id, require us to fetch the leaf address from the
 * MemHit struct and then manually read the data out of the BVH.  Instead
 * of trying to emit all this code deep in the back-end where we can't
 * effectively optimize it, we lower it all to global memory access in NIR.
 *
 * Once this pass is complete, the only real system values left are the two
 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
 * btd_global_arg_addr.
 */
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
                            const struct brw_base_prog_key *key,
                            const struct intel_device_info *devinfo)
{
   nir_foreach_function_impl(impl, nir) {
      lower_rt_intrinsics_impl(impl, key, devinfo);
   }
}