intel/nir: use a single intel intrinsic to deal with ray traversal

In the future we'll want to reuse this intrinsic to deal with ray
queries. Ray queries will use a different global pointer and
programmatically change the control/level arguments of the trace send
instruction.

v2: Comment on barrier after sync trace instruction (Caio)
    Generalize lsc helper (Caio)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13719>
This commit is contained in:
Lionel Landwerlin 2021-06-14 17:30:31 +03:00 committed by Marge Bot
parent 39f6cd5d79
commit bb40e999d1
7 changed files with 121 additions and 44 deletions

View file

@@ -1649,7 +1649,7 @@ typedef struct {
#include "nir_intrinsics.h" #include "nir_intrinsics.h"
#define NIR_INTRINSIC_MAX_CONST_INDEX 5 #define NIR_INTRINSIC_MAX_CONST_INDEX 6
/** Represents an intrinsic /** Represents an intrinsic
* *

View file

@@ -254,6 +254,9 @@ index("nir_rounding_mode", "rounding_mode")
# Whether or not to saturate in conversions # Whether or not to saturate in conversions
index("unsigned", "saturate") index("unsigned", "saturate")
# Whether or not trace_ray_intel is synchronous
index("bool", "synchronous")
intrinsic("nop", flags=[CAN_ELIMINATE]) intrinsic("nop", flags=[CAN_ELIMINATE])
intrinsic("convert_alu_types", dest_comp=0, src_comp=[0], intrinsic("convert_alu_types", dest_comp=0, src_comp=[0],
@@ -1366,10 +1369,9 @@ intrinsic("btd_stack_push_intel", indices=[STACK_SIZE])
# src[] = { } # src[] = { }
intrinsic("btd_retire_intel") intrinsic("btd_retire_intel")
# Intel-specific ray-tracing intrinsics # Intel-specific ray-tracing intrinsic
intrinsic("trace_ray_initial_intel") # src[] = { globals, level, operation } SYNCHRONOUS=synchronous
intrinsic("trace_ray_commit_intel") intrinsic("trace_ray_intel", src_comp=[1, 1, 1], indices=[SYNCHRONOUS])
intrinsic("trace_ray_continue_intel")
# System values used for ray-tracing on Intel # System values used for ray-tracing on Intel
system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64]) system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64])

View file

@@ -955,6 +955,19 @@ enum a64_logical_srcs {
A64_LOGICAL_NUM_SRCS A64_LOGICAL_NUM_SRCS
}; };
enum rt_logical_srcs {
/** Address of the globals */
RT_LOGICAL_SRC_GLOBALS,
/** Level at which the tracing should start */
RT_LOGICAL_SRC_BVH_LEVEL,
/** Type of tracing operation */
RT_LOGICAL_SRC_TRACE_RAY_CONTROL,
/** Synchronous tracing (ray query) */
RT_LOGICAL_SRC_SYNCHRONOUS,
RT_LOGICAL_NUM_SRCS
};
#ifdef __cplusplus #ifdef __cplusplus
/** /**
* Allow brw_urb_write_flags enums to be ORed together. * Allow brw_urb_write_flags enums to be ORed together.

View file

@@ -6663,31 +6663,53 @@ static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst) lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{ {
const intel_device_info *devinfo = bld.shader->devinfo; const intel_device_info *devinfo = bld.shader->devinfo;
const fs_reg &bvh_level = inst->src[0]; const fs_reg &globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
assert(inst->src[1].file == BRW_IMMEDIATE_VALUE); const fs_reg &bvh_level =
const uint32_t trace_ray_control = inst->src[1].ud; inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
const fs_reg &trace_ray_control =
inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
const bool synchronous = synchronous_src.ud;
const unsigned mlen = 1; const unsigned mlen = 1;
const fs_builder ubld = bld.exec_all().group(8, 0); const fs_builder ubld = bld.exec_all().group(8, 0);
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(header, brw_imm_ud(0)); ubld.MOV(header, brw_imm_ud(0));
ubld.group(2, 0).MOV(header, ubld.group(2, 0).MOV(header, retype(globals_addr, BRW_REGISTER_TYPE_UD));
retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD)); if (synchronous)
/* TODO: Bit 128 is ray_query */ ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
const unsigned ex_mlen = inst->exec_size / 8; const unsigned ex_mlen = inst->exec_size / 8;
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD); fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8); if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
if (bvh_level.file == BRW_IMMEDIATE_VALUE) { trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7))); bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
(bvh_level.ud & 0x7)));
} else { } else {
bld.AND(payload, bvh_level, brw_imm_ud(0x7)); bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
if (trc_bits != 0) bld.OR(payload, payload, bvh_level);
bld.OR(payload, payload, brw_imm_ud(trc_bits));
} }
/* When doing synchronous traversal, the HW implicitly computes the
* stack_id using the following formula :
*
* EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
*
* Only in the asynchronous case we need to set the stack_id given from the
* payload register.
*/
if (!synchronous) {
bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1), bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW), retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
brw_imm_uw(0x7ff)); brw_imm_uw(0x7ff));
}
/* Update the original instruction. */ /* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND; inst->opcode = SHADER_OPCODE_SEND;

View file

@@ -3997,6 +3997,29 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
} }
} }
static void
emit_rt_lsc_fence(const fs_builder &bld, enum lsc_flush_type flush_type)
{
const intel_device_info *devinfo = bld.shader->devinfo;
const fs_builder ubld = bld.exec_all().group(8, 0);
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
brw_imm_ud(0) /* desc */,
brw_imm_ud(0) /* ex_desc */,
brw_vec8_grf(0, 0) /* payload */);
send->sfid = GFX12_SFID_UGM;
send->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
flush_type, true);
send->mlen = 1; /* g0 header */
send->ex_mlen = 0;
send->size_written = REG_SIZE; /* Temp write for scheduling */
send->send_has_side_effects = true;
ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
}
void void
fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld, fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr) nir_intrinsic_instr *instr)
@@ -4016,27 +4039,6 @@ fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type)); bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));
break; break;
case nir_intrinsic_trace_ray_initial_intel:
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
bld.null_reg_ud(),
brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD),
brw_imm_ud(GEN_RT_TRACE_RAY_INITAL));
break;
case nir_intrinsic_trace_ray_commit_intel:
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
bld.null_reg_ud(),
brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT));
break;
case nir_intrinsic_trace_ray_continue_intel:
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
bld.null_reg_ud(),
brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE));
break;
default: default:
nir_emit_intrinsic(bld, instr); nir_emit_intrinsic(bld, instr);
break; break;
@@ -5869,6 +5871,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL); bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
break; break;
case nir_intrinsic_trace_ray_intel: {
const bool synchronous = nir_intrinsic_synchronous(instr);
assert(brw_shader_stage_is_bindless(stage) || synchronous);
if (synchronous)
emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_EVICT);
fs_reg srcs[RT_LOGICAL_NUM_SRCS];
srcs[RT_LOGICAL_SRC_GLOBALS] = get_nir_src(instr->src[0]);
srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]);
srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]);
srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
srcs, RT_LOGICAL_NUM_SRCS);
/* There is no actual value to use in the destination register of the
* synchronous trace instruction. All of the communication with the HW
* unit happens through memory reads/writes. So to ensure that the
* operation has completed before we go read the results in memory, we
* need a barrier followed by an invalidate before accessing memory.
*/
if (synchronous) {
bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_INVALIDATE);
}
break;
}
default: default:
unreachable("unknown intrinsic"); unreachable("unknown intrinsic");
} }

View file

@@ -214,7 +214,11 @@ lower_shader_calls_instr(struct nir_builder *b, nir_instr *instr, void *data)
.shader_index_multiplier = sbt_stride, .shader_index_multiplier = sbt_stride,
}; };
brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD); brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
nir_trace_ray_initial_intel(b); nir_trace_ray_intel(b,
nir_load_btd_global_arg_addr_intel(b),
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
.synchronous = false);
return true; return true;
} }

View file

@@ -294,7 +294,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
* optimization passes. * optimization passes.
*/ */
nir_push_if(&b, nir_imm_true(&b)); nir_push_if(&b, nir_imm_true(&b));
nir_trace_ray_continue_intel(&b); nir_trace_ray_intel(&b,
nir_load_btd_global_arg_addr_intel(&b),
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
.synchronous = false);
nir_jump(&b, nir_jump_halt); nir_jump(&b, nir_jump_halt);
nir_pop_if(&b, NULL); nir_pop_if(&b, NULL);
progress = true; progress = true;
@@ -313,7 +317,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
} }
nir_push_else(&b, NULL); nir_push_else(&b, NULL);
{ {
nir_trace_ray_commit_intel(&b); nir_trace_ray_intel(&b,
nir_load_btd_global_arg_addr_intel(&b),
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
.synchronous = false);
nir_jump(&b, nir_jump_halt); nir_jump(&b, nir_jump_halt);
} }
nir_pop_if(&b, NULL); nir_pop_if(&b, NULL);