diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index dc675939215..92d6d1a6ee7 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1649,7 +1649,7 @@ typedef struct {
 
 #include "nir_intrinsics.h"
 
-#define NIR_INTRINSIC_MAX_CONST_INDEX 5
+#define NIR_INTRINSIC_MAX_CONST_INDEX 6
 
 /** Represents an intrinsic
  *
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index fc419581084..71c256933e1 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -254,6 +254,9 @@ index("nir_rounding_mode", "rounding_mode")
 # Whether or not to saturate in conversions
 index("unsigned", "saturate")
 
+# Whether or not trace_ray_intel is synchronous
+index("bool", "synchronous")
+
 intrinsic("nop", flags=[CAN_ELIMINATE])
 
 intrinsic("convert_alu_types", dest_comp=0, src_comp=[0],
@@ -1366,10 +1369,9 @@ intrinsic("btd_stack_push_intel", indices=[STACK_SIZE])
 # src[] = { }
 intrinsic("btd_retire_intel")
 
-# Intel-specific ray-tracing intrinsics
-intrinsic("trace_ray_initial_intel")
-intrinsic("trace_ray_commit_intel")
-intrinsic("trace_ray_continue_intel")
+# Intel-specific ray-tracing intrinsic
+# src[] = { globals, level, operation } SYNCHRONOUS=synchronous
+intrinsic("trace_ray_intel", src_comp=[1, 1, 1], indices=[SYNCHRONOUS])
 
 # System values used for ray-tracing on Intel
 system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64])
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 227604c8210..addf63e870d 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -955,6 +955,19 @@ enum a64_logical_srcs {
    A64_LOGICAL_NUM_SRCS
 };
 
+enum rt_logical_srcs {
+   /** Address of the globals */
+   RT_LOGICAL_SRC_GLOBALS,
+   /** Level at which the tracing should start */
+   RT_LOGICAL_SRC_BVH_LEVEL,
+   /** Type of tracing operation */
+   RT_LOGICAL_SRC_TRACE_RAY_CONTROL,
+   /** Synchronous tracing (ray query) */
+   RT_LOGICAL_SRC_SYNCHRONOUS,
+
+   RT_LOGICAL_NUM_SRCS
+};
+
 #ifdef __cplusplus
 /**
  * Allow brw_urb_write_flags enums to be ORed together.
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 3dbb7b61081..a1a5d0dd319 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6663,31 +6663,53 @@ static void
 lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
-   const fs_reg &bvh_level = inst->src[0];
-   assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
-   const uint32_t trace_ray_control = inst->src[1].ud;
+   const fs_reg &globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
+   const fs_reg &bvh_level =
+      inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
+      inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
+      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
+                       inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
+   const fs_reg &trace_ray_control =
+      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
+      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
+      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
+                       inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
+   const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
+   assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
+   const bool synchronous = synchronous_src.ud;
 
    const unsigned mlen = 1;
    const fs_builder ubld = bld.exec_all().group(8, 0);
    fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
    ubld.MOV(header, brw_imm_ud(0));
-   ubld.group(2, 0).MOV(header,
-                        retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
-   /* TODO: Bit 128 is ray_query */
+   ubld.group(2, 0).MOV(header, retype(globals_addr, BRW_REGISTER_TYPE_UD));
+   if (synchronous)
+      ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
 
    const unsigned ex_mlen = inst->exec_size / 8;
    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
-   const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
-   if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
-      bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
+   if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
+       trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
+      bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
+                                  (bvh_level.ud & 0x7)));
    } else {
-      bld.AND(payload, bvh_level, brw_imm_ud(0x7));
-      if (trc_bits != 0)
-         bld.OR(payload, payload, brw_imm_ud(trc_bits));
+      bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
+      bld.OR(payload, payload, bvh_level);
+   }
+
+   /* When doing synchronous traversal, the HW implicitly computes the
+    * stack_id using the following formula :
+    *
+    *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
+    *
+    * Only in the asynchronous case we need to set the stack_id given from the
+    * payload register.
+    */
+   if (!synchronous) {
+      bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
+              retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
+              brw_imm_uw(0x7ff));
    }
-   bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
-           retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
-           brw_imm_uw(0x7ff));
 
    /* Update the original instruction. */
    inst->opcode = SHADER_OPCODE_SEND;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 63c63389911..acec08dbbab 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3997,6 +3997,29 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
    }
 }
 
+static void
+emit_rt_lsc_fence(const fs_builder &bld, enum lsc_flush_type flush_type)
+{
+   const intel_device_info *devinfo = bld.shader->devinfo;
+
+   const fs_builder ubld = bld.exec_all().group(8, 0);
+   fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+   fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
+                             brw_imm_ud(0) /* desc */,
+                             brw_imm_ud(0) /* ex_desc */,
+                             brw_vec8_grf(0, 0) /* payload */);
+   send->sfid = GFX12_SFID_UGM;
+   send->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
+                                   flush_type, true);
+   send->mlen = 1; /* g0 header */
+   send->ex_mlen = 0;
+   send->size_written = REG_SIZE; /* Temp write for scheduling */
+   send->send_has_side_effects = true;
+
+   ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
+}
+
+
 void
 fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
@@ -4016,27 +4039,6 @@ fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
       bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));
       break;
 
-   case nir_intrinsic_trace_ray_initial_intel:
-      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
-               bld.null_reg_ud(),
-               brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD),
-               brw_imm_ud(GEN_RT_TRACE_RAY_INITAL));
-      break;
-
-   case nir_intrinsic_trace_ray_commit_intel:
-      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
-               bld.null_reg_ud(),
-               brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
-               brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT));
-      break;
-
-   case nir_intrinsic_trace_ray_continue_intel:
-      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
-               bld.null_reg_ud(),
-               brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
-               brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE));
-      break;
-
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -5869,6 +5871,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
       break;
 
+   case nir_intrinsic_trace_ray_intel: {
+      const bool synchronous = nir_intrinsic_synchronous(instr);
+      assert(brw_shader_stage_is_bindless(stage) || synchronous);
+      if (synchronous)
+         emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_EVICT);
+      fs_reg srcs[RT_LOGICAL_NUM_SRCS];
+      srcs[RT_LOGICAL_SRC_GLOBALS] = get_nir_src(instr->src[0]);
+      srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]);
+      srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]);
+      srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
+      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
+               srcs, RT_LOGICAL_NUM_SRCS);
+
+      /* There is no actual value to use in the destination register of the
+       * synchronous trace instruction. All of the communication with the HW
+       * unit happens through memory reads/writes. So to ensure that the
+       * operation has completed before we go read the results in memory, we
+       * need a barrier followed by an invalidate before accessing memory.
+       */
+      if (synchronous) {
+         bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
+         emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_INVALIDATE);
+      }
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
diff --git a/src/intel/compiler/brw_nir_lower_shader_calls.c b/src/intel/compiler/brw_nir_lower_shader_calls.c
index b4c40b112bd..a92b96cbc09 100644
--- a/src/intel/compiler/brw_nir_lower_shader_calls.c
+++ b/src/intel/compiler/brw_nir_lower_shader_calls.c
@@ -214,7 +214,11 @@ lower_shader_calls_instr(struct nir_builder *b, nir_instr *instr, void *data)
          .shader_index_multiplier = sbt_stride,
       };
       brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
-      nir_trace_ray_initial_intel(b);
+      nir_trace_ray_intel(b,
+                          nir_load_btd_global_arg_addr_intel(b),
+                          nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
+                          nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
+                          .synchronous = false);
       return true;
    }
 
diff --git a/src/intel/compiler/brw_nir_rt.c b/src/intel/compiler/brw_nir_rt.c
index fd2e33ae050..b9fc5de9d0b 100644
--- a/src/intel/compiler/brw_nir_rt.c
+++ b/src/intel/compiler/brw_nir_rt.c
@@ -294,7 +294,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
           * optimization passes.
           */
          nir_push_if(&b, nir_imm_true(&b));
-         nir_trace_ray_continue_intel(&b);
+         nir_trace_ray_intel(&b,
+                             nir_load_btd_global_arg_addr_intel(&b),
+                             nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
+                             nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
+                             .synchronous = false);
          nir_jump(&b, nir_jump_halt);
          nir_pop_if(&b, NULL);
          progress = true;
@@ -313,7 +317,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
          }
          nir_push_else(&b, NULL);
          {
-            nir_trace_ray_commit_intel(&b);
+            nir_trace_ray_intel(&b,
+                                nir_load_btd_global_arg_addr_intel(&b),
+                                nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
+                                nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
+                                .synchronous = false);
             nir_jump(&b, nir_jump_halt);
          }
          nir_pop_if(&b, NULL);