From 978e9b670eaef3d6b3984f135af7d1d240938a7d Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Wed, 12 Mar 2025 22:43:57 +0100 Subject: [PATCH] aco,nir: Add support for new GFX12 ray tracing instructions Adds image_bvh_dual_intersect_ray and image_bvh8_intersect_ray which can handle the new BVH format. Both instructions write up to 10 VGPRs so they need to use a vec16 definition in nir. Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_assembler.cpp | 2 +- src/amd/compiler/aco_form_hard_clauses.cpp | 4 ++- src/amd/compiler/aco_insert_waitcnt.cpp | 4 +-- .../compiler/aco_instruction_selection.cpp | 33 +++++++++++++++++++ .../aco_instruction_selection_setup.cpp | 1 + src/amd/compiler/aco_ir.cpp | 8 +++-- src/amd/compiler/aco_ir.h | 2 ++ src/amd/compiler/aco_opcodes.py | 2 ++ src/amd/compiler/aco_validate.cpp | 2 ++ src/amd/vulkan/radv_shader_info.c | 1 + src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 26 +++++++++++++++ 12 files changed, 80 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 5d12fb5000e..7646159caf9 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -838,7 +838,7 @@ emit_mimg_instruction_gfx12(asm_context& ctx, std::vector& out, const encoding = 0; if (!instr->definitions.empty()) - encoding |= reg(ctx, instr->definitions[0], 8); /* VDATA */ + encoding |= reg(ctx, instr->definitions.back(), 8); /* VDATA */ else if (!instr->operands[2].isUndefined()) encoding |= reg(ctx, instr->operands[2], 8); /* VDATA */ encoding |= reg(ctx, instr->operands[0]) << 9; /* T# (resource) */ diff --git a/src/amd/compiler/aco_form_hard_clauses.cpp b/src/amd/compiler/aco_form_hard_clauses.cpp index 73dc3cd63e1..f37168406ff 100644 --- a/src/amd/compiler/aco_form_hard_clauses.cpp +++ b/src/amd/compiler/aco_form_hard_clauses.cpp @@ -53,7 +53,9 @@ get_type(Program* program, aco_ptr& instr) if (instr->isMIMG()) { 
switch (instr->opcode) { case aco_opcode::image_bvh_intersect_ray: - case aco_opcode::image_bvh64_intersect_ray: return clause_bvh; + case aco_opcode::image_bvh64_intersect_ray: + case aco_opcode::image_bvh_dual_intersect_ray: + case aco_opcode::image_bvh8_intersect_ray: return clause_bvh; case aco_opcode::image_atomic_swap: case aco_opcode::image_atomic_cmpswap: case aco_opcode::image_atomic_add: diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 49b6f99b8b8..afc515e098b 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -650,8 +650,8 @@ gen(Instruction* instr, wait_ctx& ctx) update_counters(ctx, ev, get_sync_info(instr)); - if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], ev, type); + for (auto& definition : instr->definitions) + insert_wait_entry(ctx, definition, ev, type); if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) { update_counters(ctx, event_vmem_gpr_lock); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a7ee2e19de5..72ede3e6cc2 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5962,6 +5962,38 @@ visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) emit_split_vector(ctx, dst, instr->def.num_components); } +void +visit_bvh8_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) +{ + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->def); + Temp resource = get_ssa_temp(ctx, instr->src[0].ssa); + Temp bvh_base = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + Temp cull_mask = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); + Temp tmax = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + Temp origin = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[4].ssa)); + Temp dir = as_vgpr(ctx, 
get_ssa_temp(ctx, instr->src[5].ssa)); + Temp node_id = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[6].ssa)); + + Temp result = bld.tmp(v10); + Temp new_origin = bld.tmp(v3); + Temp new_dir = bld.tmp(v3); + + std::vector args = {bvh_base, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmax, cull_mask), + origin, dir, node_id}; + + MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh8_intersect_ray, + {new_origin, new_dir, result}, resource, Operand(s4), args); + mimg->dim = ac_image_1d; + mimg->dmask = 0xf; + mimg->unrm = true; + mimg->r128 = true; + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(result), Operand(new_origin), + Operand(new_dir)); +} + static std::vector get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr) { @@ -8787,6 +8819,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) break; } case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; + case nir_intrinsic_bvh8_intersect_ray_amd: visit_bvh8_intersect_ray_amd(ctx, instr); break; case nir_intrinsic_load_resume_shader_address_amd: { bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr))); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 72661e0c5a8..c276d5bd658 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -562,6 +562,7 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_load_initial_edgeflags_amd: case nir_intrinsic_gds_atomic_add_amd: case nir_intrinsic_bvh64_intersect_ray_amd: + case nir_intrinsic_bvh8_intersect_ray_amd: case nir_intrinsic_load_vector_arg_amd: case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: case nir_intrinsic_cmat_muladd_amd: diff --git a/src/amd/compiler/aco_ir.cpp 
b/src/amd/compiler/aco_ir.cpp index 16fc05df6b0..d0775439838 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -1414,7 +1414,6 @@ aco::small_vec get_ops_fixed_to_def(Instruction* instr) { aco::small_vec ops; - if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 || @@ -1432,6 +1431,10 @@ get_ops_fixed_to_def(Instruction* instr) } else if (instr->isMIMG() && instr->definitions.size() == 1 && !instr->operands[2].isUndefined()) { ops.push_back(2); + } else if (instr->opcode == aco_opcode::image_bvh8_intersect_ray) { + /* VADDR starts at 3. */ + ops.push_back(3 + 2); + ops.push_back(3 + 3); } return ops; } @@ -1439,7 +1442,8 @@ get_ops_fixed_to_def(Instruction* instr) uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr) { - if (instr->opcode == aco_opcode::image_bvh64_intersect_ray) + if (instr->opcode == aco_opcode::image_bvh64_intersect_ray || + instr->opcode == aco_opcode::image_bvh8_intersect_ray) return vmem_bvh; else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) return vmem_sampler; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 4b0ec7aab6f..28e84809b5a 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -292,6 +292,7 @@ struct RegClass { v6 = 6 | (1 << 5), v7 = 7 | (1 << 5), v8 = 8 | (1 << 5), + v10 = 10 | (1 << 5), /* byte-sized register class */ v1b = v1 | (1 << 7), v2b = v2 | (1 << 7), @@ -360,6 +361,7 @@ static constexpr RegClass v5{RegClass::v5}; static constexpr RegClass v6{RegClass::v6}; static constexpr RegClass v7{RegClass::v7}; static constexpr RegClass v8{RegClass::v8}; +static constexpr RegClass v10{RegClass::v10}; static constexpr RegClass v1b{RegClass::v1b}; static constexpr RegClass v2b{RegClass::v2b}; static 
constexpr RegClass v3b{RegClass::v3b}; diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 361535bbf5e..95416bc844a 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -1867,6 +1867,8 @@ MIMG = { ("image_gather4_c_lz_o", op(0x5f, gfx11=0x37)), ("image_bvh_intersect_ray", op(gfx10=0xe6, gfx11=0x19)), ("image_bvh64_intersect_ray", op(gfx10=0xe7, gfx11=0x1a)), + ("image_bvh_dual_intersect_ray", op(gfx12=0x80)), + ("image_bvh8_intersect_ray", op(gfx12=0x81)), } for (name, num) in MIMG: insn(name, num, Format.MIMG, InstrClass.VMem, is_atomic = "atomic" in name) diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 2cad50300a2..02e97c798f1 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -887,6 +887,8 @@ validate_ir(Program* program) program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4; if (instr->opcode != aco_opcode::image_bvh_intersect_ray && instr->opcode != aco_opcode::image_bvh64_intersect_ray && + instr->opcode != aco_opcode::image_bvh_dual_intersect_ray && + instr->opcode != aco_opcode::image_bvh8_intersect_ray && i < 3 + num_scalar) { check(instr->operands[i].regClass() == v1, "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get()); diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index f2fdaed5262..35553d0c39a 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -314,6 +314,7 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, s gather_intrinsic_store_output_info(nir, instr, info, consider_force_vrs); break; case nir_intrinsic_bvh64_intersect_ray_amd: + case nir_intrinsic_bvh8_intersect_ray_amd: info->cs.uses_rt = true; break; case nir_intrinsic_load_poly_line_smooth_enabled: diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 
8c45b4a758c..c906360357c 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -693,6 +693,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_is_sparse_resident_zink: case nir_intrinsic_sparse_residency_code_and: case nir_intrinsic_bvh64_intersect_ray_amd: + case nir_intrinsic_bvh8_intersect_ray_amd: case nir_intrinsic_image_deref_load_param_intel: case nir_intrinsic_image_load_raw_intel: case nir_intrinsic_get_ubo_size: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 322063b3797..8991ddb0c88 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1777,6 +1777,32 @@ system_value("sbt_base_amd", 1, bit_sizes=[64]) # 6. inverse ray direction (componentwise 1.0/ray direction) intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, flags=[CAN_ELIMINATE, CAN_REORDER]) +# 1. HW descriptor +# 2. BVH base +# 3. instance cull mask +# 4. ray extent +# 5. ray origin +# 6. ray direction +# 7. 
node ID
+#
+# dst:
+# | component | box node    | instance node        | triangle node                     | procedural node                   |
+# |-----------|-------------|----------------------|-----------------------------------|-----------------------------------|
+# | 0         | child_id[0] |                      | t[0]                              |                                   |
+# | 1         | child_id[1] |                      | u[0]                              |                                   |
+# | 2         | child_id[2] | blas_addr_lo         | v[0]                              |                                   |
+# | 3         | child_id[3] | blas_addr_hi         | primitive_index_hit_kind[0]       | primitive_index                   |
+# | 4         | child_id[4] |                      | t[1]                              |                                   |
+# | 5         | child_id[5] |                      | u[1]                              |                                   |
+# | 6         | child_id[6] | user_data            | v[1]                              |                                   |
+# | 7         | child_id[7] | next_node_ids        | primitive_index_hit_kind[1]       |                                   |
+# | 8         |             |                      | geometry_index_navigation_bits[0] | geometry_index_navigation_bits[0] |
+# | 9         |             |                      | geometry_index_navigation_bits[1] | geometry_index_navigation_bits[1] |
+# | [10,12]   |             | object_ray_origin    |                                   |                                   |
+# | [13,15]   |             | object_ray_direction |                                   |                                   |
+#
+intrinsic("bvh8_intersect_ray_amd", [4, 2, 1, 1, 3, 3, 1], 16, flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Return of a callable in raytracing pipelines
 intrinsic("rt_return_amd")