aco,nir: Add support for new GFX12 ray tracing instructions

Adds image_bvh_dual_intersect_ray and image_bvh8_intersect_ray, which can handle the new BVH format. Both instructions write up to 10 VGPRs, so they need to use a vec16 definition in NIR.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34273>
2025-12-20 18:10:11 +01:00 · 2025-03-12 22:43:57 +01:00 · 2025-03-12 22:43:57 +01:00 · 978e9b670e
commit 978e9b670e
parent ee0f784858
12 changed files with 80 additions and 6 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@ -838,7 +838,7 @@ emit_mimg_instruction_gfx12(asm_context& ctx, std::vector<uint32_t>& out, const

   encoding = 0;
   if (!instr->definitions.empty())
-      encoding |= reg(ctx, instr->definitions[0], 8); /* VDATA */
+      encoding |= reg(ctx, instr->definitions.back(), 8); /* VDATA */
   else if (!instr->operands[2].isUndefined())
      encoding |= reg(ctx, instr->operands[2], 8); /* VDATA */
   encoding |= reg(ctx, instr->operands[0]) << 9;  /* T# (resource) */
--- a/src/amd/compiler/aco_form_hard_clauses.cpp
+++ b/src/amd/compiler/aco_form_hard_clauses.cpp
@ -53,7 +53,9 @@ get_type(Program* program, aco_ptr<Instruction>& instr)
      if (instr->isMIMG()) {
         switch (instr->opcode) {
         case aco_opcode::image_bvh_intersect_ray:
-         case aco_opcode::image_bvh64_intersect_ray: return clause_bvh;
+         case aco_opcode::image_bvh64_intersect_ray:
+         case aco_opcode::image_bvh_dual_intersect_ray:
+         case aco_opcode::image_bvh8_intersect_ray: return clause_bvh;
         case aco_opcode::image_atomic_swap:
         case aco_opcode::image_atomic_cmpswap:
         case aco_opcode::image_atomic_add:
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@ -650,8 +650,8 @@ gen(Instruction* instr, wait_ctx& ctx)

      update_counters(ctx, ev, get_sync_info(instr));

-      if (!instr->definitions.empty())
-         insert_wait_entry(ctx, instr->definitions[0], ev, type);
+      for (auto& definition : instr->definitions)
+         insert_wait_entry(ctx, definition, ev, type);

      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
         update_counters(ctx, event_vmem_gpr_lock);
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@ -5962,6 +5962,38 @@ visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
   emit_split_vector(ctx, dst, instr->def.num_components);
 }

+void
+visit_bvh8_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
+{ /* Select image_bvh8_intersect_ray for the NIR bvh8_intersect_ray_amd intrinsic.
+   *
+   * Intrinsic sources, as read below:
+   *   src[0] HW descriptor (used as the resource operand, not moved to VGPRs here)
+   *   src[1] BVH base
+   *   src[2] instance cull mask
+   *   src[3] ray extent (tmax)
+   *   src[4] ray origin
+   *   src[5] ray direction
+   *   src[6] node ID
+   * All sources except the descriptor are forced into VGPRs with as_vgpr().
+   *
+   * The 16-component destination is assembled at the end of this function
+   * from the three definitions of the MIMG instruction.
+   */
+   Builder bld(ctx->program, ctx->block);
+   Temp dst = get_ssa_temp(ctx, &instr->def);
+   Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
+   Temp bvh_base = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+   Temp cull_mask = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
+   Temp tmax = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
+   Temp origin = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[4].ssa));
+   Temp dir = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[5].ssa));
+   Temp node_id = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[6].ssa));
+   /* Three separate results: a 10-dword result plus two 3-dword vectors. */
+   Temp result = bld.tmp(v10);
+   Temp new_origin = bld.tmp(v3);
+   Temp new_dir = bld.tmp(v3);
+   /* tmax and cull_mask are packed into a single v2 address operand (tmax first). */
+   std::vector<Temp> args = {bvh_base,
+                             bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmax, cull_mask),
+                             origin, dir, node_id};
+   /* result is passed as the LAST definition: emit_mimg_instruction_gfx12()
+    * encodes VDATA from definitions.back().
+    * NOTE(review): get_ops_fixed_to_def() lists VADDR operands 3 + 2 and 3 + 3
+    * for this opcode; presumably those are origin and dir, tied to new_origin
+    * and new_dir. Confirm against the operand layout built by emit_mimg().
+    */
+   MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh8_intersect_ray,
+                                      {new_origin, new_dir, result}, resource, Operand(s4), args);
+   mimg->dim = ac_image_1d;
+   mimg->dmask = 0xf;
+   mimg->unrm = true;
+   mimg->r128 = true;
+   /* Build dst: result -> components 0-9, new_origin -> 10-12, new_dir -> 13-15. */
+   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(result), Operand(new_origin),
+              Operand(new_dir));
+}
+
 static std::vector<Temp>
 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
 {
@ -8787,6 +8819,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
      break;
   }
   case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
+   case nir_intrinsic_bvh8_intersect_ray_amd: visit_bvh8_intersect_ray_amd(ctx, instr); break;
   case nir_intrinsic_load_resume_shader_address_amd: {
      bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
                 bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@ -562,6 +562,7 @@ init_context(isel_context* ctx, nir_shader* shader)
               case nir_intrinsic_load_initial_edgeflags_amd:
               case nir_intrinsic_gds_atomic_add_amd:
               case nir_intrinsic_bvh64_intersect_ray_amd:
+               case nir_intrinsic_bvh8_intersect_ray_amd:
               case nir_intrinsic_load_vector_arg_amd:
               case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
               case nir_intrinsic_cmat_muladd_amd:
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@ -1414,7 +1414,6 @@ aco::small_vec<uint32_t, 2>
 get_ops_fixed_to_def(Instruction* instr)
 {
   aco::small_vec<uint32_t, 2> ops;
-
   if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
       instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
       instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
@ -1432,6 +1431,10 @@ get_ops_fixed_to_def(Instruction* instr)
   } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
              !instr->operands[2].isUndefined()) {
      ops.push_back(2);
+   } else if (instr->opcode == aco_opcode::image_bvh8_intersect_ray) {
+      /* VADDR starts at 3. */
+      ops.push_back(3 + 2);
+      ops.push_back(3 + 3);
   }
   return ops;
 }
@ -1439,7 +1442,8 @@ get_ops_fixed_to_def(Instruction* instr)
 uint8_t
 get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
+   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray ||
+       instr->opcode == aco_opcode::image_bvh8_intersect_ray)
      return vmem_bvh;
   else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
      return vmem_sampler;
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -292,6 +292,7 @@ struct RegClass {
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
+      v10 = 10 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
@ -360,6 +361,7 @@ static constexpr RegClass v5{RegClass::v5};
 static constexpr RegClass v6{RegClass::v6};
 static constexpr RegClass v7{RegClass::v7};
 static constexpr RegClass v8{RegClass::v8};
+static constexpr RegClass v10{RegClass::v10};
 static constexpr RegClass v1b{RegClass::v1b};
 static constexpr RegClass v2b{RegClass::v2b};
 static constexpr RegClass v3b{RegClass::v3b};
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@ -1867,6 +1867,8 @@ MIMG = {
   ("image_gather4_c_lz_o",      op(0x5f, gfx11=0x37)),
   ("image_bvh_intersect_ray",   op(gfx10=0xe6, gfx11=0x19)),
   ("image_bvh64_intersect_ray", op(gfx10=0xe7, gfx11=0x1a)),
+   ("image_bvh_dual_intersect_ray", op(gfx12=0x80)),
+   ("image_bvh8_intersect_ray",  op(gfx12=0x81)),
 }
 for (name, num) in MIMG:
   insn(name, num, Format.MIMG, InstrClass.VMem, is_atomic = "atomic" in name)
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@ -887,6 +887,8 @@ validate_ir(Program* program)
                           program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4;
                        if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
                            instr->opcode != aco_opcode::image_bvh64_intersect_ray &&
+                            instr->opcode != aco_opcode::image_bvh_dual_intersect_ray &&
+                            instr->opcode != aco_opcode::image_bvh8_intersect_ray &&
                            i < 3 + num_scalar) {
                           check(instr->operands[i].regClass() == v1,
                                 "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@ -314,6 +314,7 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, s
      gather_intrinsic_store_output_info(nir, instr, info, consider_force_vrs);
      break;
   case nir_intrinsic_bvh64_intersect_ray_amd:
+   case nir_intrinsic_bvh8_intersect_ray_amd:
      info->cs.uses_rt = true;
      break;
   case nir_intrinsic_load_poly_line_smooth_enabled:
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@ -693,6 +693,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_is_sparse_resident_zink:
   case nir_intrinsic_sparse_residency_code_and:
   case nir_intrinsic_bvh64_intersect_ray_amd:
+   case nir_intrinsic_bvh8_intersect_ray_amd:
   case nir_intrinsic_image_deref_load_param_intel:
   case nir_intrinsic_image_load_raw_intel:
   case nir_intrinsic_get_ubo_size:
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@ -1777,6 +1777,32 @@ system_value("sbt_base_amd", 1, bit_sizes=[64])
 # 6. inverse ray direction (componentwise 1.0/ray direction)
 intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, flags=[CAN_ELIMINATE, CAN_REORDER])

+# 1. HW descriptor
+# 2. BVH base
+# 3. instance cull mask
+# 4. ray extent
+# 5. ray origin
+# 6. ray direction
+# 7. node ID
+#
+# dst:
+# | component | box node    | instance node        | triangle node                     | procedural node                   |
+# |-----------|-------------|----------------------|-----------------------------------|-----------------------------------|
+# | 0         | child_id[0] |                      | t[0]                              |                                   |
+# | 1         | child_id[1] |                      | u[0]                              |                                   |
+# | 2         | child_id[2] | blas_addr_lo         | v[0]                              |                                   |
+# | 3         | child_id[3] | blas_addr_hi         | primitive_index_hit_kind[0]       | primitive_index                   |
+# | 4         | child_id[4] |                      | t[1]                              |                                   |
+# | 5         | child_id[5] |                      | u[1]                              |                                   |
+# | 6         | child_id[6] | user_data            | v[1]                              |                                   |
+# | 7         | child_id[7] | next_node_ids        | primitive_index_hit_kind[1]       |                                   |
+# | 8         |             |                      | geometry_index_navigation_bits[0] | geometry_index_navigation_bits[0] |
+# | 9         |             |                      | geometry_index_navigation_bits[1] | geometry_index_navigation_bits[1] |
+# | [10,12]   |             | object_ray_origin    |                                   |                                   |
+# | [13,15]   |             | object_ray_direction |                                   |                                   |
+#
+intrinsic("bvh8_intersect_ray_amd", [4, 2, 1, 1, 3, 3, 1], 16, flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Return of a callable in raytracing pipelines
 intrinsic("rt_return_amd")