diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 08a66be8d43..172704915ba 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -513,7 +513,7 @@ emit_ds_instruction(asm_context& ctx, std::vector& out, const Instruct out.push_back(encoding); encoding = 0; if (!instr->definitions.empty()) - encoding |= reg(ctx, instr->definitions[0], 8) << 24; + encoding |= reg(ctx, instr->definitions.back(), 8) << 24; for (unsigned i = 0; i < MIN2(instr->operands.size(), 3); i++) { const Operand& op = instr->operands[i]; if (op.physReg() != m0 && !op.isUndefined()) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index ffdb736f79f..7025add394c 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -569,7 +569,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(7) ("sopp", [Format.SOPP], [(0, 0), (0, 1)]), ("sopc", [Format.SOPC], [(1, 2)]), ("smem", [Format.SMEM], [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (1, 1), (0, 0)]), - ("ds", [Format.DS], [(1, 0), (1, 1), (1, 2), (1, 3), (0, 3), (0, 4)]), + ("ds", [Format.DS], [(1, 0), (1, 1), (1, 2), (1, 3), (0, 3), (0, 4), (2, 3)]), ("ldsdir", [Format.LDSDIR], [(1, 1)]), ("mubuf", [Format.MUBUF], [(0, 4), (1, 3), (1, 4)]), ("mtbuf", [Format.MTBUF], [(0, 4), (1, 3)]), diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 0eb6605ed50..1fb30571b86 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -696,8 +696,8 @@ gen(Instruction* instr, wait_ctx& ctx) if (ds.gds) update_counters(ctx, event_gds_gpr_lock); - if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds); + for (auto& definition : instr->definitions) + insert_wait_entry(ctx, definition, ds.gds ? 
event_gds : event_lds); if (ds.gds) { for (const Operand& op : instr->operands) diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 964bc8025b5..f706cd8822d 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -1444,7 +1444,8 @@ get_tied_defs(Instruction* instr) instr->opcode == aco_opcode::s_fmac_f16) { ops.push_back(2); } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 || - instr->opcode == aco_opcode::s_cmovk_i32) { + instr->opcode == aco_opcode::s_cmovk_i32 || + instr->opcode == aco_opcode::ds_bvh_stack_push4_pop1_rtn_b32) { ops.push_back(0); } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) { ops.push_back(3); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index b8601cbfebc..d577556a562 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1598,6 +1598,14 @@ static_assert(sizeof(VINTRP_instruction) == sizeof(Instruction) + 4, "Unexpected * Operand(n-1): M0 - LDS size. * Definition(0): VDST - Destination VGPR when results returned to VGPRs. 
* + * For ds_bvh_stack* instructions: + * + * Operand(0): ADDR - VGPR supplying the stack address (overwritten with stack address after push) + * Operand(1): LVADDR - VGPR supplying the last visited node ID + * Operand(2): DATA - VGPR supplying the result of bvh*_intersect_ray + * Definition(0): ADDR - new stack address (tied to Operand(0)) + * Definition(1): VDST - next node ID to test for intersection + * */ struct DS_instruction : public Instruction { memory_sync_info sync; diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 2c3df623a58..765c1e223a5 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -1649,6 +1649,7 @@ DS = { ("ds_pk_add_rtn_f16", op(gfx12=0xaa)), ("ds_pk_add_bf16", op(gfx12=0x9b)), ("ds_pk_add_rtn_bf16", op(gfx12=0xab)), + ("ds_bvh_stack_push4_pop1_rtn_b32", op(gfx11=0xad, gfx12=0xe0)), #ds_bvh_stack_rtn in GFX11 } for (name, num) in DS: insn(name, num, Format.DS, InstrClass.DS) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 0a8fa65efae..48a5740754a 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1518,7 +1518,8 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (has_usable_ds_offset && i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && base.regClass() == instr->operands[i].regClass() && - instr->opcode != aco_opcode::ds_swizzle_b32) { + instr->opcode != aco_opcode::ds_swizzle_b32 && + instr->opcode != aco_opcode::ds_bvh_stack_push4_pop1_rtn_b32) { if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || instr->opcode == aco_opcode::ds_write2_b64 || diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 6e39c423332..6122b86d7b1 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -912,9 +912,10 @@ validate_ir(Program* program) 
check(op.isOfType(RegType::vgpr) || op.physReg() == m0 || op.isUndefined(), "Only VGPRs are valid DS instruction operands", instr.get()); } - if (!instr->definitions.empty()) - check(instr->definitions[0].regClass().type() == RegType::vgpr, - "DS instruction must return VGPR", instr.get()); + for (const Definition& def : instr->definitions) { + check(def.regClass().type() == RegType::vgpr, "DS instruction must return VGPR", + instr.get()); + } break; } case Format::EXP: { diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 9fc4df236b5..6e65c81b023 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -4018,6 +4018,40 @@ pops_await_overlapped_waves(isel_context* ctx) bld.reset(ctx->block); } +uint16_t +ds_bvh_stack_offset1_gfx11(unsigned stack_size) +{ + switch (stack_size) { + case 8: return 0x00; + case 16: return 0x10; + case 32: return 0x20; + case 64: return 0x30; + default: unreachable("invalid stack size"); + } +} + +void +emit_ds_bvh_stack_push4_pop1_rtn(isel_context* ctx, nir_intrinsic_instr* instr, Builder& bld) +{ + Temp dst = get_ssa_temp(ctx, &instr->def); + Temp stack_addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp last_node = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + Temp intersection_result = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); + + Temp dst_stack_addr = bld.tmp(v1); + Temp dst_node_pointer = bld.tmp(v1); + uint32_t offset0 = 0, offset1 = 0; + if (ctx->program->gfx_level >= GFX12) + offset0 = nir_intrinsic_stack_size(instr); + else + offset1 = ds_bvh_stack_offset1_gfx11(nir_intrinsic_stack_size(instr)); + bld.ds(aco_opcode::ds_bvh_stack_push4_pop1_rtn_b32, Definition(dst_stack_addr), + Definition(dst_node_pointer), Operand(stack_addr), Operand(last_node), + Operand(intersection_result), offset0, 
offset1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(dst_stack_addr), + Operand(dst_node_pointer)); +} + } // namespace void @@ -5056,6 +5090,13 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), Operand::c32(nir_intrinsic_base(instr))); break; + case nir_intrinsic_bvh_stack_rtn_amd: { + switch (instr->num_components) { + case 4: emit_ds_bvh_stack_push4_pop1_rtn(ctx, instr, bld); break; + default: unreachable("Invalid BVH stack component count!"); + } + break; + } default: isel_err(&instr->instr, "Unimplemented intrinsic instr"); abort(); diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index e2d7a6f8648..d92da173892 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -959,6 +959,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_agx: case nir_intrinsic_load_shared_lock_nv: case nir_intrinsic_store_shared_unlock_nv: + case nir_intrinsic_bvh_stack_rtn_amd: is_divergent = true; break; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index dd49204045a..360b498fca1 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1868,6 +1868,15 @@ intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, flags=[CAN_ELIMINATE # intrinsic("bvh8_intersect_ray_amd", [4, 2, 1, 1, 3, 3, 1], 16, flags=[CAN_ELIMINATE, CAN_REORDER]) +# operands: +# 1. stack address +# 2. previous node pointer +# 3. BVH node pointers +# returns: +# component 0: next stack address +# component 1: next node pointer +intrinsic("bvh_stack_rtn_amd", [1, 1, 0], 2, indices=[STACK_SIZE]) + # Return of a callable in raytracing pipelines intrinsic("rt_return_amd")