diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index db0c4e3f6bc..8445940dfb9 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -85,15 +85,26 @@ aco_ptr create_s_mov(Definition dst, Operand src); enum sendmsg { sendmsg_none = 0, - _sendmsg_gs = 2, - _sendmsg_gs_done = 3, - sendmsg_save_wave = 4, - sendmsg_stall_wave_gen = 5, - sendmsg_halt_waves = 6, - sendmsg_ordered_ps_done = 7, - sendmsg_early_prim_dealloc = 8, - sendmsg_gs_alloc_req = 9, - sendmsg_id_mask = 0xf, + _sendmsg_gs = 2, /* gfx6 to gfx10.3 */ + _sendmsg_gs_done = 3, /* gfx6 to gfx10.3 */ + sendmsg_hs_tessfactor = 2, /* gfx11+ */ + sendmsg_dealloc_vgprs = 3, /* gfx11+ */ + sendmsg_save_wave = 4, /* gfx8 to gfx10.3 */ + sendmsg_stall_wave_gen = 5, /* gfx9+ */ + sendmsg_halt_waves = 6, /* gfx9+ */ + sendmsg_ordered_ps_done = 7, /* gfx9+ */ + sendmsg_early_prim_dealloc = 8, /* gfx9 to gfx10 */ + sendmsg_gs_alloc_req = 9, /* gfx9+ */ + sendmsg_get_doorbell = 10, /* gfx9 to gfx10.3 */ + sendmsg_get_ddid = 11, /* gfx10 to gfx10.3 */ + sendmsg_rtn_get_doorbell = 128, /* gfx11+ */ + sendmsg_rtn_get_ddid = 129, /* gfx11+ */ + sendmsg_rtn_get_tma = 130, /* gfx11+ */ + sendmsg_rtn_get_realtime = 131, /* gfx11+ */ + sendmsg_rtn_save_wave = 132, /* gfx11+ */ + sendmsg_rtn_get_tba = 133, /* gfx11+ */ + sendmsg_id_mask_gfx6 = 0xf, + sendmsg_id_mask_gfx11 = 0xff, }; inline sendmsg diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index dee0f891ca1..9b77a6996b8 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -333,16 +333,25 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins break; } case aco_opcode::s_sendmsg: { - unsigned id = imm & sendmsg_id_mask; + unsigned id = + gfx_level >= GFX11 ? 
(imm & sendmsg_id_mask_gfx11) : (imm & sendmsg_id_mask_gfx6); + static_assert(_sendmsg_gs == sendmsg_hs_tessfactor); + static_assert(_sendmsg_gs_done == sendmsg_dealloc_vgprs); switch (id) { case sendmsg_none: fprintf(output, " sendmsg(MSG_NONE)"); break; case _sendmsg_gs: - fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "", - imm & 0x20 ? ", emit" : "", imm >> 8); + if (gfx_level >= GFX11) + fprintf(output, " sendmsg(hs_tessfactor)"); + else + fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "", + imm & 0x20 ? ", emit" : "", imm >> 8); break; case _sendmsg_gs_done: - fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "", - imm & 0x20 ? ", emit" : "", imm >> 8); + if (gfx_level >= GFX11) + fprintf(output, " sendmsg(dealloc_vgprs)"); + else + fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "", + imm & 0x20 ? ", emit" : "", imm >> 8); break; case sendmsg_save_wave: fprintf(output, " sendmsg(save_wave)"); break; case sendmsg_stall_wave_gen: fprintf(output, " sendmsg(stall_wave_gen)"); break; @@ -350,6 +359,15 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins case sendmsg_ordered_ps_done: fprintf(output, " sendmsg(ordered_ps_done)"); break; case sendmsg_early_prim_dealloc: fprintf(output, " sendmsg(early_prim_dealloc)"); break; case sendmsg_gs_alloc_req: fprintf(output, " sendmsg(gs_alloc_req)"); break; + case sendmsg_get_doorbell: fprintf(output, " sendmsg(get_doorbell)"); break; + case sendmsg_get_ddid: fprintf(output, " sendmsg(get_ddid)"); break; + case sendmsg_rtn_get_doorbell: fprintf(output, " sendmsg(rtn_get_doorbell)"); break; + case sendmsg_rtn_get_ddid: fprintf(output, " sendmsg(rtn_get_ddid)"); break; + case sendmsg_rtn_get_tma: fprintf(output, " sendmsg(rtn_get_tma)"); break; + case sendmsg_rtn_get_realtime: fprintf(output, " sendmsg(rtn_get_realtime)"); break; + case sendmsg_rtn_save_wave: fprintf(output, " sendmsg(rtn_save_wave)"); break; + case 
sendmsg_rtn_get_tba: fprintf(output, " sendmsg(rtn_get_tba)"); break; + default: fprintf(output, " imm:%u", imm); } break; } diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 6cebcf95622..88498ef4565 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -122,6 +122,7 @@ struct MoveState { }; struct sched_ctx { + amd_gfx_level gfx_level; int16_t num_waves; int16_t last_SMEM_stall; int last_SMEM_dep_idx; @@ -420,20 +421,10 @@ MoveState::upwards_skip(UpwardsCursor& cursor) } bool -is_gs_or_done_sendmsg(const Instruction* instr) +is_done_sendmsg(amd_gfx_level gfx_level, const Instruction* instr) { - if (instr->opcode == aco_opcode::s_sendmsg) { - uint16_t imm = instr->sopp().imm; - return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done; - } - return false; -} - -bool -is_done_sendmsg(const Instruction* instr) -{ - if (instr->opcode == aco_opcode::s_sendmsg) - return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done; + if (gfx_level <= GFX10_3 && instr->opcode == aco_opcode::s_sendmsg) + return (instr->sopp().imm & sendmsg_id_mask_gfx6) == _sendmsg_gs_done; return false; } @@ -464,6 +455,7 @@ struct memory_event_set { }; struct hazard_query { + amd_gfx_level gfx_level; bool contains_spill; bool contains_sendmsg; bool uses_exec; @@ -473,8 +465,9 @@ struct hazard_query { }; void -init_hazard_query(hazard_query* query) +init_hazard_query(const sched_ctx& ctx, hazard_query* query) { + query->gfx_level = ctx.gfx_level; query->contains_spill = false; query->contains_sendmsg = false; query->uses_exec = false; @@ -484,9 +477,10 @@ init_hazard_query(hazard_query* query) } void -add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync) +add_memory_event(amd_gfx_level gfx_level, memory_event_set* set, Instruction* instr, + memory_sync_info* sync) { - set->has_control_barrier |= 
is_done_sendmsg(gfx_level, instr); if (instr->opcode == aco_opcode::p_barrier) { Pseudo_barrier_instruction& bar = instr->barrier(); if (bar.sync.semantics & semantic_acquire) @@ -524,7 +518,7 @@ add_to_hazard_query(hazard_query* query, Instruction* instr) memory_sync_info sync = get_sync_info_with_hack(instr); - add_memory_event(&query->mem_events, instr, &sync); + add_memory_event(query->gfx_level, &query->mem_events, instr, &sync); if (!(sync.semantics & semantic_can_reorder)) { unsigned storage = sync.storage; @@ -580,7 +574,7 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) memory_event_set instr_set; memset(&instr_set, 0, sizeof(instr_set)); memory_sync_info sync = get_sync_info_with_hack(instr); - add_memory_event(&instr_set, instr, &sync); + add_memory_event(query->gfx_level, &instr_set, instr, &sync); memory_event_set* first = &instr_set; memory_event_set* second = &query->mem_events; @@ -655,7 +649,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& registe /* first, check if we have instructions before current to move down */ hazard_query hq; - init_hazard_query(&hq); + init_hazard_query(ctx, &hq); add_to_hazard_query(&hq, current); DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false); @@ -751,7 +745,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& registe if (is_dependency) { if (!found_dependency) { ctx.mv.upwards_update_insert_idx(up_cursor); - init_hazard_query(&hq); + init_hazard_query(ctx, &hq); found_dependency = true; } } @@ -797,8 +791,8 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& registe /* first, check if we have instructions before current to move down */ hazard_query indep_hq; hazard_query clause_hq; - init_hazard_query(&indep_hq); - init_hazard_query(&clause_hq); + init_hazard_query(ctx, &indep_hq); + init_hazard_query(ctx, &clause_hq); add_to_hazard_query(&indep_hq, current); DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true); @@ -923,7 +917,7 @@ 
schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& registe if (is_dependency) { if (!found_dependency) { ctx.mv.upwards_update_insert_idx(up_cursor); - init_hazard_query(&indep_hq); + init_hazard_query(ctx, &indep_hq); found_dependency = true; } } else if (is_vmem) { @@ -967,7 +961,7 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector (int)idx - window_size; @@ -1054,6 +1048,7 @@ schedule_program(Program* program, live& live_vars) demand.vgpr += program->config->num_shared_vgprs / 2; sched_ctx ctx; + ctx.gfx_level = program->gfx_level; ctx.mv.depends_on.resize(program->peekAllocationId()); ctx.mv.RAR_dependencies.resize(program->peekAllocationId()); ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());