diff --git a/src/amd/compiler/README b/src/amd/compiler/README
index cd5861b1f9a..16e7daee3e4 100644
--- a/src/amd/compiler/README
+++ b/src/amd/compiler/README
@@ -131,3 +131,12 @@ finish and then write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz
 
 Currently, we don't do this.
 
+## RDNA / GFX10 hazards
+
+### SMEM store followed by a load with the same address
+
+We found that an `s_buffer_load` will produce incorrect results if it is preceded
+by an `s_buffer_store` with the same address. Inserting an `s_nop` between them
+does not mitigate the issue, so an `s_waitcnt lgkmcnt(0)` must be inserted.
+This is not mentioned by LLVM among the other GFX10 bugs, but LLVM doesn't use
+SMEM stores, so it's not surprising that they didn't notice it.
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 76fa54f2c17..af609132dee 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -221,6 +221,7 @@ struct wait_ctx {
    uint8_t vs_cnt = 0;
    bool pending_flat_lgkm = false;
    bool pending_flat_vm = false;
+   bool pending_s_buffer_store = false; /* GFX10 workaround */
 
    wait_imm barrier_imm[barrier_count];
 
@@ -244,6 +245,7 @@ struct wait_ctx {
       vs_cnt = std::max(vs_cnt, other->vs_cnt);
       pending_flat_lgkm |= other->pending_flat_lgkm;
       pending_flat_vm |= other->pending_flat_vm;
+      pending_s_buffer_store |= other->pending_s_buffer_store;
 
       for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
       {
@@ -329,6 +331,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
        */
       if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb)
          imm.lgkm = 0;
+
+      /* GFX10: A store followed by a load at the same address causes a problem because
+       * the load doesn't load the correct values unless we wait for the store first.
+       * This is NOT mitigated by an s_nop.
+       *
+       * TODO: Refine this when we have proper alias analysis.
+       */
+      SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
+      if (ctx.pending_s_buffer_store &&
+          !smem->definitions.empty() &&
+          !smem->can_reorder && smem->barrier == barrier_buffer) {
+         imm.lgkm = 0;
+      }
    }
 
    if (instr->format == Format::PSEUDO_BARRIER) {
@@ -407,8 +422,10 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
 
    if (imm.vm == 0)
       ctx.pending_flat_vm = false;
-   if (imm.lgkm == 0)
+   if (imm.lgkm == 0) {
       ctx.pending_flat_lgkm = false;
+      ctx.pending_s_buffer_store = false;
+   }
 
    return imm;
 }
@@ -576,10 +593,16 @@ void gen(Instruction* instr, wait_ctx& ctx)
       break;
    }
    case Format::SMEM: {
+      SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
       update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
 
       if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
+      else if (ctx.chip_class >= GFX10 &&
+               !smem->can_reorder &&
+               smem->barrier == barrier_buffer)
+         ctx.pending_s_buffer_store = true;
+
       break;
    }
    case Format::DS: {
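
Not part of the patch: a self-contained C++ sketch of the bookkeeping the waitcnt change introduces, for readers who want to see the hazard tracking in isolation. The types and helper below (FakeSmemInstr, Ctx, lgkm_wait_before) are made up for illustration and are not ACO code; the real pass keys the decision on SMEM_instruction::can_reorder, barrier_buffer and whether the instruction has definitions (loads define a result, stores do not).

    /* Toy model of the GFX10 SMEM store->load workaround: remember that an
     * SMEM buffer store is outstanding and force s_waitcnt lgkmcnt(0) before
     * the next SMEM buffer load. Not ACO code; all names are hypothetical. */
    #include <cstdio>
    #include <optional>

    struct FakeSmemInstr {
       bool is_store;      /* stands in for instr->definitions.empty() */
       bool buffer_access; /* stands in for !can_reorder && barrier == barrier_buffer */
    };

    struct Ctx {
       bool pending_s_buffer_store = false;
    };

    /* Returns the lgkmcnt value to wait for before `instr`, if any. */
    std::optional<unsigned> lgkm_wait_before(Ctx &ctx, const FakeSmemInstr &instr)
    {
       std::optional<unsigned> wait;
       if (!instr.is_store && instr.buffer_access && ctx.pending_s_buffer_store) {
          wait = 0;                           /* emit s_waitcnt lgkmcnt(0) */
          ctx.pending_s_buffer_store = false; /* the wait clears the hazard */
       }
       if (instr.is_store && instr.buffer_access)
          ctx.pending_s_buffer_store = true;
       return wait;
    }

    int main()
    {
       Ctx ctx;
       FakeSmemInstr store{true, true}, load{false, true};
       lgkm_wait_before(ctx, store);             /* store: no wait, just sets the flag */
       if (auto w = lgkm_wait_before(ctx, load)) /* load after store: wait required */
          printf("insert s_waitcnt lgkmcnt(%u)\n", *w);
       return 0;
    }

A single boolean suffices here for the same reason it does in the patch: any s_waitcnt lgkmcnt(0), whatever triggered it, also clears the hazard, which is why the real code resets pending_s_buffer_store wherever imm.lgkm reaches 0.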