From 945637514e6e970fcc37745f509eec11ff3b5129 Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Tue, 16 Aug 2022 08:08:43 +0000
Subject: [PATCH] intel/fs: improve Wa_22013689345 workaround

The initial implementation is a pretty big hammer. Implement the HW
recommendation to minimize cases in which we need a fence.

This improves by 10FPS on some of the Sascha Willems RT demos.

Signed-off-by: Lionel Landwerlin
Fixes: 6031ad4bf690 ("intel/fs: Add Wa_22013689345")
Reviewed-by: Francisco Jerez
Part-of:
---
 src/intel/compiler/brw_eu.h   | 37 +++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.cpp | 32 ++++++++++++++++++++++--------
 2 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index f0785046bb6..8b7bcaefa81 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1203,6 +1203,43 @@ lsc_opcode_has_transpose(enum lsc_opcode opcode)
    return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE;
 }
 
+static inline bool
+lsc_opcode_is_store(enum lsc_opcode opcode)
+{
+   return opcode == LSC_OP_STORE ||
+          opcode == LSC_OP_STORE_CMASK;
+}
+
+static inline bool
+lsc_opcode_is_atomic(enum lsc_opcode opcode)
+{
+   switch (opcode) {
+   case LSC_OP_ATOMIC_INC:
+   case LSC_OP_ATOMIC_DEC:
+   case LSC_OP_ATOMIC_LOAD:
+   case LSC_OP_ATOMIC_STORE:
+   case LSC_OP_ATOMIC_ADD:
+   case LSC_OP_ATOMIC_SUB:
+   case LSC_OP_ATOMIC_MIN:
+   case LSC_OP_ATOMIC_MAX:
+   case LSC_OP_ATOMIC_UMIN:
+   case LSC_OP_ATOMIC_UMAX:
+   case LSC_OP_ATOMIC_CMPXCHG:
+   case LSC_OP_ATOMIC_FADD:
+   case LSC_OP_ATOMIC_FSUB:
+   case LSC_OP_ATOMIC_FMIN:
+   case LSC_OP_ATOMIC_FMAX:
+   case LSC_OP_ATOMIC_FCMPXCHG:
+   case LSC_OP_ATOMIC_AND:
+   case LSC_OP_ATOMIC_OR:
+   case LSC_OP_ATOMIC_XOR:
+      return true;
+
+   default:
+      return false;
+   }
+}
+
 static inline uint32_t
 lsc_data_size_bytes(enum lsc_data_size data_size)
 {
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 90a7af86438..363c550c148 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6332,18 +6332,34 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
 {
    /* This workaround is about making sure that any instruction writing
     * through UGM has completed before we hit EOT.
-    *
-    * The workaround talks about UGM writes or atomic message but what is
-    * important is anything that hasn't completed. Usually any SEND
-    * instruction that has a destination register will be read by something
-    * else so we don't need to care about those as they will be synchronized
-    * by other parts of the shader or optimized away. What is left are
-    * instructions that don't have a destination register.
     */
    if (inst->sfid != GFX12_SFID_UGM)
       return false;
 
-   return inst->dst.file == BAD_FILE;
+   /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
+    * where the L1-cache override is NOT among {WB, WS, WT}
+    */
+   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
+   if (lsc_opcode_is_store(opcode)) {
+      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
+      case LSC_CACHE_STORE_L1STATE_L3MOCS:
+      case LSC_CACHE_STORE_L1WB_L3WB:
+      case LSC_CACHE_STORE_L1S_L3UC:
+      case LSC_CACHE_STORE_L1S_L3WB:
+      case LSC_CACHE_STORE_L1WT_L3UC:
+      case LSC_CACHE_STORE_L1WT_L3WB:
+         return false;
+
+      default:
+         return true;
+      }
+   }
+
+   /* Any UGM Atomic message WITHOUT return value */
+   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
+      return true;
+
+   return false;
 }
 
 /* Wa_22013689345