anv: Emit state cache invalidation after every compute dispatch

Implement HSD 16028171704/14025112257:
   LSC state cache livelock: once the state cache entries are full,
   subsequent walker dispatches with two threads per thread group may
   get stuck indefinitely because of a state cache livelock.

   One thread is continuously stuck in a loop doing UGM fence + evict and
   UGM read, waiting for the UGM read to return a certain value, while the
   other thread is supposed to update the value that the first thread is
   waiting for. But since the state cache entries are full, the second
   thread never makes progress.

Closes: #12352
Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37128>
This commit is contained in:
Sagar Ghuge 2025-08-07 18:15:54 -07:00 committed by Marge Bot
parent b03cd7bdce
commit 3e0ad0176b
3 changed files with 25 additions and 0 deletions

View file

@ -173,6 +173,22 @@ genX(cmd_buffer_set_coarse_pixel_active)(struct anv_cmd_buffer *cmd_buffer,
#endif #endif
} }
/*
 * Wa_14025112257: request a state cache invalidation on the given command
 * buffer.
 *
 * ANV_PIPE_STATE_CACHE_INVALIDATE_BIT is added through
 * anv_add_pending_pipe_bits() only when the device info `ver` field is at
 * least 20 and anv_cmd_buffer_is_compute_queue() reports true for the
 * command buffer. In every other case this function is a no-op.
 *
 * TODO: Add an INTEL_NEEDS_WA_14025112257 check once the HSD is propagated
 * to all other impacted platforms.
 */
static inline void
genX(cmd_buffer_state_cache_inval_wa_14025112257)(
   struct anv_cmd_buffer *cmd_buffer)
{
   /* Versions below 20 do not get the workaround. */
   if (cmd_buffer->device->info->ver < 20)
      return;

   /* Only command buffers on a compute queue get the workaround. */
   if (!anv_cmd_buffer_is_compute_queue(cmd_buffer))
      return;

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
                             "WA_14025112257");
}
void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
struct anv_device *device, struct anv_device *device,
struct anv_cmd_buffer *cmd_buffer, struct anv_cmd_buffer *cmd_buffer,

View file

@ -461,6 +461,8 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
.MOCS = anv_mocs(cmd_buffer->device, .MOCS = anv_mocs(cmd_buffer->device,
indirect_addr.bo, 0), indirect_addr.bo, 0),
); );
genX(cmd_buffer_state_cache_inval_wa_14025112257)(cmd_buffer);
} }
static inline void static inline void
@ -527,6 +529,8 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
.SystolicModeEnable = prog_data->uses_systolic, .SystolicModeEnable = prog_data->uses_systolic,
#endif #endif
); );
genX(cmd_buffer_state_cache_inval_wa_14025112257)(cmd_buffer);
} }
#else /* #if GFX_VERx10 >= 125 */ #else /* #if GFX_VERx10 >= 125 */
@ -1308,6 +1312,8 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
.body = body, .body = body,
); );
genX(cmd_buffer_state_cache_inval_wa_14025112257)(cmd_buffer);
trace_intel_end_rays(&cmd_buffer->trace, trace_intel_end_rays(&cmd_buffer->trace,
params->launch_size[0], params->launch_size[0],
params->launch_size[1], params->launch_size[1],

View file

@ -666,6 +666,9 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) { anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) {
cw.body = body; cw.body = body;
} }
genX(cmd_buffer_state_cache_inval_wa_14025112257)(state->cmd_buffer);
#else /* GFX_VERx10 < 125 */ #else /* GFX_VERx10 < 125 */
const uint32_t vfe_curbe_allocation = const uint32_t vfe_curbe_allocation =
ALIGN(prog_data->push.per_thread.regs * dispatch.threads + ALIGN(prog_data->push.per_thread.regs * dispatch.threads +