From e63ffc2f04ef2922a34fccbec4e81b689737bfc9 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev <dpiliaiev@igalia.com>
Date: Fri, 3 Dec 2021 13:23:18 +0200
Subject: [PATCH] freedreno,tu: Limit the amount of instructions preloaded into
 icache

Judging by the blob's cmdstream, the size of the shader instruction
cache (in instrlen units of 128 bytes) for:
- a630 is 64
- a650 is 128
- a660 is 128

On a650 and a660 the GPU could hang if we exceed the limit, though
the hang is not reproducible with computerator or a single amber
test. Also, while the blob limits the size to 128, Turnip still
hangs with that limit but does not hang with a limit of 127.

On a630 there seems to be no hang when the limit is exceeded.

Fixes the hang of compute shader in Alien Isolation on a650/a660.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14044>
---
 src/freedreno/common/freedreno_dev_info.h        | 6 ++++++
 src/freedreno/common/freedreno_devices.py        | 6 ++++++
 src/freedreno/computerator/a6xx.c                | 4 +++-
 src/freedreno/vulkan/tu_pipeline.c               | 5 ++++-
 src/gallium/drivers/freedreno/a6xx/fd6_program.c | 5 ++++-
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 844a28d511c..0ba952815e1 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -60,6 +60,12 @@ struct fd_dev_info {
 
          uint32_t reg_size_vec4;
 
+         /* The size (in instrlen units (128 bytes)) of instruction cache where
+          * we preload a shader. Loading more than this could trigger a hang
+          * on gen3 and later.
+          */
+         uint32_t instr_cache_size;
+
          /* Whether the PC_MULTIVIEW_MASK register exists. */
          bool supports_multiview_mask;
 
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index e13b5c6c2c1..d787a688d47 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -206,6 +206,7 @@ add_gpus([
 a6xx_gen1 = dict(
         fibers_per_sp = 128 * 16,
         reg_size_vec4 = 96,
+        instr_cache_size = 64,
         ccu_cntl_gmem_unk2 = True,
         indirect_draw_wfm_quirk = True,
         depth_bounds_require_depth_test_quirk = True,
@@ -218,6 +219,7 @@ a6xx_gen1 = dict(
 a6xx_gen2 = dict(
         fibers_per_sp = 128 * 4 * 16,
         reg_size_vec4 = 96,
+        instr_cache_size = 64, # TODO
         supports_multiview_mask = True,
         has_z24uint_s8uint = True,
         indirect_draw_wfm_quirk = True,
@@ -231,6 +233,8 @@ a6xx_gen2 = dict(
 a6xx_gen3 = dict(
         fibers_per_sp = 128 * 2 * 16,
         reg_size_vec4 = 64,
+        # Blob limits it to 128 but we hang with 128
+        instr_cache_size = 127,
         supports_multiview_mask = True,
         has_z24uint_s8uint = True,
         tess_use_shared = True,
@@ -249,6 +253,8 @@ a6xx_gen3 = dict(
 a6xx_gen4 = dict(
         fibers_per_sp = 128 * 2 * 16,
         reg_size_vec4 = 64,
+        # Blob limits it to 128 but we hang with 128
+        instr_cache_size = 127,
         supports_multiview_mask = True,
         has_z24uint_s8uint = True,
         tess_use_shared = True,
diff --git a/src/freedreno/computerator/a6xx.c b/src/freedreno/computerator/a6xx.c
index 188cde2118e..64d79b2c4e6 100644
--- a/src/freedreno/computerator/a6xx.c
+++ b/src/freedreno/computerator/a6xx.c
@@ -196,12 +196,14 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
    OUT_RELOC(ring, v->bo, 0, 0, 0);
 
+   uint32_t shader_preload_size =
+      MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size);
    OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
    OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
-                     CP_LOAD_STATE6_0_NUM_UNIT(v->instrlen));
+                     CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
    OUT_RELOC(ring, v->bo, 0, 0, 0);
 
    if (v->pvtmem_size > 0) {
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 136f30224dd..6694913d6d1 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -545,12 +545,15 @@ tu6_emit_xs(struct tu_cs *cs,
    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
 
+   uint32_t shader_preload_size =
+      MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
+
    tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                   CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                  CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen));
+                  CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
    tu_cs_emit_qw(cs, binary_iova);
 
    /* emit immediates */
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
index 6a4cfc6ed79..698f84cf801 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
@@ -143,12 +143,15 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
    OUT_PKT4(ring, hw_stack_offset, 1);
    OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size));
 
+   uint32_t shader_preload_size =
+      MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size);
+
    OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
    OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                      CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
-                     CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen));
+                     CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
    OUT_RELOC(ring, so->bo, 0, 0, 0);
 }