diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index 844a28d511c..0ba952815e1 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -60,6 +60,12 @@ struct fd_dev_info { uint32_t reg_size_vec4; + /* The size (in instrlen units (128 bytes)) of instruction cache where + * we preload a shader. Loading more than this could trigger a hang + * on gen3 and later. + */ + uint32_t instr_cache_size; + /* Whether the PC_MULTIVIEW_MASK register exists. */ bool supports_multiview_mask; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index e13b5c6c2c1..d787a688d47 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -206,6 +206,7 @@ add_gpus([ a6xx_gen1 = dict( fibers_per_sp = 128 * 16, reg_size_vec4 = 96, + instr_cache_size = 64, ccu_cntl_gmem_unk2 = True, indirect_draw_wfm_quirk = True, depth_bounds_require_depth_test_quirk = True, @@ -218,6 +219,7 @@ a6xx_gen1 = dict( a6xx_gen2 = dict( fibers_per_sp = 128 * 4 * 16, reg_size_vec4 = 96, + instr_cache_size = 64, # TODO supports_multiview_mask = True, has_z24uint_s8uint = True, indirect_draw_wfm_quirk = True, @@ -231,6 +233,8 @@ a6xx_gen2 = dict( a6xx_gen3 = dict( fibers_per_sp = 128 * 2 * 16, reg_size_vec4 = 64, + # Blob limits it to 128 but we hang with 128 + instr_cache_size = 127, supports_multiview_mask = True, has_z24uint_s8uint = True, tess_use_shared = True, @@ -249,6 +253,8 @@ a6xx_gen3 = dict( a6xx_gen4 = dict( fibers_per_sp = 128 * 2 * 16, reg_size_vec4 = 64, + # Blob limits it to 128 but we hang with 128 + instr_cache_size = 127, supports_multiview_mask = True, has_z24uint_s8uint = True, tess_use_shared = True, diff --git a/src/freedreno/computerator/a6xx.c b/src/freedreno/computerator/a6xx.c index 188cde2118e..64d79b2c4e6 100644 --- a/src/freedreno/computerator/a6xx.c +++ b/src/freedreno/computerator/a6xx.c @@ -196,12 +196,14 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel) OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2); OUT_RELOC(ring, v->bo, 0, 0, 0); + uint32_t shader_preload_size = + MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size); OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3); OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) | - CP_LOAD_STATE6_0_NUM_UNIT(v->instrlen)); + CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); OUT_RELOC(ring, v->bo, 0, 0, 0); if (v->pvtmem_size > 0) { diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 136f30224dd..6694913d6d1 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -545,12 +545,15 @@ tu6_emit_xs(struct tu_cs *cs, tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); + uint32_t shader_preload_size = + MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size); + tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen)); + CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); tu_cs_emit_qw(cs, binary_iova); /* emit immediates */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 6a4cfc6ed79..698f84cf801 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -143,12 +143,15 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_PKT4(ring, hw_stack_offset, 1); OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size)); + uint32_t shader_preload_size = + MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size); + OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen)); + CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); OUT_RELOC(ring, so->bo, 0, 0, 0); }