From eaf68f744b7ae394b6d05e2e3a3ebfee4eb5fe08 Mon Sep 17 00:00:00 2001
From: Ryan Zhang
Date: Thu, 26 Mar 2026 12:03:27 +0100
Subject: [PATCH] panvk/csf: rework IR descriptor handling for tiler OOM

Replace the old partial IR state snapshotting with prebuilt per-pass IR
descriptor buffers and a queue-attached scratch FBD:

1. emit the FBD+DBD+RTDs for each IR-pass+layer combination
2. store them in a side-band GPU buffer whose address is passed around
   through the queue context
3. in the exception handler, copy the FBD+DBD+RTDs for the current pass
   from the IR descriptor buffer to a scratch FBD that is attached to the
   queue context rather than the command buffer

Fixes: 46f611c9 ("panvk: Also use resolve shaders for Z/S")
Signed-off-by: Boris Brezillon
Signed-off-by: Ryan Zhang
Reviewed-by: Lars-Ivar Hesselberg Simonsen
Reviewed-by: Christoph Pillmayer
Part-of:
---
 src/panfrost/vulkan/csf/panvk_cmd_buffer.h    |  33 +--
 src/panfrost/vulkan/csf/panvk_queue.h         |   1 +
 src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c   | 249 +++++-----
 .../vulkan/csf/panvk_vX_exception_handler.c   | 278 +++++++-----------
 src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c  |  21 ++
 src/panfrost/vulkan/panvk_cmd_draw.h          |   4 +
 6 files changed, 210 insertions(+), 376 deletions(-)

diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index af85a94ce68..7e7e8922c88 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -61,19 +61,6 @@ enum panvk_incremental_rendering_pass {
    PANVK_IR_PASS_COUNT
 };
 
-struct panvk_ir_fbd_info {
-   uint32_t word0;
-   uint32_t word6;
-   uint32_t word7;
-   uint32_t word12;
-};
-
-struct panvk_ir_desc_info {
-   struct panvk_ir_fbd_info fbd;
-   uint32_t crc_zs_word0;
-   uint32_t rtd_word1[MAX_RTS];
-};
-
 static inline uint32_t
 get_tiler_oom_handler_idx(bool has_zs_ext, uint32_t rt_count)
 {
@@ -138,8 +125,10 @@ struct panvk_cs_subqueue_context {
       uint64_t layer_fbd_ptr;
       /* Pointer to scratch FBD used in the event of IR */
       uint64_t ir_scratch_fbd_ptr;
-      /* Partial descriptor data needed in the event of IR */
-      struct panvk_ir_desc_info ir_desc_infos[PANVK_IR_PASS_COUNT];
+      /* FBD+DBD+RTDs IR descs to be copied to the scratch FBD when
+       * IR is triggered.
+ */ + uint64_t ir_descs[PANVK_IR_PASS_COUNT]; uint32_t td_count; uint32_t layer_count; } tiler_oom_ctx; @@ -880,19 +869,5 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages, void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf, struct panvk_cs_deps deps); -#if PAN_ARCH >= 10 - -void panvk_per_arch(cs_patch_ir_state)( - struct cs_builder *b, const struct cs_tracing_ctx *tracing_ctx, - bool has_zs_ext, uint32_t rt_count, struct cs_index remaining_layers_in_td, - struct cs_index current_fbd_ptr_reg, struct cs_index ir_desc_info_ptr, - struct cs_index ir_fbd_word_0, struct cs_index scratch_fbd_ptr_reg, - struct cs_index scratch_registers_5); - -void panvk_per_arch(cs_ir_update_registers_to_next_layer)( - struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, - struct cs_index current_fbd_ptr_reg, struct cs_index ir_fbd_word_0, - struct cs_index remaining_layers_in_td); -#endif /* PAN_ARCH >= 10 */ #endif /* PANVK_CMD_BUFFER_H */ diff --git a/src/panfrost/vulkan/csf/panvk_queue.h b/src/panfrost/vulkan/csf/panvk_queue.h index 5431674b135..d7fe249a741 100644 --- a/src/panfrost/vulkan/csf/panvk_queue.h +++ b/src/panfrost/vulkan/csf/panvk_queue.h @@ -28,6 +28,7 @@ enum panvk_subqueue_id { struct panvk_tiler_heap { uint32_t chunk_size; struct panvk_priv_mem desc; + struct panvk_priv_mem oom_fbd; struct { uint32_t handle; uint64_t dev_addr; diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 69ac06202ec..8e9caaa35da 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -932,8 +932,7 @@ calc_fbd_size(struct panvk_cmd_buffer *cmdbuf) static uint32_t calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf) { - const uint32_t ir_scratch_fbd = 1; - uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) + ir_scratch_fbd; + uint32_t fbd_count = calc_enabled_layer_count(cmdbuf); uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); @@ -1241,10 +1240,10 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) !cmdbuf->state.gfx.render.layer_count) return VK_SUCCESS; - const uint32_t ir_scratch_fbd = 1; + uint32_t view_mask_temp = cmdbuf->state.gfx.render.view_mask; + uint32_t enabled_layer_count = calc_enabled_layer_count(cmdbuf); uint32_t fbd_sz = calc_fbd_size(cmdbuf); - uint32_t fbds_sz = - (calc_enabled_layer_count(cmdbuf) + ir_scratch_fbd) * fbd_sz; + uint32_t fbds_sz = enabled_layer_count * fbd_sz; cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem( cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); @@ -1290,8 +1289,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) /* We prepare all FB descriptors upfront. For multiview, only create FBDs * for enabled views. */ - uint32_t view_mask_temp = cmdbuf->state.gfx.render.view_mask; - uint32_t enabled_layer_count = calc_enabled_layer_count(cmdbuf); bool multiview = cmdbuf->state.gfx.render.view_mask; struct pan_tiler_context tiler_ctx; @@ -1326,97 +1323,49 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) fbd_flags = new_fbd_flags; } - const bool has_zs_ext = pan_fb_has_zs(&render->fb.layout); - const uint32_t rt_count = render->fb.layout.rt_count; - struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) { - /* We use the scratch FBD to initialize our IR pass data, then copy - * only IR relevant FBD sections to the subqueue context. 
- */ - void *scratch_fbd_init_memory = fbds.cpu + (fbd_sz * enabled_layer_count); + struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); - const uint64_t ir_pass_info_offset = - TILER_OOM_CTX_FIELD_OFFSET(ir_desc_infos) + - ir_pass * sizeof(struct panvk_ir_desc_info); + if (!ir_fbds.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; - fbd_info.layer = 0; - tiler_ctx = get_tiler_context(cmdbuf, 0); - fbd_info.load = ir_pass == PANVK_IR_FIRST_PASS ? - &render->fb.load : &render->fb.spill.load; - fbd_info.store = ir_pass == PANVK_IR_LAST_PASS ? - &render->fb.store : &render->fb.spill.store; + uint32_t ir_view_mask_temp = cmdbuf->state.gfx.render.view_mask; - VkResult result = panvk_per_arch(cmd_get_frame_shaders)( - cmdbuf, fbd_info.fb, fbd_info.load, - ir_pass == PANVK_IR_LAST_PASS ? &render->fb.resolve : NULL, - &fbd_info.frame_shaders); - if (result != VK_SUCCESS) - return result; + for (uint32_t i = 0; i < enabled_layer_count; i++) { + uint32_t layer_idx = multiview ? u_bit_scan(&ir_view_mask_temp) : i; + void *ir_fbd = (void *)((uint8_t *)ir_fbds.cpu + (i * fbd_sz)); - ASSERTED uint32_t new_fbd_flags = - GENX(pan_emit_fb_desc)(&fbd_info, scratch_fbd_init_memory); + fbd_info.layer = layer_idx; + tiler_ctx = get_tiler_context(cmdbuf, layer_idx); + fbd_info.load = ir_pass == PANVK_IR_FIRST_PASS + ? &render->fb.load + : &render->fb.spill.load; + fbd_info.store = ir_pass == PANVK_IR_LAST_PASS + ? &render->fb.store + : &render->fb.spill.store; - /* Make sure all FBDs have the same flags. */ - assert(new_fbd_flags == fbd_flags); + VkResult result = panvk_per_arch(cmd_get_frame_shaders)( + cmdbuf, fbd_info.fb, fbd_info.load, + ir_pass == PANVK_IR_LAST_PASS ? &render->fb.resolve : NULL, + &fbd_info.frame_shaders); + if (result != VK_SUCCESS) + return result; - { - struct mali_framebuffer_packed *scratch_fbd = scratch_fbd_init_memory; + ASSERTED uint32_t new_fbd_flags = + GENX(pan_emit_fb_desc)(&fbd_info, ir_fbd); - /* Copy IR FBD data word0, dword6 and word12 */ - struct cs_index fbd_registers = cs_scratch_reg_tuple(b, 0, 4); - cs_move32_to(b, cs_scratch_reg32(b, 0), scratch_fbd->opaque[0]); - cs_move32_to(b, cs_scratch_reg32(b, 1), scratch_fbd->opaque[6]); - cs_move32_to(b, cs_scratch_reg32(b, 2), scratch_fbd->opaque[7]); - cs_move32_to(b, cs_scratch_reg32(b, 3), scratch_fbd->opaque[12]); - cs_store( - b, fbd_registers, cs_subqueue_ctx_reg(b), BITFIELD_MASK(4), - ir_pass_info_offset + offsetof(struct panvk_ir_desc_info, fbd)); - - /* Move past base FBD */ - scratch_fbd_init_memory += pan_size(FRAMEBUFFER); - } - - /* Copy IR DBD word0 if present */ - if (has_zs_ext) { - struct mali_zs_crc_extension_packed *scratch_zs_crc = scratch_fbd_init_memory; - - struct cs_index crc_zs_ext_reg = cs_scratch_reg32(b, 4); - - cs_move32_to(b, crc_zs_ext_reg, scratch_zs_crc->opaque[0]); - cs_store32(b, crc_zs_ext_reg, cs_subqueue_ctx_reg(b), - ir_pass_info_offset + - offsetof(struct panvk_ir_desc_info, crc_zs_word0)); - - /* Move past crc_zs_ext */ - scratch_fbd_init_memory += pan_size(ZS_CRC_EXTENSION); - } - - { - /* Assume we have sufficient scratch to avoid wait */ - assert(rt_count + 5 < CS_REG_SCRATCH_COUNT); - - /* Copy IR RTD word1 */ - for (uint32_t rt = 0; rt < rt_count; rt++) { - struct mali_render_target_packed *scratch_rtd = scratch_fbd_init_memory; - struct cs_index rt_reg = cs_scratch_reg32(b, 5 + rt); - - const uint64_t ir_rt_info_offset = - offsetof(struct panvk_ir_desc_info, rtd_word1) + - rt * sizeof(uint32_t); - - cs_move32_to(b, rt_reg, 
scratch_rtd->opaque[1]); - cs_store32(b, rt_reg, cs_subqueue_ctx_reg(b), - ir_pass_info_offset + ir_rt_info_offset); - - /* Move past current RT */ - scratch_fbd_init_memory += pan_size(RENDER_TARGET); - } + /* Make sure all FBDs have the same flags. */ + assert(new_fbd_flags == fbd_flags); } + static_assert(ARRAY_SIZE(cmdbuf->state.gfx.render.ir.fbds) == PANVK_IR_PASS_COUNT, + "ir.fbds array size must match PANVK_IR_PASS_COUNT"); + cmdbuf->state.gfx.render.ir.fbds[ir_pass] = ir_fbds.gpu; } - /* Wait for ir pass info to complete */ + /* Wait for IR info push to complete */ cs_wait_slot(b, SB_ID(LS)); bool unset_provoking_vertex = @@ -1516,8 +1465,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) /* If we don't know what provoking vertex mode the application wants yet, * leave space to patch it later */ if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) { - const uint32_t ir_scratch_fbd = 1; - uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) + ir_scratch_fbd; + uint32_t fbd_count = calc_enabled_layer_count(cmdbuf); /* passed to fn_set_fbds_provoking_vertex */ struct cs_index fbd_count_reg = cs_scratch_reg32(b, 0); cs_move32_to(b, fbd_count_reg, fbd_count); @@ -3338,25 +3286,37 @@ static void setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) { struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; + const bool has_zs_ext = pan_fb_has_zs(fb); + struct mali_framebuffer_pointer_packed fb_tag; uint32_t layer_count = cmdbuf->state.gfx.render.layer_count; uint32_t td_count = DIV_ROUND_UP(layer_count, MAX_LAYERS_PER_TILER_DESC); - uint32_t fbd_sz = calc_fbd_size(cmdbuf); - const uint32_t fbd_scratch_offset = fbd_sz * layer_count; + + pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) { + cfg.zs_crc_extension_present = has_zs_ext; + cfg.render_target_count = fb->rt_count; + } struct cs_index counter = cs_scratch_reg32(b, 1); cs_move32_to(b, counter, 0); cs_store32(b, counter, cs_subqueue_ctx_reg(b), TILER_OOM_CTX_FIELD_OFFSET(counter)); - struct cs_index fbd_ptr_reg = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); + struct cs_index fbd_ptr_reg = cs_scratch_reg64(b, 6); + cs_add64(b, fbd_ptr_reg, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), + -(int32_t)fb_tag.opaque[0]); cs_store64(b, fbd_ptr_reg, cs_subqueue_ctx_reg(b), TILER_OOM_CTX_FIELD_OFFSET(layer_fbd_ptr)); - struct cs_index scratch_fbd_ptr_reg = cs_scratch_reg64(b, 2); - cs_add64(b, scratch_fbd_ptr_reg, fbd_ptr_reg, fbd_scratch_offset); - cs_store64(b, scratch_fbd_ptr_reg, cs_subqueue_ctx_reg(b), - TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr)); + for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) { + const uint32_t ir_descs_offset = + TILER_OOM_CTX_FIELD_OFFSET(ir_descs) + (sizeof(uint64_t) * ir_pass); + struct cs_index ir_fbds_reg = cs_scratch_reg64(b, 2); + + cs_move64_to(b, ir_fbds_reg, cmdbuf->state.gfx.render.ir.fbds[ir_pass]); + cs_store64(b, ir_fbds_reg, cs_subqueue_ctx_reg(b), ir_descs_offset); + } struct cs_index td_count_reg = cs_scratch_reg32(b, 4); cs_move32_to(b, td_count_reg, td_count); @@ -3413,8 +3373,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) * descriptors are constant (no need to patch them at runtime). 
*/ bool free_render_descs = simul_use && needs_tiling; uint32_t fbd_sz = calc_fbd_size(cmdbuf); - uint32_t scratch_fbd_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count; - uint32_t ir_fbd_desc_sz = sizeof(struct panvk_ir_desc_info); uint32_t td_count = 0; if (needs_tiling) { td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, @@ -3453,19 +3411,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, length_reg); - /* Use the scratch FBD if incremental render occurred. */ - struct cs_index counter = cs_scratch_reg32(b, 0); - cs_load32_to( - b, counter, cs_subqueue_ctx_reg(b), - offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter)); - cs_wait_slot(b, SB_ID(LS)); - cs_if(b, MALI_CS_CONDITION_GREATER, counter) { - cs_update_frag_ctx(b) { - cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - cs_sr_reg64(b, FRAGMENT, FBD_POINTER), scratch_fbd_offset); - } - } - /* Applications tend to forget to describe subpass dependencies, especially * when it comes to write -> read dependencies on attachments. The * proprietary driver forces "others" invalidation as a workaround, and this @@ -3479,85 +3424,23 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_wait_slot(b, SB_ID(IMM_FLUSH)); } - const bool has_zs_ext = pan_fb_has_zs(fb); - const uint32_t rt_count = fb->rt_count; + if (cmdbuf->state.gfx.render.layer_count <= 1) { + cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), + false, MALI_TILE_RENDER_ORDER_Z_ORDER); + } else { + struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4); + struct cs_index remaining_layers = cs_scratch_reg32(b, 4); - /* IR was hit: set up IR FBD */ - cs_if(b, MALI_CS_CONDITION_GREATER, counter) { - /* FBD patching registers */ - struct cs_index scratch_regs = cs_scratch_reg_tuple(b, 0, 5); - struct cs_index ir_fbd_word_0 = cs_scratch_reg32(b, 5); - struct cs_index remaining_layers_in_td = cs_scratch_reg32(b, 6); - struct cs_index layer_count = cs_scratch_reg32(b, 7); - struct cs_index layer_fbd_ptr_reg = cs_scratch_reg64(b, 8); - struct cs_index ir_desc_info_ptr = cs_scratch_reg64(b, 10); - struct cs_index scratch_fbd_ptr_reg = cs_scratch_reg64(b, 12); - - /* Run fragment is only used after FBD patching */ - struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 5); - - /* Get base fbd ptr */ - cs_add64(b, layer_fbd_ptr_reg, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), -(int32_t)scratch_fbd_offset); - cs_add64(b, scratch_fbd_ptr_reg, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), 0); - cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC); - - /* Get ir info ptr */ - cs_add64(b, ir_desc_info_ptr, cs_subqueue_ctx_reg(b), - TILER_OOM_CTX_FIELD_OFFSET(ir_desc_infos) + - ir_fbd_desc_sz * PANVK_IR_LAST_PASS); - - cs_load32_to(b, ir_fbd_word_0, ir_desc_info_ptr, - offsetof(struct panvk_ir_desc_info, fbd.word0)); - - if (cmdbuf->state.gfx.render.layer_count <= 1) { - panvk_per_arch(cs_patch_ir_state)( - b, tracing_ctx, has_zs_ext, rt_count, remaining_layers_in_td, - layer_fbd_ptr_reg, ir_desc_info_ptr, ir_fbd_word_0, - scratch_fbd_ptr_reg, scratch_regs); + cs_move32_to(b, remaining_layers, calc_enabled_layer_count(cmdbuf)); + cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) { + cs_add32(b, remaining_layers, remaining_layers, -1); cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false, MALI_TILE_RENDER_ORDER_Z_ORDER); - } else { - cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count); - cs_while(b, 
MALI_CS_CONDITION_GREATER, layer_count) { - cs_add32(b, layer_count, layer_count, -1); - panvk_per_arch(cs_patch_ir_state)( - b, tracing_ctx, has_zs_ext, rt_count, remaining_layers_in_td, - layer_fbd_ptr_reg, ir_desc_info_ptr, ir_fbd_word_0, - scratch_fbd_ptr_reg, scratch_regs); - - cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false, - MALI_TILE_RENDER_ORDER_Z_ORDER); - - panvk_per_arch(cs_ir_update_registers_to_next_layer)( - b, has_zs_ext, rt_count, layer_fbd_ptr_reg, ir_fbd_word_0, - remaining_layers_in_td); - - /* Serialize run fragments since we reuse FBD for the runs */ - cs_wait_slots(b, dev->csf.sb.all_iters_mask); - } - } - } - cs_else(b) { - if (cmdbuf->state.gfx.render.layer_count <= 1) { - cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - false, MALI_TILE_RENDER_ORDER_Z_ORDER); - } else { - struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4); - struct cs_index remaining_layers = cs_scratch_reg32(b, 4); - - cs_move32_to(b, remaining_layers, calc_enabled_layer_count(cmdbuf)); - cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) { - cs_add32(b, remaining_layers, remaining_layers, -1); - - cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false, - MALI_TILE_RENDER_ORDER_Z_ORDER); - - cs_update_frag_ctx(b) - cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz); - } + cs_update_frag_ctx(b) + cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), + cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz); } } diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c index d55cb5880fe..b4cf6855184 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -23,131 +23,56 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg) return CS_REG_RW; } -void -panvk_per_arch(cs_patch_ir_state)( - struct cs_builder *b, const struct cs_tracing_ctx *tracing_ctx, - bool has_zs_ext, uint32_t rt_count, struct cs_index remaining_layers_in_td, - struct cs_index current_fbd_ptr_reg, struct cs_index ir_desc_info_ptr, - struct cs_index ir_fbd_word_0, struct cs_index scratch_fbd_ptr_reg, - struct cs_index scratch_registers_5) +static void +copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, + struct cs_index src_tiler, struct cs_index src_other, + struct cs_index dst) { - assert(scratch_registers_5.type == CS_INDEX_REGISTER && - "invalid register type"); - assert(scratch_registers_5.size == 5 && "scratch register size must be 5"); + /* Copy the FBD from src to dst. Most words come from + * src_other, but the tiler desc pointer is taken from src_tiler. + */ + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, + BITFIELD_MASK(8), 0); + cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, + BITFIELD_MASK(8), 0); + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 6), src_other, + BITFIELD_MASK(6), 8 * sizeof(uint32_t)); + cs_load64_to(b, cs_scratch_reg64(b, 6), src_tiler, + 14 * sizeof(uint32_t)); + cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8), + 8 * sizeof(uint32_t)); - const uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count); + if (has_zs_ext) { + const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed); - /* Calculate the *used* ir_desc_infos size */ - const uint32_t used_ir_desc_info_size = sizeof(struct panvk_ir_fbd_info) + - (has_zs_ext ? 
sizeof(uint32_t) : 0) + - rt_count * sizeof(uint32_t); - - struct cs_index copy_fbd_staging_regs = - cs_extract_tuple(b, scratch_registers_5, 0, 4); - struct cs_index copy_fbd_dword_6_reg = - cs_extract64(b, scratch_registers_5, 0); - struct cs_index copy_fbd_word_reg = cs_extract32(b, scratch_registers_5, 2); - struct cs_index fbd_offset_reg = cs_extract32(b, scratch_registers_5, 4); - - /* Copy fbd+dbd+rtds to scratch */ - { - /* Our loop is copying 16 bytes at a time, so make sure the - * fbd_size is aligned to 16 bytes. */ - const int32_t copy_stride = 16; - assert(fbd_size == ALIGN_POT(fbd_size, copy_stride)); - - /* Copy the current FBD in full to the FBD scratch */ - cs_move32_to(b, fbd_offset_reg, fbd_size); - cs_while(b, MALI_CS_CONDITION_GREATER, fbd_offset_reg) { - cs_add32(b, fbd_offset_reg, fbd_offset_reg, -copy_stride); - - cs_load_to(b, copy_fbd_staging_regs, current_fbd_ptr_reg, - BITFIELD_MASK(4), 0); - cs_store(b, copy_fbd_staging_regs, scratch_fbd_ptr_reg, - BITFIELD_MASK(4), 0); - - cs_add64(b, current_fbd_ptr_reg, current_fbd_ptr_reg, copy_stride); - cs_add64(b, scratch_fbd_ptr_reg, scratch_fbd_ptr_reg, copy_stride); - } - - /* Move scratch FBD ptr back to FBD base */ - cs_add64(b, scratch_fbd_ptr_reg, scratch_fbd_ptr_reg, -fbd_size); - - /* Patch FBD for IR */ - { - /* Load word 12 and dword 6 */ - cs_load64_to(b, copy_fbd_dword_6_reg, ir_desc_info_ptr, - offsetof(struct panvk_ir_desc_info, fbd.word6)); - cs_load32_to(b, copy_fbd_word_reg, ir_desc_info_ptr, - offsetof(struct panvk_ir_desc_info, fbd.word12)); - cs_store32(b, ir_fbd_word_0, scratch_fbd_ptr_reg, 0 * 4); - cs_store64(b, copy_fbd_dword_6_reg, scratch_fbd_ptr_reg, 6 * 4); - cs_store32(b, copy_fbd_word_reg, scratch_fbd_ptr_reg, 12 * 4); - } - - /* Move fbd and info ptr past base fbd */ - cs_add64(b, ir_desc_info_ptr, ir_desc_info_ptr, - sizeof(struct panvk_ir_fbd_info)); - cs_add64(b, scratch_fbd_ptr_reg, scratch_fbd_ptr_reg, - pan_size(FRAMEBUFFER)); - - /* If the IR FBD has crc zs ext descriptor, then copy word0 from it - * to scratch */ - struct cs_index has_zs_ext_reg = copy_fbd_word_reg; - cs_move32_to(b, has_zs_ext_reg, has_zs_ext); - /* Use cs_if for this as the exception handler excepts each instance of - * tiler_oom_handler to be of the same size */ - cs_if(b, MALI_CS_CONDITION_GREATER, has_zs_ext_reg) { - cs_load32_to(b, copy_fbd_word_reg, ir_desc_info_ptr, 0 * 4); - cs_store32(b, copy_fbd_word_reg, scratch_fbd_ptr_reg, 0 * 4); - - /* Move fbd ptr past crc zs ext */ - cs_add64(b, scratch_fbd_ptr_reg, scratch_fbd_ptr_reg, - pan_size(ZS_CRC_EXTENSION)); - } - - /* Always move ir info ptr past crc zs ext */ - cs_add64(b, ir_desc_info_ptr, ir_desc_info_ptr, sizeof(uint32_t)); - - /* Loop to copy IR RTD's word1 */ - struct cs_index rt_count_reg = fbd_offset_reg; - cs_move32_to(b, rt_count_reg, rt_count); - cs_while(b, MALI_CS_CONDITION_GREATER, rt_count_reg) { - cs_add32(b, rt_count_reg, rt_count_reg, -1); - - cs_load32_to(b, copy_fbd_word_reg, ir_desc_info_ptr, 0 * 4); - cs_store32(b, copy_fbd_word_reg, scratch_fbd_ptr_reg, 1 * 4); - - /* Move fbd and info ptr past current RT */ - cs_add64(b, ir_desc_info_ptr, ir_desc_info_ptr, sizeof(uint32_t)); - cs_add64(b, scratch_fbd_ptr_reg, scratch_fbd_ptr_reg, - pan_size(RENDER_TARGET)); - } - - cs_add64(b, ir_desc_info_ptr, ir_desc_info_ptr, -used_ir_desc_info_size); - cs_add64(b, scratch_fbd_ptr_reg, scratch_fbd_ptr_reg, -fbd_size); - cs_add64(b, current_fbd_ptr_reg, current_fbd_ptr_reg, -fbd_size); - cs_flush_stores(b); + /* Copy the whole DBD. 
*/ + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, + BITFIELD_MASK(8), dbd_offset); + cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, + BITFIELD_MASK(8), dbd_offset); + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, + BITFIELD_MASK(8), dbd_offset + (8 * sizeof(uint32_t))); + cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, + BITFIELD_MASK(8), dbd_offset + (8 * sizeof(uint32_t))); } -} -void -panvk_per_arch(cs_ir_update_registers_to_next_layer)( - struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, - struct cs_index current_fbd_ptr_reg, struct cs_index ir_fbd_word_0, - struct cs_index remaining_layers_in_td) -{ - const uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count); - cs_add64(b, current_fbd_ptr_reg, current_fbd_ptr_reg, fbd_size); + const uint16_t rts_offset = + sizeof(struct mali_framebuffer_packed) + + (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0); - cs_add32(b, ir_fbd_word_0, ir_fbd_word_0, 1 << 24); + for (uint32_t rt = 0; rt < rt_count; rt++) { + const uint16_t rt_offset = + rts_offset + (rt * sizeof(struct mali_render_target_packed)); - /* Set remaining_layers_in_td to zero if reached td limit */ - cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1); - cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) { - cs_add32(b, ir_fbd_word_0, ir_fbd_word_0, - -(1 << 24) * MAX_LAYERS_PER_TILER_DESC); - cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC); + /* Copy the whole RTD. */ + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, + BITFIELD_MASK(8), rt_offset); + cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, + BITFIELD_MASK(8), rt_offset); + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, + BITFIELD_MASK(8), rt_offset + (8 * sizeof(uint32_t))); + cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, + BITFIELD_MASK(8), rt_offset + (8 * sizeof(uint32_t))); } } @@ -159,7 +84,7 @@ generate_tiler_oom_handler(struct panvk_device *dev, { assert(rt_count >= 1 && rt_count <= MAX_RTS); - uint32_t ir_desc_info_size = sizeof(struct panvk_ir_desc_info); + const uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count); const struct drm_panthor_csif_info *csif_info = panthor_kmod_get_csif_props(dev->kmod.dev); @@ -185,6 +110,12 @@ generate_tiler_oom_handler(struct panvk_device *dev, .tracebuf_addr_offset = offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs), }; + struct mali_framebuffer_pointer_packed fb_tag; + + pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) { + cfg.zs_crc_extension_present = has_zs_ext; + cfg.render_target_count = rt_count; + } cs_function_def(&b, &handler, handler_ctx) { struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b); @@ -198,17 +129,12 @@ generate_tiler_oom_handler(struct panvk_device *dev, struct cs_index completed_bottom = cs_scratch_reg64(&b, 4); struct cs_index td_count = cs_scratch_reg32(&b, 6); - /* Counter is used early before any over lap registers are used */ - struct cs_index counter = cs_scratch_reg32(&b, 0); - /* FBD patching registers */ - struct cs_index scratch_regs = cs_scratch_reg_tuple(&b, 2, 5); - struct cs_index layer_count = cs_scratch_reg32(&b, 7); - struct cs_index ir_fbd_word_0 = cs_scratch_reg32(&b, 8); - struct cs_index remaining_layers_in_td = cs_scratch_reg32(&b, 9); + struct cs_index layer_count = cs_scratch_reg32(&b, 8); + struct cs_index ir_count = cs_scratch_reg32(&b, 9); struct cs_index scratch_fbd_ptr_reg = cs_scratch_reg64(&b, 10); struct cs_index current_fbd_ptr_reg = cs_scratch_reg64(&b, 12); - struct cs_index ir_desc_info_ptr = 
cs_scratch_reg64(&b, 14); + struct cs_index ir_descs_ptr = cs_scratch_reg64(&b, 14); /* Run fragment registers will only be used after FBD patching */ struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4); @@ -218,55 +144,49 @@ generate_tiler_oom_handler(struct panvk_device *dev, cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr)); - cs_load32_to(&b, counter, subqueue_ctx, + cs_load32_to(&b, ir_count, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(counter)); cs_load32_to(&b, layer_count, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(layer_count)); cs_load64_to(&b, current_fbd_ptr_reg, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(layer_fbd_ptr)); - cs_add64(&b, ir_desc_info_ptr, subqueue_ctx, - TILER_OOM_CTX_FIELD_OFFSET(ir_desc_infos)); - cs_move32_to(&b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC); - - /* Move FBD pointer to the scratch fbd */ - cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), scratch_fbd_ptr_reg, - 0); - /* Use different framebuffer descriptor depending on whether incremental * rendering has already been triggered */ - cs_if(&b, MALI_CS_CONDITION_GREATER, counter) { - cs_add64(&b, ir_desc_info_ptr, ir_desc_info_ptr, - ir_desc_info_size * PANVK_IR_MIDDLE_PASS); + cs_if(&b, MALI_CS_CONDITION_GREATER, ir_count) { + cs_load64_to( + &b, ir_descs_ptr, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(ir_descs[PANVK_IR_MIDDLE_PASS])); + } + cs_else(&b) { + cs_load64_to( + &b, ir_descs_ptr, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(ir_descs[PANVK_IR_FIRST_PASS])); } - - cs_load32_to(&b, ir_fbd_word_0, ir_desc_info_ptr, - offsetof(struct panvk_ir_desc_info, fbd.word0)); - - /* Increment counter */ - cs_add32(&b, counter, counter, 1); - cs_store32(&b, counter, subqueue_ctx, - TILER_OOM_CTX_FIELD_OFFSET(counter)); cs_wait_slot(&b, SB_ID(LS)); cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) { cs_add32(&b, layer_count, layer_count, -1); - panvk_per_arch(cs_patch_ir_state)( - &b, &tracing_ctx, has_zs_ext, rt_count, remaining_layers_in_td, - current_fbd_ptr_reg, ir_desc_info_ptr, ir_fbd_word_0, - scratch_fbd_ptr_reg, scratch_regs); + copy_fbd(&b, has_zs_ext, rt_count, current_fbd_ptr_reg, ir_descs_ptr, + scratch_fbd_ptr_reg); + + /* Flush copies before the RUN_FRAGMENT. */ + cs_wait_slot(&b, SB_ID(LS)); + + /* Set FBD pointer to the scratch fbd */ + cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), + scratch_fbd_ptr_reg, fb_tag.opaque[0]); cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false, MALI_TILE_RENDER_ORDER_Z_ORDER); - panvk_per_arch(cs_ir_update_registers_to_next_layer)( - &b, has_zs_ext, rt_count, current_fbd_ptr_reg, ir_fbd_word_0, - remaining_layers_in_td); - /* Serialize run fragments since we reuse FBD for the runs */ cs_wait_slots(&b, dev->csf.sb.all_iters_mask); + + cs_add64(&b, current_fbd_ptr_reg, current_fbd_ptr_reg, fbd_size); + cs_add64(&b, ir_descs_ptr, ir_descs_ptr, fbd_size); } cs_load32_to(&b, td_count, subqueue_ctx, @@ -289,6 +209,37 @@ generate_tiler_oom_handler(struct panvk_device *dev, cs_add32(&b, td_count, td_count, -1); } + /* If this is the first IR call, we need to patch the regular FBD + * to use the last IR config. 
+ */ + cs_if(&b, MALI_CS_CONDITION_EQUAL, ir_count) { + cs_load64_to(&b, current_fbd_ptr_reg, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(layer_fbd_ptr)); + cs_load64_to(&b, ir_descs_ptr, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(ir_descs[PANVK_IR_LAST_PASS])); + cs_load32_to(&b, layer_count, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(layer_count)); + + cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) { + cs_add32(&b, layer_count, layer_count, -1); + + /* Preserve the tiler pointer, take the rest from the + * last IR config. + */ + copy_fbd(&b, has_zs_ext, rt_count, current_fbd_ptr_reg, + ir_descs_ptr, current_fbd_ptr_reg); + + cs_add64(&b, current_fbd_ptr_reg, current_fbd_ptr_reg, fbd_size); + cs_add64(&b, ir_descs_ptr, ir_descs_ptr, fbd_size); + } + } + + /* Increment IR counter */ + cs_add32(&b, ir_count, ir_count, 1); + cs_store32(&b, ir_count, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(counter)); + cs_wait_slot(&b, SB_ID(LS)); + /* We need to flush the texture caches so future preloads see the new * content. */ cs_flush_caches(&b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE, @@ -306,7 +257,7 @@ generate_tiler_oom_handler(struct panvk_device *dev, return handler.length * sizeof(uint64_t); } -#define TILER_OOM_HANDLER_MAX_SIZE 1024 +#define TILER_OOM_HANDLER_MAX_SIZE 2048 VkResult panvk_per_arch(init_tiler_oom)(struct panvk_device *device) { @@ -321,10 +272,7 @@ panvk_per_arch(init_tiler_oom)(struct panvk_device *device) for (uint32_t zs_ext = 0; zs_ext <= 1; zs_ext++) { for (uint32_t rt_count = 1; rt_count <= MAX_RTS; rt_count++) { uint32_t idx = get_tiler_oom_handler_idx(zs_ext, rt_count); - /* Check that we have calculated a handler_stride if we need it to - * offset addresses. */ - assert(idx == 0 || device->tiler_oom.handler_stride != 0); - size_t offset = idx * device->tiler_oom.handler_stride; + size_t offset = idx * TILER_OOM_HANDLER_MAX_SIZE; struct cs_buffer handler_mem = { .cpu = device->tiler_oom.handlers_bo->addr.host + offset, @@ -337,14 +285,16 @@ panvk_per_arch(init_tiler_oom)(struct panvk_device *device) generate_tiler_oom_handler(device, handler_mem, zs_ext, rt_count, tracing_enabled, &dump_region_size); - - /* All handlers must have the same length */ - assert(idx == 0 || handler_length == device->tiler_oom.handler_stride); - device->tiler_oom.handler_stride = handler_length; + /* Use memset(0) to make sure the remaining space is filled with NOP + * instructions. */ + assert(handler_length <= TILER_OOM_HANDLER_MAX_SIZE); + memset((uint8_t *)handler_mem.cpu + handler_length, 0, + TILER_OOM_HANDLER_MAX_SIZE - handler_length); device->dump_region_size[PANVK_SUBQUEUE_FRAGMENT] = MAX2(device->dump_region_size[PANVK_SUBQUEUE_FRAGMENT], dump_region_size); } + device->tiler_oom.handler_stride = TILER_OOM_HANDLER_MAX_SIZE; } panvk_priv_bo_flush(device->tiler_oom.handlers_bo, 0, diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c index 1fda310a539..a2328994c99 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c @@ -451,6 +451,16 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) .pos = 0, }; } + + if (subqueue == PANVK_SUBQUEUE_FRAGMENT) { + /* The tiler OOM exception handler is registered to the fragment + * queue, so the scratch FBD buffer is only needed there. We leave + * it to NULL on other queues to make sure any attempt to access it + * results in a NULL deref that can be caught. 
+ */ + cs_ctx->tiler_oom_ctx.ir_scratch_fbd_ptr = + panvk_priv_mem_dev_addr(queue->tiler_heap.oom_fbd); + } } /* We use the geometry buffer for our temporary CS buffer. */ @@ -706,6 +716,15 @@ init_tiler(struct panvk_gpu_queue *queue) tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size; + alloc_info.size = get_fbd_size(true, MAX_RTS); + alloc_info.alignment = pan_alignment(FRAMEBUFFER); + tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) { + result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to create a scratch FBD"); + goto err_free_desc; + } + struct drm_panthor_tiler_heap_create thc = { .vm_id = pan_kmod_vm_handle(dev->kmod.vm), .chunk_size = tiler_heap->chunk_size, @@ -736,6 +755,7 @@ init_tiler(struct panvk_gpu_queue *queue) err_free_desc: panvk_pool_free_mem(&tiler_heap->desc); + panvk_pool_free_mem(&tiler_heap->oom_fbd); return result; } @@ -752,6 +772,7 @@ cleanup_tiler(struct panvk_gpu_queue *queue) assert(!ret); panvk_pool_free_mem(&tiler_heap->desc); + panvk_pool_free_mem(&tiler_heap->oom_fbd); } struct panvk_queue_submit { diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 930ed2155ac..8de69cfdb42 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -115,6 +115,10 @@ struct panvk_rendering_state { * to a draw. */ uint64_t last; } oq; + + struct { + uint64_t fbds[3]; + } ir; #endif };