From a7ae37656d73810efd01b304d61d21b67f933f3b Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 14:45:56 +0200 Subject: [PATCH] panvk: Implement RUN_FRAGMENT2 Added structure panvk_fb_layer_state and related logic to store and emit per-layer fragment state. Also, move some temporary registers to non-conflicting ones. Incremental rendering is left as TODO for later. Reviewed-by: Lars-Ivar Hesselberg Simonsen Reviewed-by: Boris Brezillon Part-of: --- src/panfrost/genxml/v14.xml | 1 + src/panfrost/lib/pan_fb.c | 63 +---- src/panfrost/lib/pan_fb.h | 54 ++++- src/panfrost/vulkan/csf/panvk_cmd_buffer.h | 76 +++++- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 226 +++++++++++++++++- .../vulkan/csf/panvk_vX_exception_handler.c | 34 ++- src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c | 7 +- 7 files changed, 381 insertions(+), 80 deletions(-) diff --git a/src/panfrost/genxml/v14.xml b/src/panfrost/genxml/v14.xml index d64dfd3d4dc..e41657089ae 100644 --- a/src/panfrost/genxml/v14.xml +++ b/src/panfrost/genxml/v14.xml @@ -1482,6 +1482,7 @@ + diff --git a/src/panfrost/lib/pan_fb.c b/src/panfrost/lib/pan_fb.c index 180c0aba8e8..38ba6d72e52 100644 --- a/src/panfrost/lib/pan_fb.c +++ b/src/panfrost/lib/pan_fb.c @@ -76,20 +76,6 @@ GENX(pan_select_fb_tile_size)(struct pan_fb_layout *fb) #endif } -/** - * Returns true if there's enough space in the tile buffer for at least two - * Z/S tiles. 
- */ -static inline bool -pan_fb_can_pipeline_zs(const struct pan_fb_layout *fb) -{ - const uint32_t z_B_per_px = sizeof(float) * fb->sample_count; - const uint32_t z_B_per_tile = z_B_per_px * fb->tile_size_px; - - /* The budget is already half the available Z space */ - return z_B_per_tile < fb->tile_z_budget_B; -} - static void align_fb_tiling_area_for_image_plane(struct pan_fb_layout *fb, struct pan_image_plane_ref pref) @@ -377,13 +363,6 @@ GENX(pan_fill_fb_info)(const struct pan_fb_desc_info *info, } #if PAN_ARCH >= 5 -static bool -target_has_clear(const struct pan_fb_load_target *target) -{ - return target->in_bounds_load == PAN_FB_LOAD_CLEAR || - target->border_load == PAN_FB_LOAD_CLEAR; -} - static enum mali_msaa translate_msaa_copy_op(const struct pan_fb_layout *fb, const struct pan_image_view *iview, @@ -414,11 +393,6 @@ translate_msaa_copy_op(const struct pan_fb_layout *fb, } } -struct pan_fb_clean_tile { - uint8_t rts; - bool zs, s; -}; - static bool pan_fb_load_target_always(const struct pan_fb_load_target *target) { @@ -435,8 +409,8 @@ pan_fb_store_target_always(const struct pan_fb_store_target *target) return target->store && target->always; } -static struct pan_fb_clean_tile -pan_fb_get_clean_tile(const struct pan_fb_desc_info *info) +struct pan_fb_clean_tile +GENX(pan_fb_get_clean_tile)(const struct pan_fb_desc_info *info) { const struct pan_fb_layout *fb = info->fb; const struct pan_fb_load *load = info->load; @@ -614,7 +588,7 @@ emit_rgb_rt_desc(const struct pan_fb_desc_info *info, cfg.clean_pixel_write_enable = !!(ct.rts & BITFIELD_BIT(rt)); #endif - if (load && target_has_clear(&load->rts[rt])) { + if (load && pan_target_has_clear(&load->rts[rt])) { uint32_t packed[4] = {}; pan_pack_color(GENX(pan_blendable_formats), packed, &load->rts[rt].clear.color, fb->rt_formats[rt], @@ -649,33 +623,12 @@ emit_rgb_rt_desc(const struct pan_fb_desc_info *info, pan_merge(rgb_rt, &desc, RGB_RENDER_TARGET); } -#if PAN_ARCH >= 6 -/* All GPUs starting from 
Bifrost are affected by issue TSIX-2033: - * - * Forcing clean_tile_writes breaks INTERSECT readbacks - * - * To workaround, use the pre-frame shader mode ALWAYS instead of INTERSECT if - * clean_tile_write_enable is set on either one of the color, depth or stencil - * buffers. Since INTERSECT is a hint that the hardware may ignore, this - * cannot affect correctness, only performance. */ - -static enum mali_pre_post_frame_shader_mode -pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode, - bool force_clean_tile) -{ - if (force_clean_tile && mode == MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT) - return MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS; - else - return mode; -} -#endif - static void emit_rts(const struct pan_fb_desc_info *info, struct mali_rgb_render_target_packed *rts) { const struct pan_fb_layout *fb = info->fb; - const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info); + const struct pan_fb_clean_tile ct = GENX(pan_fb_get_clean_tile)(info); uint32_t tile_rt_offset_B = 0; for (unsigned rt = 0; rt < fb->rt_count; rt++) { @@ -696,7 +649,7 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, const struct pan_fb_descs *out) { if (pan_fb_has_zs(info->fb)) { - emit_zs_crc_desc(info, pan_fb_get_clean_tile(info), out->zs_crc); + emit_zs_crc_desc(info, GENX(pan_fb_get_clean_tile)(info), out->zs_crc); } emit_rts(info, out->rts); @@ -711,7 +664,7 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, const struct pan_fb_layout *fb = info->fb; const struct pan_fb_load *load = info->load; const struct pan_fb_store *store = info->store; - const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info); + const struct pan_fb_clean_tile ct = GENX(pan_fb_get_clean_tile)(info); const bool has_zs_crc_ext = pan_fb_has_zs(fb); @@ -793,14 +746,14 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, if (fb->s_format != PIPE_FORMAT_NONE) { cfg.s_clear = - load && target_has_clear(&load->s) ? 
load->s.clear.stencil : 0; + load && pan_target_has_clear(&load->s) ? load->s.clear.stencil : 0; cfg.s_write_enable = store && store->s.store; } if (fb->z_format != PIPE_FORMAT_NONE) { cfg.z_internal_format = pan_get_z_internal_format(fb->z_format); cfg.z_clear = - load && target_has_clear(&load->z) ? load->z.clear.depth : 0; + load && pan_target_has_clear(&load->z) ? load->z.clear.depth : 0; cfg.z_write_enable = store && store->zs.store; } else { /* Default to 24 bit depth if there's no surface. */ diff --git a/src/panfrost/lib/pan_fb.h b/src/panfrost/lib/pan_fb.h index fb5cbf213e1..29f9602ae47 100644 --- a/src/panfrost/lib/pan_fb.h +++ b/src/panfrost/lib/pan_fb.h @@ -618,15 +618,67 @@ bool GENX(pan_fb_load_shader_key_fill)(struct pan_fb_shader_key *key, const struct pan_fb_load *load, bool zs_prepass); +#if PAN_ARCH >= 5 +struct pan_fb_clean_tile { + uint8_t rts; + bool zs, s; +}; + +struct pan_fb_clean_tile + GENX(pan_fb_get_clean_tile)(const struct pan_fb_desc_info *info); + +static inline bool +pan_target_has_clear(const struct pan_fb_load_target *target) +{ + return target->in_bounds_load == PAN_FB_LOAD_CLEAR || + target->border_load == PAN_FB_LOAD_CLEAR; +} +#endif /* PAN_ARCH >= 5 */ + #if PAN_ARCH >= 6 bool GENX(pan_fb_resolve_shader_key_fill)(struct pan_fb_shader_key *key, const struct pan_fb_layout *fb, const struct pan_fb_resolve *resolve); -#endif + +/* All GPUs starting from Bifrost are affected by issue TSIX-2033: + * + * Forcing clean_tile_writes breaks INTERSECT readbacks + * + * To workaround, use the pre-frame shader mode ALWAYS instead of INTERSECT if + * clean_tile_write_enable is set on either one of the color, depth or stencil + * buffers. Since INTERSECT is a hint that the hardware may ignore, this + * cannot affect correctness, only performance. 
*/ + +static inline enum mali_pre_post_frame_shader_mode +pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode, + bool force_clean_tile) +{ + if (force_clean_tile && mode == MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT) + return MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS; + else + return mode; +} +#endif /* PAN_ARCH >= 6 */ struct nir_shader * GENX(pan_get_fb_shader)(const struct pan_fb_shader_key *key, const struct nir_shader_compiler_options *nir_options); + +#if PAN_ARCH >= 13 +/** + * Returns true if there's enough space in the tile buffer for at least two + * Z/S tiles. + */ +static inline bool +pan_fb_can_pipeline_zs(const struct pan_fb_layout *fb) +{ + const uint32_t z_B_per_px = sizeof(float) * fb->sample_count; + const uint32_t z_B_per_tile = z_B_per_px * fb->tile_size_px; + + /* The budget is already half the available Z space */ + return z_B_per_tile < fb->tile_z_budget_B; +} +#endif #endif /* PAN_ARCH */ #endif /* __PAN_FB_H */ diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index 7e7e8922c88..315cadc0475 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -61,6 +61,37 @@ enum panvk_incremental_rendering_pass { PANVK_IR_PASS_COUNT }; +#if PAN_ARCH >= 14 +/* Framebuffer per-layer state. Keep this structure 64-byte aligned, since + * we want the adjacent ZS_CRC_EXTENSION and RENDER_TARGET descriptors + * aligned. */ +struct panvk_fb_layer_state { + /** GPU address to the tiler descriptor. */ + uint64_t tiler; + + /** Frame argument. */ + uint64_t frame_argument; + + /** An instance of Fragment Flags 0. */ + struct mali_fragment_flags_0_packed flags0; + + /** An instance of Fragment Flags 2. */ + struct mali_fragment_flags_2_packed flags2; + + /** Z clear value. */ + uint32_t z_clear; + + /** GPU address to the draw call descriptors. It may be 0. */ + uint64_t dcd_pointer; + + /** GPU address to the ZS_CRC_EXTENSION descriptor. It may be 0. 
*/ + uint64_t dbd_pointer; + + /** GPU address to the RENDER_TARGET descriptors. */ + uint64_t rtd_pointer; +} __attribute__((aligned(64))); +#endif /* PAN_ARCH >= 14 */ + static inline uint32_t get_tiler_oom_handler_idx(bool has_zs_ext, uint32_t rt_count) { @@ -74,7 +105,11 @@ static inline uint32_t get_fbd_size(bool has_zs_ext, uint32_t rt_count) { assert(rt_count >= 1 && rt_count <= MAX_RTS); +#if PAN_ARCH >= 14 + uint32_t fbd_size = ALIGN_POT(sizeof(struct panvk_fb_layer_state), 64); +#else uint32_t fbd_size = pan_size(FRAMEBUFFER); +#endif if (has_zs_ext) fbd_size += pan_size(ZS_CRC_EXTENSION); fbd_size += pan_size(RENDER_TARGET) * rt_count; @@ -209,13 +244,25 @@ enum panvk_cs_regs { PANVK_CS_REG_RUN_IDVS_SR_END = 60, #endif +#if PAN_ARCH >= 14 + /* RUN_FRAGMENT2 staging regs. + * SW ABI: + * - r58:59 contain the pointer to the first tiler descriptor. This is + * needed to gather completed heap chunks after a run_fragment2. + */ + PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0, + PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55, + PANVK_CS_REG_TILER_DESC_PTR = 58, +#else /* RUN_FRAGMENT staging regs. * SW ABI: - * - r38:39 contain the pointer to the first tiler descriptor. This is + * - r58:59 contain the pointer to the first tiler descriptor. This is * needed to gather completed heap chunks after a run_fragment. */ PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38, PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46, + PANVK_CS_REG_TILER_DESC_PTR = 58, +#endif /* RUN_COMPUTE staging regs. */ PANVK_CS_REG_RUN_COMPUTE_SR_START = 0, @@ -870,4 +917,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages, void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf, struct panvk_cs_deps deps); +#if PAN_ARCH >= 14 +static inline void +cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr) +{ + /* Emit the dynamic fragment state. This state may change per-layer. 
*/ + + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr, + offsetof(struct panvk_fb_layer_state, flags0)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr, + offsetof(struct panvk_fb_layer_state, flags2)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr, + offsetof(struct panvk_fb_layer_state, z_clear)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr, + offsetof(struct panvk_fb_layer_state, tiler)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr, + offsetof(struct panvk_fb_layer_state, rtd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr, + offsetof(struct panvk_fb_layer_state, dbd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr, + offsetof(struct panvk_fb_layer_state, frame_argument)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr, + offsetof(struct panvk_fb_layer_state, dcd_pointer)); + + cs_flush_loads(b); +} +#endif /* PAN_ARCH >= 14 */ + #endif /* PANVK_CMD_BUFFER_H */ diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index aec7c79ae53..d735d6ed8a5 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -1230,6 +1230,93 @@ get_tiler_context(struct panvk_cmd_buffer *cmdbuf, uint32_t layer) return tiler_ctx; } +#if PAN_ARCH >= 14 +static void +init_layer_fragment_state(const struct pan_fb_desc_info *info, + const struct pan_ptr fbd) +{ + const struct pan_fb_layout *fb = info->fb; + const struct pan_fb_load *load = info->load; + const struct pan_fb_store *store = info->store; + const struct pan_fb_clean_tile ct = GENX(pan_fb_get_clean_tile)(info); + const bool has_zs_crc_ext = pan_fb_has_zs(fb); + + struct panvk_fb_layer_state fbd_data = {0}; + fbd_data.tiler = info->tiler_ctx->valhall.desc; + + /* layer_index in flags0 is used to select the right primitive list in + * the tiler context, and 
frame_arg is the value that's passed to the + * fragment shader through r62-r63, which we use to pass gl_Layer. Since + * the layer_idx only takes 8-bits, we might use the extra 56-bits we + * have in frame_argument to pass other information to the fragment + * shader at some point. + */ + assert(info->layer >= info->tiler_ctx->valhall.layer_offset); + fbd_data.frame_argument = info->layer; + + pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) { + cfg.pre_frame_0 = pan_fix_frame_shader_mode(info->frame_shaders.modes[0], + ct.rts || ct.zs || ct.s); + cfg.pre_frame_1 = pan_fix_frame_shader_mode(info->frame_shaders.modes[1], + ct.rts || ct.zs || ct.s); + cfg.post_frame = info->frame_shaders.modes[2]; + + /* Enabling prepass without pipelining is generally not good for + * performance, so disable HSR in that case. + */ + cfg.hsr_prepass_enable = + info->allow_hsr_prepass && pan_fb_can_pipeline_zs(fb); + cfg.hsr_prepass_interleaving_enable = pan_fb_can_pipeline_zs(fb); + cfg.hsr_prepass_filter_enable = true; + cfg.hsr_hierarchical_optimizations_enable = true; + + cfg.internal_layer_index = + info->layer - info->tiler_ctx->valhall.layer_offset; + } + + pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) { + if (fb->s_format != PIPE_FORMAT_NONE) { + cfg.s_clear = + load && pan_target_has_clear(&load->s) ? load->s.clear.stencil : 0; + cfg.s_write_enable = store && store->s.store; + } + + if (fb->z_format != PIPE_FORMAT_NONE) { + cfg.z_internal_format = pan_get_z_internal_format(fb->z_format); + cfg.z_write_enable = store && store->zs.store; + } else { + cfg.z_internal_format = MALI_Z_INTERNAL_FORMAT_D24; + assert(!store || !store->zs.store); + } + } + + fbd_data.z_clear = + util_bitpack_float(fb->z_format != PIPE_FORMAT_NONE && load && load && + pan_target_has_clear(&load->z) + ? load->z.clear.depth + : 0); + + fbd_data.dcd_pointer = info->frame_shaders.dcd_pointer; + + /* Set the DBD and RTD pointers. Both must be 64-byte aligned. 
*/ + { + uint64_t out_gpu_addr = + fbd.gpu + ALIGN_POT(sizeof(struct panvk_fb_layer_state), 64); + + if (has_zs_crc_ext) { + fbd_data.dbd_pointer = out_gpu_addr; + assert(fbd_data.dbd_pointer % 64 == 0); + out_gpu_addr += pan_size(ZS_CRC_EXTENSION); + } + + fbd_data.rtd_pointer = out_gpu_addr; + assert(fbd_data.rtd_pointer % 64 == 0); + } + + memcpy(fbd.cpu, &fbd_data, sizeof(fbd_data)); +} +#endif /* PAN_ARCH >= 14 */ + static VkResult get_fb_descs(struct panvk_cmd_buffer *cmdbuf) { @@ -1245,8 +1332,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) uint32_t fbd_sz = calc_fbd_size(cmdbuf); uint32_t fbds_sz = enabled_layer_count * fbd_sz; - cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem( - cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); +#if PAN_ARCH >= 14 + const unsigned fbds_alignment = alignof(struct panvk_fb_layer_state); +#else + const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER); +#endif + cmdbuf->state.gfx.render.fbds = + panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment); if (!cmdbuf->state.gfx.render.fbds.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; @@ -1331,13 +1423,25 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) : fbd.cpu + fb_sz, }; uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)(&fbd_info, &fb_descs); +#if PAN_ARCH >= 14 + init_layer_fragment_state(&fbd_info, fbd); +#endif /* Make sure all FBDs have the same flags. */ assert(i == 0 || new_fbd_flags == fbd_flags); fbd_flags = new_fbd_flags; } +#if PAN_ARCH >= 14 + /* fbd_flags is unused on v14+. */ + assert(!fbd_flags); +#endif + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. 
+#else for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) { struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem( cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); @@ -1377,6 +1481,9 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) }; ASSERTED uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)(&fbd_info, &fb_descs); +#if PAN_ARCH >= 14 + init_layer_fragment_state(&fbd_info, fbd); +#endif /* Make sure all FBDs have the same flags. */ assert(new_fbd_flags == fbd_flags); @@ -1389,16 +1496,17 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) /* Wait for IR info push to complete */ cs_wait_slot(b, SB_ID(LS)); +#endif /* PAN_ARCH >= 14 */ bool unset_provoking_vertex = cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET; if (copy_fbds) { - struct cs_index cur_tiler = cs_reg64(b, 38); + struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR); struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); - struct cs_index fbd_idx = cs_reg32(b, 47); - struct cs_index src_fbd_ptr = cs_reg64(b, 48); - struct cs_index remaining_layers_in_td = cs_reg32(b, 50); + struct cs_index fbd_idx = cs_reg32(b, 60); + struct cs_index src_fbd_ptr = cs_reg64(b, 64); + struct cs_index remaining_layers_in_td = cs_reg32(b, 61); uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); @@ -1481,7 +1589,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_update_frag_ctx(b) { cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbds.gpu | fbd_flags); - cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler); + cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR), + cmdbuf->state.gfx.render.tiler); } /* If we don't know what provoking vertex mode the application wants yet, @@ -3321,6 +3430,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf) static void setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) { +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. 
+#else struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; const bool has_zs_ext = pan_fb_has_zs(fb); @@ -3365,6 +3477,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) TILER_OOM_CTX_FIELD_OFFSET(layer_count)); cs_flush_stores(b); +#endif /* PAN_ARCH >= 14 */ } static uint32_t @@ -3373,24 +3486,95 @@ pack_32_2x16(uint16_t lo, uint16_t hi) return (((uint32_t)hi) << 16) | (uint32_t)lo; } +#if PAN_ARCH >= 14 +static void +cs_emit_static_fragment_state(struct cs_builder *b, + struct panvk_cmd_buffer *cmdbuf) +{ + /* Emit the static fragment staging registers. These don't change per-layer. */ + + const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render; + const struct pan_fb_layout *fb = &render->fb.layout; + + const uint8_t sample_count = render->fb.layout.sample_count; + + const struct pan_fb_bbox fb_area_px = + pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px); + const struct pan_fb_bbox bbox_px = + pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px); + + assert(pan_fb_bbox_is_valid(fb->tiling_area_px)); + + struct mali_fragment_bounding_box_packed bbox; + pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) { + cfg.bound_min_x = bbox_px.min_x; + cfg.bound_min_y = bbox_px.min_y; + cfg.bound_max_x = bbox_px.max_x; + cfg.bound_max_y = bbox_px.max_y; + } + + struct mali_frame_size_packed frame_size; + pan_pack(&frame_size, FRAME_SIZE, cfg) { + cfg.width = fb->width_px; + cfg.height = fb->height_px; + } + + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), + bbox.opaque[0]); + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX), + bbox.opaque[1]); + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]); + cs_move64_to( + b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER), + dev->sample_positions->addr.dev + + 
pan_sample_positions_offset(pan_sample_pattern(sample_count))); + + /* Flags 1 */ + struct mali_fragment_flags_1_packed flags1; + pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) { + cfg.sample_count = fb->sample_count; + cfg.sample_pattern = pan_sample_pattern(fb->sample_count); + cfg.effective_tile_size = fb->tile_size_px; + cfg.point_sprite_coord_origin_max_y = false; + cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf); + + assert(fb->rt_count > 0); + cfg.render_target_count = fb->rt_count; + cfg.color_buffer_allocation = fb->tile_rt_alloc_B; + } + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]); + + /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */ +} +#endif /* PAN_ARCH >= 14 */ + static VkResult issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) { +#if PAN_ARCH < 14 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); +#endif const struct cs_tracing_ctx *tracing_ctx = &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing; - const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; /* Now initialize the fragment bits. */ + struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); cs_update_frag_ctx(b) { +#if PAN_ARCH >= 14 + cs_emit_static_fragment_state(b, cmdbuf); + cs_emit_layer_fragment_state(b, fbd_pointer); +#else + const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), pack_32_2x16(fb->tiling_area_px.min_x, fb->tiling_area_px.min_y)); cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX), pack_32_2x16(fb->tiling_area_px.max_x, fb->tiling_area_px.max_y)); +#endif } bool simul_use = @@ -3423,6 +3607,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) * state for this renderpass, so it's safe to enable. 
*/ struct cs_index addr_reg = cs_scratch_reg64(b, 0); struct cs_index length_reg = cs_scratch_reg32(b, 2); +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. +#else uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf); uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev + handler_idx * dev->tiler_oom.handler_stride; @@ -3430,6 +3617,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride); cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, length_reg); +#endif /* Wait for the tiling to be done before submitting the fragment job. */ wait_finish_tiling(cmdbuf); @@ -3444,8 +3632,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) * up. */ cs_move64_to(b, addr_reg, 0); cs_move32_to(b, length_reg, 0); +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. +#else cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, length_reg); +#endif /* Applications tend to forget to describe subpass dependencies, especially * when it comes to write -> read dependencies on attachments. 
The @@ -3461,8 +3653,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) } if (cmdbuf->state.gfx.render.layer_count <= 1) { +#if PAN_ARCH >= 14 + cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), + false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif } else { struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4); struct cs_index remaining_layers = cs_scratch_reg32(b, 4); @@ -3471,12 +3668,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) { cs_add32(b, remaining_layers, remaining_layers, -1); +#if PAN_ARCH >= 14 + cs_emit_layer_fragment_state(b, fbd_pointer); + cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false, + MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif cs_update_frag_ctx(b) - cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz); + cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz); } } @@ -3490,8 +3692,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4); struct cs_index completed_top = cs_scratch_reg64(b, 10); struct cs_index completed_bottom = cs_scratch_reg64(b, 12); - struct cs_index cur_tiler = cs_reg64(b, 38); - struct cs_index tiler_count = cs_reg32(b, 47); + struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR); + struct cs_index tiler_count = cs_reg32(b, 60); struct cs_index oq_chain = cs_scratch_reg64(b, 10); struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10); struct cs_index oq_syncobj = cs_scratch_reg64(b, 12); diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c index b4cf6855184..ff2c9d23c5b 100644 --- 
a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -13,8 +13,9 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg) { switch (reg) { /* The bbox is set up by the fragment subqueue, we should not modify it. */ - case 42: - case 43: + case MALI_FRAGMENT_SR_BBOX_MIN: + case MALI_FRAGMENT_SR_BBOX_MAX: + /* We should only load from the subqueue context. */ case PANVK_CS_REG_SUBQUEUE_CTX_START: case PANVK_CS_REG_SUBQUEUE_CTX_END: @@ -42,8 +43,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8), 8 * sizeof(uint32_t)); +#if PAN_ARCH >= 14 + const size_t fbd_size = ALIGN_POT(sizeof(struct panvk_fb_layer_state), 64); +#else + const size_t fbd_size = sizeof(struct mali_framebuffer_packed); +#endif + if (has_zs_ext) { - const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed); + const uint16_t dbd_offset = fbd_size; /* Copy the whole DBD. */ cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, @@ -57,8 +64,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, } const uint16_t rts_offset = - sizeof(struct mali_framebuffer_packed) + - (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0); + fbd_size + (has_zs_ext ? 
sizeof(struct mali_zs_crc_extension_packed) : 0); for (uint32_t rt = 0; rt < rt_count; rt++) { const uint16_t rt_offset = @@ -110,12 +116,14 @@ generate_tiler_oom_handler(struct panvk_device *dev, .tracebuf_addr_offset = offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs), }; - struct mali_framebuffer_pointer_packed fb_tag; +#if PAN_ARCH < 14 + struct mali_framebuffer_pointer_packed fb_tag; pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) { cfg.zs_crc_extension_present = has_zs_ext; cfg.render_target_count = rt_count; } +#endif cs_function_def(&b, &handler, handler_ctx) { struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b); @@ -140,7 +148,7 @@ generate_tiler_oom_handler(struct panvk_device *dev, struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4); /* The tiler pointer is pre-filled. */ - struct cs_index tiler_ptr = cs_reg64(&b, 38); + struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR); cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr)); @@ -176,11 +184,17 @@ generate_tiler_oom_handler(struct panvk_device *dev, cs_wait_slot(&b, SB_ID(LS)); /* Set FBD pointer to the scratch fbd */ - cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), - scratch_fbd_ptr_reg, fb_tag.opaque[0]); - + struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER); +#if PAN_ARCH >= 14 + cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0); + cs_emit_layer_fragment_state(&b, fbd_pointer); + cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false, + MALI_TILE_RENDER_ORDER_Z_ORDER); +#else + cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, fb_tag.opaque[0]); cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif /* Serialize run fragments since we reuse FBD for the runs */ cs_wait_slots(&b, dev->csf.sb.all_iters_mask); diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c index 
c4848fe575b..6d850a0373a 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c @@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue) tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size; alloc_info.size = get_fbd_size(true, MAX_RTS); - alloc_info.alignment = pan_alignment(FRAMEBUFFER); +#if PAN_ARCH >= 14 + const unsigned fbds_alignment = alignof(struct panvk_fb_layer_state); +#else + const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER); +#endif + alloc_info.alignment = fbds_alignment; tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) { result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,