From 066850bb3aebff2cd0b9f69a7a2cb3cf84778607 Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Thu, 17 Apr 2025 12:16:09 +0200 Subject: [PATCH] panfrost: Take tiler memory budget into account in pan_select_tiler_hierarchy_mask On v12+, the hardware reports support for 8 levels but effectively only supports up to 4 levels. In case more than 4 levels are used, it will default to 0xAA when tile_size is 32x32 or lower, otherwise 0xAC when the tile_size is greater than 32x32. This patch makes sure that the bins can fit inside our tiler budget and otherwise drops levels until they fit. This also allows the hardware to decide the hierarchy on v12+ if we know it will fit. This fixes "dEQP-GLES31.functional.fbo.no_attachments.maximums.all" and "dEQP-GLES31.functional.fbo.no_attachments.maximums.size" on v12+ but also likely more if we were exhausting the memory budget. Signed-off-by: Mary Guillemard Reviewed-by: Boris Brezillon Backport-to: 25.1 Part-of: (cherry picked from commit 92afeb37bf0ab61846323aa2969b703899dbcb8f) --- .pick_status.json | 2 +- src/gallium/drivers/panfrost/pan_csf.c | 17 +--- src/gallium/drivers/panfrost/pan_jm.c | 8 +- src/panfrost/ci/panfrost-g720-fails.txt | 4 - src/panfrost/ci/panfrost-g725-fails.txt | 4 - src/panfrost/lib/pan_desc.c | 102 +++++++++++++++++++ src/panfrost/lib/pan_desc.h | 7 ++ src/panfrost/lib/pan_util.h | 21 ---- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 5 +- src/panfrost/vulkan/csf/panvk_vX_queue.c | 8 +- src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c | 4 +- src/panfrost/vulkan/panvk_cmd_draw.h | 19 ++-- src/panfrost/vulkan/panvk_physical_device.c | 7 ++ src/panfrost/vulkan/panvk_physical_device.h | 10 ++ 14 files changed, 152 insertions(+), 66 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index f0c86acbb41..fbfc0647364 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -144,7 +144,7 @@ "description": "panfrost: Take tiler memory budget into account in pan_select_tiler_hierarchy_mask", 
"nominated": true, "nomination_type": 4, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 6c7345a2663..f2633b157d5 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -687,23 +687,16 @@ csf_emit_tiler_desc(struct panfrost_batch *batch, const struct pan_fb_info *fb) { struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_screen *screen = pan_screen(ctx->base.screen); if (!batch->csf.pending_tiler_desc) return; + /* The tiler chunk start with a header of 64 bytes */ pan_pack(batch->csf.pending_tiler_desc, TILER_CONTEXT, tiler) { - tiler.hierarchy_mask = - pan_select_tiler_hierarchy_mask(batch->key.width, - batch->key.height, - dev->tiler_features.max_levels); - - /* Disable hierarchies falling under the effective tile size. */ - uint32_t disable_hierarchies; - for (disable_hierarchies = 0; - fb->tile_size > (16 * 16) << (disable_hierarchies * 2); - disable_hierarchies++) - ; - tiler.hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies); + tiler.hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)( + batch->key.width, batch->key.height, dev->tiler_features.max_levels, + fb->tile_size, screen->csf_tiler_heap.chunk_size - 64); #if PAN_ARCH >= 12 tiler.effective_tile_size = fb->tile_size; diff --git a/src/gallium/drivers/panfrost/pan_jm.c b/src/gallium/drivers/panfrost/pan_jm.c index 0babf81e479..1b49e08e997 100644 --- a/src/gallium/drivers/panfrost/pan_jm.c +++ b/src/gallium/drivers/panfrost/pan_jm.c @@ -426,10 +426,10 @@ jm_emit_tiler_desc(struct panfrost_batch *batch) t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); pan_cast_and_pack(t.cpu, TILER_CONTEXT, tiler) { - tiler.hierarchy_mask = - pan_select_tiler_hierarchy_mask(batch->key.width, - batch->key.height, - dev->tiler_features.max_levels); + /* On 
JM, we don't care of passing the tile_size as it only matters for v12+ */ + tiler.hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)( + batch->key.width, batch->key.height, dev->tiler_features.max_levels, 0, + panfrost_bo_size(dev->tiler_heap)); tiler.fb_width = batch->key.width; tiler.fb_height = batch->key.height; diff --git a/src/panfrost/ci/panfrost-g720-fails.txt b/src/panfrost/ci/panfrost-g720-fails.txt index 195f7900dd7..b53fd5d6293 100644 --- a/src/panfrost/ci/panfrost-g720-fails.txt +++ b/src/panfrost/ci/panfrost-g720-fails.txt @@ -9,7 +9,3 @@ dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash # Seems to be a precision issues because of floor fp16 being dropped since v11 (and the conversion done as a result) dEQP-GLES3.functional.shaders.builtin_functions.common.fract.vec2_lowp_vertex,Fail dEQP-GLES31.functional.shaders.builtin_functions.common.fract.vec2_lowp_compute,Fail - -# Issue with color / depth internal buffer overrun -dEQP-GLES31.functional.fbo.no_attachments.maximums.all,Crash -dEQP-GLES31.functional.fbo.no_attachments.maximums.size,Crash diff --git a/src/panfrost/ci/panfrost-g725-fails.txt b/src/panfrost/ci/panfrost-g725-fails.txt index 0b40e0a1a2c..1f197a9a466 100644 --- a/src/panfrost/ci/panfrost-g725-fails.txt +++ b/src/panfrost/ci/panfrost-g725-fails.txt @@ -7,7 +7,3 @@ dEQP-VK.pipeline.fast_linked_library.misc.interpolate_at_sample_no_sample_shadin # Seems to be a precision issues because of floor fp16 being dropped since v11 (and the conversion done as a result) dEQP-GLES3.functional.shaders.builtin_functions.common.fract.vec2_lowp_vertex,Fail dEQP-GLES31.functional.shaders.builtin_functions.common.fract.vec2_lowp_compute,Fail - -# Issue with color / depth internal buffer overrun -dEQP-GLES31.functional.fbo.no_attachments.maximums.all,Crash -dEQP-GLES31.functional.fbo.no_attachments.maximums.size,Crash diff --git a/src/panfrost/lib/pan_desc.c b/src/panfrost/lib/pan_desc.c index b1a13af00fe..79b05e99852 
100644 --- a/src/panfrost/lib/pan_desc.c +++ b/src/panfrost/lib/pan_desc.c @@ -34,6 +34,8 @@ #include "pan_props.h" #include "pan_texture.h" +#define PAN_BIN_LEVEL_COUNT 12 + static unsigned mod_to_block_fmt(uint64_t mod) { @@ -1160,3 +1162,103 @@ GENX(pan_emit_fragment_job_payload)(const struct pan_fb_info *fb, uint64_t fbd, } } #endif + +#if PAN_ARCH >= 6 +static uint32_t +pan_calc_bins_pointer_size(uint32_t width, uint32_t height, uint32_t tile_size, + uint32_t hierarchy_mask) +{ + const uint32_t bin_ptr_size = PAN_ARCH >= 12 ? 16 : 8; + + uint32_t bins_x[PAN_BIN_LEVEL_COUNT]; + uint32_t bins_y[PAN_BIN_LEVEL_COUNT]; + uint32_t bins[PAN_BIN_LEVEL_COUNT]; + uint32_t bins_enabled; + + /* On v12+, hierarchy_mask is only used if 4 levels are used at most, + * otherwise it selects another mask (0xAC with a tile_size greater than + * 32x32, 0xAA with 32x32 and lower) */ + if ((hierarchy_mask == 0 || util_bitcount(hierarchy_mask) > 4) && + PAN_ARCH >= 12) { + if (tile_size > 32 * 32) + hierarchy_mask = 0xAC; + else + hierarchy_mask = 0xAA; + } + + bins_x[0] = DIV_ROUND_UP(width, 16); + bins_y[0] = DIV_ROUND_UP(height, 16); + bins[0] = bins_x[0] * bins_y[0]; + + for (uint32_t i = 1; i < ARRAY_SIZE(bins); i++) { + bins_x[i] = DIV_ROUND_UP(bins_x[i - 1], 2); + bins_y[i] = DIV_ROUND_UP(bins_y[i - 1], 2); + bins[i] = bins_x[i] * bins_y[i]; + } + + bins_enabled = 0; + for (uint32_t i = 0; i < ARRAY_SIZE(bins); i++) { + if ((hierarchy_mask & (1 << i)) != 0) + bins_enabled += bins[i]; + } + + return DIV_ROUND_UP(bins_enabled, 8) * 8 * bin_ptr_size; +} + +unsigned +GENX(pan_select_tiler_hierarchy_mask)(unsigned width, unsigned height, + unsigned max_levels, unsigned tile_size, + unsigned mem_budget) +{ + /* On v12+, the hierarchy_mask is deprecated and letting the hardware decide + * is preferred. We attempt to use hierarchy_mask of 0 in case the bins can + * fit in our memory budget. 
+ */ + if (PAN_ARCH >= 12 && + pan_calc_bins_pointer_size(width, height, tile_size, 0) <= mem_budget) + return 0; + + uint32_t max_fb_wh = MAX2(width, height); + uint32_t last_hierarchy_bit = util_last_bit(DIV_ROUND_UP(max_fb_wh, 16)); + uint32_t hierarchy_mask = BITFIELD_MASK(max_levels); + + /* Always enable the level covering the whole FB, and disable the finest + * levels if we don't have enough to cover everything. + * This is suboptimal for small primitives, since it might force + * primitives to be walked multiple times even if they don't cover the + * tile being processed. On the other hand, it's hard to guess + * the draw pattern, so it's probably good enough for now. + */ + if (last_hierarchy_bit > max_levels) + hierarchy_mask <<= last_hierarchy_bit - max_levels; + + /* Disable hierarchies falling under the effective tile size. */ + uint32_t disable_hierarchies; + for (disable_hierarchies = 0; + tile_size > (16 * 16) << (disable_hierarchies * 2); + disable_hierarchies++) + ; + hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies); + + /* Disable hierarchies until the bins fit in our budget */ + while (disable_hierarchies < PAN_BIN_LEVEL_COUNT) { + uint32_t bins_ptr_size = + pan_calc_bins_pointer_size(width, height, tile_size, hierarchy_mask); + + if (bins_ptr_size < mem_budget) + break; + + disable_hierarchies++; + hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies); + } + + /* We should fit in our budget at this point */ + assert(pan_calc_bins_pointer_size(width, height, tile_size, + hierarchy_mask) <= mem_budget); + + /* Before v12, at least one hierarchy level must be enabled. 
*/ + assert(hierarchy_mask != 0 || PAN_ARCH >= 12); + + return hierarchy_mask; +} +#endif diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h index 3b91e8ae6d5..0072a6d0bf2 100644 --- a/src/panfrost/lib/pan_desc.h +++ b/src/panfrost/lib/pan_desc.h @@ -198,6 +198,13 @@ unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, const struct pan_tiler_context *tiler_ctx, void *out); +#if PAN_ARCH >= 6 +unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height, + uint32_t max_levels, + uint32_t tile_size, + uint32_t mem_budget); +#endif + #if PAN_ARCH <= 9 void GENX(pan_emit_fragment_job_payload)(const struct pan_fb_info *fb, uint64_t fbd, void *out); diff --git a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h index a4b8deb37d1..8b0deb7f100 100644 --- a/src/panfrost/lib/pan_util.h +++ b/src/panfrost/lib/pan_util.h @@ -79,25 +79,4 @@ panfrost_last_nonnull(uint64_t *ptrs, unsigned count) return 0; } -static inline uint32_t -pan_select_tiler_hierarchy_mask(unsigned width, unsigned height, - unsigned max_levels) -{ - uint32_t max_fb_wh = MAX2(width, height); - uint32_t last_hierarchy_bit = util_last_bit(DIV_ROUND_UP(max_fb_wh, 16)); - uint32_t hierarchy_mask = BITFIELD_MASK(max_levels); - - /* Always enable the level covering the whole FB, and disable the finest - * levels if we don't have enough to cover everything. - * This is suboptimal for small primitives, since it might force - * primitives to be walked multiple times even if they don't cover the - * the tile being processed. On the other hand, it's hard to guess - * the draw pattern, so it's probably good enough for now. 
- */ - if (last_hierarchy_bit > max_levels) - hierarchy_mask <<= last_hierarchy_bit - max_levels; - - return hierarchy_mask; -} - #endif /* PAN_UTIL_H */ diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index ff45e337a15..62306ab9a40 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -820,8 +820,9 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) unsigned max_levels = tiler_features.max_levels; assert(max_levels >= 2); - cfg.hierarchy_mask = - panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx); + /* The tiler chunk start with a header of 64 bytes */ + cfg.hierarchy_mask = panvk_select_tiler_hierarchy_mask( + phys_dev, &cmdbuf->state.gfx, phys_dev->csf.tiler.chunk_size - 64); cfg.fb_width = fbinfo->width; cfg.fb_height = fbinfo->height; diff --git a/src/panfrost/vulkan/csf/panvk_vX_queue.c b/src/panfrost/vulkan/csf/panvk_vX_queue.c index 4e89734d54a..7d7007fa57f 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_queue.c @@ -641,6 +641,8 @@ static VkResult init_tiler(struct panvk_queue *queue) { struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(dev->vk.physical); struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap; VkResult result; @@ -659,13 +661,13 @@ init_tiler(struct panvk_queue *queue) goto err_free_desc; } - tiler_heap->chunk_size = 2 * 1024 * 1024; + tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size; struct drm_panthor_tiler_heap_create thc = { .vm_id = pan_kmod_vm_handle(dev->kmod.vm), .chunk_size = tiler_heap->chunk_size, - .initial_chunk_count = 5, - .max_chunks = 64, + .initial_chunk_count = phys_dev->csf.tiler.initial_chunks, + .max_chunks = phys_dev->csf.tiler.max_chunks, .target_in_flight = 65535, }; diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c 
b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c index 8d4e662cc94..d521f88d79a 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c @@ -255,8 +255,8 @@ panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf, } pan_pack(&batch->tiler.ctx_templ, TILER_CONTEXT, cfg) { - cfg.hierarchy_mask = - panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx); + cfg.hierarchy_mask = panvk_select_tiler_hierarchy_mask( + phys_dev, &cmdbuf->state.gfx, pan_kmod_bo_size(dev->tiler_heap->bo)); cfg.fb_width = fbinfo->width; cfg.fb_height = fbinfo->height; cfg.heap = batch->tiler.heap_desc.gpu; diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index c8d30a206b2..aae044e48f0 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -200,23 +200,16 @@ struct panvk_cmd_graphics_state { static inline uint32_t panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev, - const struct panvk_cmd_graphics_state *state) + const struct panvk_cmd_graphics_state *state, + unsigned bin_ptr_mem_budget) { struct panfrost_tiler_features tiler_features = panfrost_query_tiler_features(&phys_dev->kmod.props); - uint32_t hierarchy_mask = - pan_select_tiler_hierarchy_mask(state->render.fb.info.width, - state->render.fb.info.height, - tiler_features.max_levels); - - /* Disable hierarchies falling under the effective tile size. 
*/ - uint32_t disable_hierarchies; - for (disable_hierarchies = 0; state->render.fb.info.tile_size > - (16 * 16) << (disable_hierarchies * 2); - disable_hierarchies++) - ; - hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies); + uint32_t hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)( + state->render.fb.info.width, state->render.fb.info.height, + tiler_features.max_levels, state->render.fb.info.tile_size, + bin_ptr_mem_budget); return hierarchy_mask; } diff --git a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c index 968180a9994..b599dc2a8f5 100644 --- a/src/panfrost/vulkan/panvk_physical_device.c +++ b/src/panfrost/vulkan/panvk_physical_device.c @@ -1140,6 +1140,13 @@ panvk_physical_device_init(struct panvk_physical_device *device, if (result != VK_SUCCESS) goto fail; + if (arch >= 10) { + /* XXX: Make dri options for thoses */ + device->csf.tiler.chunk_size = 2 * 1024 * 1024; + device->csf.tiler.initial_chunks = 5; + device->csf.tiler.max_chunks = 64; + } + if (arch != 10) vk_warn_non_conformant_implementation("panvk"); diff --git a/src/panfrost/vulkan/panvk_physical_device.h b/src/panfrost/vulkan/panvk_physical_device.h index ebe81284b3e..cc5694a2138 100644 --- a/src/panfrost/vulkan/panvk_physical_device.h +++ b/src/panfrost/vulkan/panvk_physical_device.h @@ -34,6 +34,16 @@ struct panvk_physical_device { const struct panfrost_model *model; + union { + struct { + struct { + uint32_t chunk_size; + uint32_t initial_chunks; + uint32_t max_chunks; + } tiler; + } csf; + }; + struct { dev_t primary_rdev; dev_t render_rdev;