panfrost: Take tiler memory budget into account in pan_select_tiler_hierarchy_mask
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

On v12+, the hardware reports support for 8 levels but
effectively only supports up to 4 levels.

In case more than 4 levels are used, the hardware falls back to a mask of 0xAA
when tile_size is 32x32 or lower, or 0xAC when tile_size is greater than 32x32.

This patch ensures that the bins fit inside our tiler budget, and
otherwise drops levels until they fit.

This also allows the hardware to decide the hierarchy on v12+
if we know it will fit.

This fixes "dEQP-GLES31.functional.fbo.no_attachments.maximums.all" and
"dEQP-GLES31.functional.fbo.no_attachments.maximums.size" on v12+, and
likely more tests if we were exhausting the memory budget.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Backport-to: 25.1
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34559>
(cherry picked from commit 92afeb37bf)
This commit is contained in:
Mary Guillemard 2025-04-17 12:16:09 +02:00 committed by Eric Engestrom
parent 5bae75e3a0
commit 066850bb3a
14 changed files with 152 additions and 66 deletions

View file

@ -144,7 +144,7 @@
"description": "panfrost: Take tiler memory budget into account in pan_select_tiler_hierarchy_mask",
"nominated": true,
"nomination_type": 4,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -687,23 +687,16 @@ csf_emit_tiler_desc(struct panfrost_batch *batch, const struct pan_fb_info *fb)
{
struct panfrost_context *ctx = batch->ctx;
struct panfrost_device *dev = pan_device(ctx->base.screen);
struct panfrost_screen *screen = pan_screen(ctx->base.screen);
if (!batch->csf.pending_tiler_desc)
return;
/* The tiler chunk starts with a header of 64 bytes */
pan_pack(batch->csf.pending_tiler_desc, TILER_CONTEXT, tiler) {
tiler.hierarchy_mask =
pan_select_tiler_hierarchy_mask(batch->key.width,
batch->key.height,
dev->tiler_features.max_levels);
/* Disable hierarchies falling under the effective tile size. */
uint32_t disable_hierarchies;
for (disable_hierarchies = 0;
fb->tile_size > (16 * 16) << (disable_hierarchies * 2);
disable_hierarchies++)
;
tiler.hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies);
tiler.hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)(
batch->key.width, batch->key.height, dev->tiler_features.max_levels,
fb->tile_size, screen->csf_tiler_heap.chunk_size - 64);
#if PAN_ARCH >= 12
tiler.effective_tile_size = fb->tile_size;

View file

@ -426,10 +426,10 @@ jm_emit_tiler_desc(struct panfrost_batch *batch)
t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
pan_cast_and_pack(t.cpu, TILER_CONTEXT, tiler) {
tiler.hierarchy_mask =
pan_select_tiler_hierarchy_mask(batch->key.width,
batch->key.height,
dev->tiler_features.max_levels);
/* On JM, we don't need to pass the tile_size as it only matters for v12+ */
tiler.hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)(
batch->key.width, batch->key.height, dev->tiler_features.max_levels, 0,
panfrost_bo_size(dev->tiler_heap));
tiler.fb_width = batch->key.width;
tiler.fb_height = batch->key.height;

View file

@ -9,7 +9,3 @@ dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash
# Seems to be a precision issues because of floor fp16 being dropped since v11 (and the conversion done as a result)
dEQP-GLES3.functional.shaders.builtin_functions.common.fract.vec2_lowp_vertex,Fail
dEQP-GLES31.functional.shaders.builtin_functions.common.fract.vec2_lowp_compute,Fail
# Issue with color / depth internal buffer overrun
dEQP-GLES31.functional.fbo.no_attachments.maximums.all,Crash
dEQP-GLES31.functional.fbo.no_attachments.maximums.size,Crash

View file

@ -7,7 +7,3 @@ dEQP-VK.pipeline.fast_linked_library.misc.interpolate_at_sample_no_sample_shadin
# Seems to be a precision issues because of floor fp16 being dropped since v11 (and the conversion done as a result)
dEQP-GLES3.functional.shaders.builtin_functions.common.fract.vec2_lowp_vertex,Fail
dEQP-GLES31.functional.shaders.builtin_functions.common.fract.vec2_lowp_compute,Fail
# Issue with color / depth internal buffer overrun
dEQP-GLES31.functional.fbo.no_attachments.maximums.all,Crash
dEQP-GLES31.functional.fbo.no_attachments.maximums.size,Crash

View file

@ -34,6 +34,8 @@
#include "pan_props.h"
#include "pan_texture.h"
#define PAN_BIN_LEVEL_COUNT 12
static unsigned
mod_to_block_fmt(uint64_t mod)
{
@ -1160,3 +1162,103 @@ GENX(pan_emit_fragment_job_payload)(const struct pan_fb_info *fb, uint64_t fbd,
}
}
#endif
#if PAN_ARCH >= 6
/*
 * Compute the size of the bin pointer array for a framebuffer of
 * width x height when the hierarchy levels selected by hierarchy_mask are
 * enabled.
 *
 * Level 0 uses 16x16 bins; each following level halves the bin count in both
 * directions (rounding up). The number of bins of every enabled level is
 * summed, rounded up to a multiple of 8, and multiplied by the size of one
 * bin pointer (16 on v12+, 8 before).
 */
static uint32_t
pan_calc_bins_pointer_size(uint32_t width, uint32_t height, uint32_t tile_size,
                           uint32_t hierarchy_mask)
{
   const uint32_t ptr_size = PAN_ARCH >= 12 ? 16 : 8;

   /* On v12+, hierarchy_mask is only honoured if at most 4 levels are used.
    * Otherwise another mask is selected: 0xAC with a tile_size greater than
    * 32x32, 0xAA with 32x32 and lower. */
   if ((hierarchy_mask == 0 || util_bitcount(hierarchy_mask) > 4) &&
       PAN_ARCH >= 12)
      hierarchy_mask = tile_size > 32 * 32 ? 0xAC : 0xAA;

   /* Walk the levels from finest to coarsest, carrying the per-level bin
    * grid dimensions along instead of storing them. */
   uint32_t level_bins_x = DIV_ROUND_UP(width, 16);
   uint32_t level_bins_y = DIV_ROUND_UP(height, 16);
   uint32_t total_bins = 0;

   for (uint32_t level = 0; level < PAN_BIN_LEVEL_COUNT; level++) {
      if (hierarchy_mask & (1u << level))
         total_bins += level_bins_x * level_bins_y;

      level_bins_x = DIV_ROUND_UP(level_bins_x, 2);
      level_bins_y = DIV_ROUND_UP(level_bins_y, 2);
   }

   return DIV_ROUND_UP(total_bins, 8) * 8 * ptr_size;
}
/*
 * Select the tiler hierarchy mask for a width x height framebuffer.
 *
 * max_levels is the number of hierarchy levels that may be enabled at once,
 * tile_size is the effective tile size expressed as an area (it is compared
 * against 16x16, 32x32, ...), and mem_budget is the amount of memory the bin
 * pointers must fit in, in the same unit as the value returned by
 * pan_calc_bins_pointer_size().
 *
 * Returns the selected mask. On v12+, 0 is returned when the mask the hardware
 * picks on its own already fits in mem_budget.
 */
unsigned
GENX(pan_select_tiler_hierarchy_mask)(unsigned width, unsigned height,
                                      unsigned max_levels, unsigned tile_size,
                                      unsigned mem_budget)
{
   /* On v12+, the hierarchy_mask is deprecated and letting the hardware decide
    * is preferred. We attempt to use hierarchy_mask of 0 in case the bins can
    * fit in our memory budget.
    */
   if (PAN_ARCH >= 12 &&
       pan_calc_bins_pointer_size(width, height, tile_size, 0) <= mem_budget)
      return 0;

   uint32_t max_fb_wh = MAX2(width, height);
   uint32_t last_hierarchy_bit = util_last_bit(DIV_ROUND_UP(max_fb_wh, 16));
   uint32_t hierarchy_mask = BITFIELD_MASK(max_levels);

   /* Always enable the level covering the whole FB, and disable the finest
    * levels if we don't have enough to cover everything.
    * This is suboptimal for small primitives, since it might force
    * primitives to be walked multiple times even if they don't cover the
    * the tile being processed. On the other hand, it's hard to guess
    * the draw pattern, so it's probably good enough for now.
    */
   if (last_hierarchy_bit > max_levels)
      hierarchy_mask <<= last_hierarchy_bit - max_levels;

   /* Disable hierarchies falling under the effective tile size. The walk is
    * bounded by the number of bin levels and uses an unsigned base so the
    * shift stays well-defined even for a nonsensical tile_size. */
   uint32_t disable_hierarchies;
   for (disable_hierarchies = 0;
        disable_hierarchies < PAN_BIN_LEVEL_COUNT &&
        tile_size > (16u * 16u) << (disable_hierarchies * 2);
        disable_hierarchies++)
      ;
   hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies);

   /* Disable the finest remaining hierarchies until the bins fit in our
    * budget. A size equal to the budget counts as fitting, consistently with
    * the v12+ fast path above and the assert below. */
   while (disable_hierarchies < PAN_BIN_LEVEL_COUNT) {
      uint32_t bins_ptr_size =
         pan_calc_bins_pointer_size(width, height, tile_size, hierarchy_mask);

      if (bins_ptr_size <= mem_budget)
         break;

      disable_hierarchies++;
      hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies);
   }

   /* We should fit in our budget at this point */
   assert(pan_calc_bins_pointer_size(width, height, tile_size,
                                     hierarchy_mask) <= mem_budget);

   /* Before v12, at least one hierarchy level must be enabled. */
   assert(hierarchy_mask != 0 || PAN_ARCH >= 12);

   return hierarchy_mask;
}
#endif

View file

@ -198,6 +198,13 @@ unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tiler_context *tiler_ctx,
void *out);
#if PAN_ARCH >= 6
unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height,
uint32_t max_levels,
uint32_t tile_size,
uint32_t mem_budget);
#endif
#if PAN_ARCH <= 9
void GENX(pan_emit_fragment_job_payload)(const struct pan_fb_info *fb,
uint64_t fbd, void *out);

View file

@ -79,25 +79,4 @@ panfrost_last_nonnull(uint64_t *ptrs, unsigned count)
return 0;
}
/*
 * Build a hierarchy mask of max_levels contiguous levels for a
 * width x height framebuffer.
 *
 * The level covering the whole FB is always enabled; when max_levels is not
 * enough to reach it from level 0, the finest levels are the ones left out.
 * This is suboptimal for small primitives, since it might force primitives
 * to be walked multiple times even if they don't cover the tile being
 * processed. On the other hand, it's hard to guess the draw pattern, so it's
 * probably good enough for now.
 */
static inline uint32_t
pan_select_tiler_hierarchy_mask(unsigned width, unsigned height,
                                unsigned max_levels)
{
   const uint32_t largest_dim = MAX2(width, height);
   const uint32_t top_level = util_last_bit(DIV_ROUND_UP(largest_dim, 16));
   const uint32_t base_mask = BITFIELD_MASK(max_levels);

   /* Enough levels to start at the finest one: no shift required. */
   if (top_level <= max_levels)
      return base_mask;

   /* Otherwise slide the window up so that it ends on the top level. */
   return base_mask << (top_level - max_levels);
}
#endif /* PAN_UTIL_H */

View file

@ -820,8 +820,9 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
unsigned max_levels = tiler_features.max_levels;
assert(max_levels >= 2);
cfg.hierarchy_mask =
panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx);
/* The tiler chunk starts with a header of 64 bytes */
cfg.hierarchy_mask = panvk_select_tiler_hierarchy_mask(
phys_dev, &cmdbuf->state.gfx, phys_dev->csf.tiler.chunk_size - 64);
cfg.fb_width = fbinfo->width;
cfg.fb_height = fbinfo->height;

View file

@ -641,6 +641,8 @@ static VkResult
init_tiler(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
VkResult result;
@ -659,13 +661,13 @@ init_tiler(struct panvk_queue *queue)
goto err_free_desc;
}
tiler_heap->chunk_size = 2 * 1024 * 1024;
tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size;
struct drm_panthor_tiler_heap_create thc = {
.vm_id = pan_kmod_vm_handle(dev->kmod.vm),
.chunk_size = tiler_heap->chunk_size,
.initial_chunk_count = 5,
.max_chunks = 64,
.initial_chunk_count = phys_dev->csf.tiler.initial_chunks,
.max_chunks = phys_dev->csf.tiler.max_chunks,
.target_in_flight = 65535,
};

View file

@ -255,8 +255,8 @@ panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf,
}
pan_pack(&batch->tiler.ctx_templ, TILER_CONTEXT, cfg) {
cfg.hierarchy_mask =
panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx);
cfg.hierarchy_mask = panvk_select_tiler_hierarchy_mask(
phys_dev, &cmdbuf->state.gfx, pan_kmod_bo_size(dev->tiler_heap->bo));
cfg.fb_width = fbinfo->width;
cfg.fb_height = fbinfo->height;
cfg.heap = batch->tiler.heap_desc.gpu;

View file

@ -200,23 +200,16 @@ struct panvk_cmd_graphics_state {
static inline uint32_t
panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev,
const struct panvk_cmd_graphics_state *state)
const struct panvk_cmd_graphics_state *state,
unsigned bin_ptr_mem_budget)
{
struct panfrost_tiler_features tiler_features =
panfrost_query_tiler_features(&phys_dev->kmod.props);
uint32_t hierarchy_mask =
pan_select_tiler_hierarchy_mask(state->render.fb.info.width,
state->render.fb.info.height,
tiler_features.max_levels);
/* Disable hierarchies falling under the effective tile size. */
uint32_t disable_hierarchies;
for (disable_hierarchies = 0; state->render.fb.info.tile_size >
(16 * 16) << (disable_hierarchies * 2);
disable_hierarchies++)
;
hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies);
uint32_t hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)(
state->render.fb.info.width, state->render.fb.info.height,
tiler_features.max_levels, state->render.fb.info.tile_size,
bin_ptr_mem_budget);
return hierarchy_mask;
}

View file

@ -1140,6 +1140,13 @@ panvk_physical_device_init(struct panvk_physical_device *device,
if (result != VK_SUCCESS)
goto fail;
if (arch >= 10) {
/* XXX: Make dri options for those */
device->csf.tiler.chunk_size = 2 * 1024 * 1024;
device->csf.tiler.initial_chunks = 5;
device->csf.tiler.max_chunks = 64;
}
if (arch != 10)
vk_warn_non_conformant_implementation("panvk");

View file

@ -34,6 +34,16 @@ struct panvk_physical_device {
const struct panfrost_model *model;
union {
struct {
struct {
uint32_t chunk_size;
uint32_t initial_chunks;
uint32_t max_chunks;
} tiler;
} csf;
};
struct {
dev_t primary_rdev;
dev_t render_rdev;