mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 09:38:07 +02:00
panfrost: Integrate kernel names for tiler FBD
These names are from the replay workaround in kbase; they begin to shine some light on the meaning of these fields. In particular, we now understand why the "tiler_meta" field has the effect it does on performance in certain scenes (controlling tile granularity). Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
This commit is contained in:
parent
1a7caac9e9
commit
85e745f2b4
3 changed files with 65 additions and 47 deletions
|
|
@ -2,6 +2,7 @@
|
|||
* © Copyright 2017-2018 Alyssa Rosenzweig
|
||||
* © Copyright 2017-2018 Connor Abbott
|
||||
* © Copyright 2017-2018 Lyude Paul
|
||||
* © Copyright 2019 Collabora
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
|
@ -1362,16 +1363,16 @@ struct mali_single_framebuffer {
|
|||
u32 zero6[7];
|
||||
|
||||
/* Very weird format, see generation code in trans_builder.c */
|
||||
u32 resolution_check;
|
||||
|
||||
u32 tiler_resolution_check;
|
||||
u32 tiler_flags;
|
||||
|
||||
u64 unknown_address_1; /* Pointing towards... a zero buffer? */
|
||||
u64 unknown_address_2;
|
||||
/* Guesses? */
|
||||
mali_ptr tiler_scratch_start; /* Pointing towards... a zero buffer? */
|
||||
mali_ptr tiler_scratch_middle;
|
||||
|
||||
/* See mali_kbase_replay.c */
|
||||
u64 tiler_heap_free;
|
||||
u64 tiler_heap_end;
|
||||
mali_ptr tiler_heap_free;
|
||||
mali_ptr tiler_heap_end;
|
||||
|
||||
/* More below this, maybe */
|
||||
} __attribute__((packed));
|
||||
|
|
@ -1519,18 +1520,29 @@ struct bifrost_framebuffer {
|
|||
u32 clear_stencil : 8;
|
||||
u32 unk3 : 24; // = 0x100
|
||||
float clear_depth;
|
||||
mali_ptr tiler_meta;
|
||||
/* 0x40 */
|
||||
|
||||
|
||||
/* Tiler section begins here */
|
||||
u32 tiler_unknown;
|
||||
|
||||
/* Name known from the replay workaround in the kernel. What exactly is
|
||||
* flagged here is less known. We do know that (tiler_flags & 0x1ff)
|
||||
* specifies a mask of hierarchy weights, which explains some of the
|
||||
* performance mysteries around setting it. We also know (1 << 16)
|
||||
* should be set, but there's no explanation in the kernel why. */
|
||||
u32 tiler_flags;
|
||||
|
||||
/* Note: these are guesses! */
|
||||
mali_ptr tiler_scratch_start;
|
||||
mali_ptr tiler_scratch_middle;
|
||||
|
||||
/* These are not, since we see symmetry with replay jobs which name these explicitly */
|
||||
mali_ptr tiler_heap_start;
|
||||
/* These are not, since we see symmetry with replay
|
||||
* jobs which name these explicitly */
|
||||
|
||||
mali_ptr tiler_heap_start; /* tiler heap_free_address */
|
||||
mali_ptr tiler_heap_end;
|
||||
|
||||
u64 zero9, zero10, zero11, zero12;
|
||||
u32 tiler_weights[8];
|
||||
|
||||
/* optional: struct bifrost_fb_extra extra */
|
||||
/* struct bifrost_render_target rts[] */
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ panfrost_set_framebuffer_resolution(struct mali_single_framebuffer *fb, int w, i
|
|||
* The formula itself was discovered mostly by manual bruteforce and
|
||||
* aggressive algebraic simplification. */
|
||||
|
||||
fb->resolution_check = ((w + h) / 3) << 4;
|
||||
fb->tiler_resolution_check = ((w + h) / 3) << 4;
|
||||
}
|
||||
|
||||
struct mali_single_framebuffer
|
||||
|
|
@ -118,8 +118,8 @@ panfrost_emit_sfbd(struct panfrost_context *ctx)
|
|||
.format = 0x30000000,
|
||||
.clear_flags = 0x1000,
|
||||
.unknown_address_0 = ctx->scratchpad.gpu,
|
||||
.unknown_address_1 = ctx->misc_0.gpu,
|
||||
.unknown_address_2 = ctx->misc_0.gpu + 40960,
|
||||
.tiler_scratch_start = ctx->misc_0.gpu,
|
||||
.tiler_scratch_middle = ctx->misc_0.gpu + 40960,
|
||||
.tiler_flags = 0xf0,
|
||||
.tiler_heap_free = ctx->tiler_heap.gpu,
|
||||
.tiler_heap_end = ctx->tiler_heap.gpu + ctx->tiler_heap.size,
|
||||
|
|
@ -134,28 +134,22 @@ struct bifrost_framebuffer
|
|||
panfrost_emit_mfbd(struct panfrost_context *ctx)
|
||||
{
|
||||
struct bifrost_framebuffer framebuffer = {
|
||||
/* It is not yet clear what tiler_meta means or how it's
|
||||
* calculated, but we can tell the lower 32-bits are a
|
||||
* (monotonically increasing?) function of tile count and
|
||||
* geometry complexity; I suspect it defines a memory size of
|
||||
* some kind? for the tiler. It's really unclear at the
|
||||
* moment... but to add to the confusion, the hardware is happy
|
||||
* enough to accept a zero in this field, so we don't even have
|
||||
* to worry about it right now.
|
||||
*
|
||||
* The byte (just after the 32-bit mark) is much more
|
||||
* interesting. The higher nibble I've only ever seen as 0xF,
|
||||
* but the lower one I've seen as 0x0 or 0xF, and it's not
|
||||
* obvious what the difference is. But what -is- obvious is
|
||||
* that when the lower nibble is zero, performance is severely
|
||||
* degraded compared to when the lower nibble is set.
|
||||
* Evidently, that nibble enables some sort of fast path,
|
||||
* perhaps relating to caching or tile flush? Regardless, at
|
||||
* this point there's no clear reason not to set it, aside from
|
||||
* substantially increased memory requirements (of the misc_0
|
||||
* buffer) */
|
||||
/* It is not yet clear what this means or how it's
|
||||
* calculated, but we can tell it is a (monotonically
|
||||
* increasing?) function of tile count and geometry complexity;
|
||||
* I suspect it defines a memory size of some kind? for the
|
||||
* tiler. It's really unclear at the moment... but to add to
|
||||
* the confusion, the hardware is happy enough to accept a zero
|
||||
* in this field, so we don't even have to worry about it right
|
||||
* now. */
|
||||
|
||||
.tiler_meta = ((uint64_t) 0xff << 32) | 0x0,
|
||||
.tiler_unknown = 0x0,
|
||||
|
||||
/* The lower 0xff controls the hierarchy mask. Set more bits
|
||||
* on for more tile granularity (which can be a performance win
|
||||
* on some scenes, at memory bandwidth costs). For now, be lazy
|
||||
* and enable everything. This might be a terrible idea. */
|
||||
.tiler_flags = 0xff,
|
||||
|
||||
.width1 = MALI_POSITIVE(ctx->pipe_framebuffer.width),
|
||||
.height1 = MALI_POSITIVE(ctx->pipe_framebuffer.height),
|
||||
|
|
|
|||
|
|
@ -463,10 +463,10 @@ pandecode_replay_sfbd(uint64_t gpu_va, int job_no)
|
|||
}
|
||||
|
||||
MEMORY_PROP(s, unknown_address_0);
|
||||
MEMORY_PROP(s, unknown_address_1);
|
||||
MEMORY_PROP(s, unknown_address_2);
|
||||
MEMORY_PROP(s, tiler_scratch_start);
|
||||
MEMORY_PROP(s, tiler_scratch_middle);
|
||||
|
||||
pandecode_prop("resolution_check = 0x%" PRIx32, s->resolution_check);
|
||||
pandecode_prop("tiler_resolution_check = 0x%" PRIx32, s->tiler_resolution_check);
|
||||
pandecode_prop("tiler_flags = 0x%" PRIx32, s->tiler_flags);
|
||||
|
||||
MEMORY_PROP(s, tiler_heap_free);
|
||||
|
|
@ -640,12 +640,12 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
|
|||
if (fb->sample_locations)
|
||||
pandecode_prop("sample_locations = sample_locations_%d", job_no);
|
||||
|
||||
/* Assume that unknown1 and tiler_meta were emitted in the last job for
|
||||
/* Assume that unknown1 was emitted in the last job for
|
||||
* now */
|
||||
/*pandecode_prop("unknown1 = unknown1_%d_p", job_no - 1);
|
||||
pandecode_prop("tiler_meta = tiler_meta_%d_p", job_no - 1);*/
|
||||
MEMORY_PROP(fb, unknown1);
|
||||
MEMORY_PROP(fb, tiler_meta);
|
||||
|
||||
pandecode_prop("tiler_unknown = 0x%x", fb->tiler_unknown);
|
||||
pandecode_prop("tiler_flags = 0x%x", fb->tiler_flags);
|
||||
|
||||
pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1);
|
||||
pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1);
|
||||
|
|
@ -668,14 +668,26 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
|
|||
MEMORY_PROP(fb, tiler_heap_start);
|
||||
MEMORY_PROP(fb, tiler_heap_end);
|
||||
|
||||
if (fb->zero3 || fb->zero4 || fb->zero9 || fb->zero10 || fb->zero11 || fb->zero12) {
|
||||
if (fb->zero3 || fb->zero4) {
|
||||
pandecode_msg("framebuffer zeros tripped\n");
|
||||
pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3);
|
||||
pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4);
|
||||
pandecode_prop("zero9 = 0x%" PRIx64, fb->zero9);
|
||||
pandecode_prop("zero10 = 0x%" PRIx64, fb->zero10);
|
||||
pandecode_prop("zero11 = 0x%" PRIx64, fb->zero11);
|
||||
pandecode_prop("zero12 = 0x%" PRIx64, fb->zero12);
|
||||
}
|
||||
|
||||
bool nonzero_weights = false;
|
||||
|
||||
for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) {
|
||||
nonzero_weights |= fb->tiler_weights[w] != 0x0;
|
||||
}
|
||||
|
||||
if (nonzero_weights) {
|
||||
pandecode_log(".tiler_weights = {");
|
||||
|
||||
for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) {
|
||||
pandecode_log("%d, ", fb->tiler_weights[w]);
|
||||
}
|
||||
|
||||
pandecode_log("},");
|
||||
}
|
||||
|
||||
pandecode_indent--;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue