pan/earlyzs: Support the shader ZS read-only case and its optimization on v10+

We are about to allow ZS tile buffer reads in panvk in order to support
VK_KHR_dynamic_rendering_local_read, and this requires dealing with
a new case in the early ZS logic.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32540>
This commit is contained in:
Boris Brezillon 2025-03-06 14:33:48 +01:00 committed by Marge Bot
parent d2cd5ca609
commit fe21da08ed
10 changed files with 84 additions and 35 deletions

View file

@ -508,7 +508,7 @@ panfrost_prepare_fs_state(struct panfrost_context *ctx, uint64_t *blend_shaders,
struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
ctx->blend->base.alpha_to_coverage,
ctx->depth_stencil->zs_always_passes);
ctx->depth_stencil->zs_always_passes, false);
cfg.properties.pixel_kill_operation = earlyzs.kill;
cfg.properties.zs_update_operation = earlyzs.update;

View file

@ -1208,7 +1208,7 @@ csf_emit_draw_state(struct panfrost_batch *batch,
struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
ctx->blend->base.alpha_to_coverage,
ctx->depth_stencil->zs_always_passes);
ctx->depth_stencil->zs_always_passes, false);
cfg.pixel_kill_operation = (enum mali_pixel_kill)earlyzs.kill;
cfg.zs_update_operation = (enum mali_pixel_kill)earlyzs.update;

View file

@ -581,7 +581,7 @@ jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch,
struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
ctx->blend->base.alpha_to_coverage,
ctx->depth_stencil->zs_always_passes);
ctx->depth_stencil->zs_always_passes, false);
cfg.flags_0.pixel_kill_operation = earlyzs.kill;
cfg.flags_0.zs_update_operation = earlyzs.update;

View file

@ -355,6 +355,7 @@ panfrost_new_variant_locked(struct panfrost_context *ctx,
struct panfrost_uncompiled_shader *uncompiled,
struct panfrost_shader_key *key)
{
struct panfrost_device *dev = pan_device(ctx->base.screen);
struct panfrost_compiled_shader *prog = panfrost_alloc_variant(uncompiled);
*prog = (struct panfrost_compiled_shader){
@ -365,7 +366,7 @@ panfrost_new_variant_locked(struct panfrost_context *ctx,
panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, uncompiled,
&ctx->base.debug, prog, 0);
prog->earlyzs = pan_earlyzs_analyze(&prog->info);
prog->earlyzs = pan_earlyzs_analyze(&prog->info, dev->arch);
return prog;
}

View file

@ -30,9 +30,9 @@
* force early.
*/
static enum pan_earlyzs
best_early_mode(bool zs_always_passes)
best_early_mode(bool zs_always_passes, bool force_early)
{
if (zs_always_passes)
if (zs_always_passes && !force_early)
return PAN_EARLYZS_WEAK_EARLY;
else
return PAN_EARLYZS_FORCE_EARLY;
@ -45,7 +45,8 @@ best_early_mode(bool zs_always_passes)
*/
static struct pan_earlyzs_state
analyze(const struct pan_shader_info *s, bool writes_zs_or_oq,
bool alpha_to_coverage, bool zs_always_passes)
bool alpha_to_coverage, bool zs_always_passes,
bool shader_reads_zs, bool can_optimize_shader_readonly_zs)
{
/* If the shader writes depth or stencil, all depth/stencil tests must
* be deferred until the value is known after the ZS_EMIT instruction,
@ -62,6 +63,8 @@ analyze(const struct pan_shader_info *s, bool writes_zs_or_oq,
bool shader_writes_zs = (s->fs.writes_depth || s->fs.writes_stencil);
bool late_update = shader_writes_zs || alpha_to_coverage;
bool late_kill = shader_writes_zs;
bool force_early_update = s->fs.early_fragment_tests;
bool force_early_kill = s->fs.early_fragment_tests;
/* Late coverage updates are required if the coverage mask depends on
* the results of the shader. Discards are implemented as coverage mask
@ -90,16 +93,37 @@ analyze(const struct pan_shader_info *s, bool writes_zs_or_oq,
*/
late_kill |= s->writes_global;
/* Shader ZS reads force a late depth/stencil update, unless the HW
* supports the shader read-only ZS optimization, in which case the
* update and the kill are each forced early when nothing else already
* requires them to be late. */
bool optimize_shader_read_only_zs = false;
if (shader_reads_zs) {
if (!late_update && can_optimize_shader_readonly_zs) {
optimize_shader_read_only_zs = true;
force_early_update |= true;
} else {
late_update |= true;
}
if (!late_kill && can_optimize_shader_readonly_zs) {
optimize_shader_read_only_zs = true;
force_early_kill |= true;
}
}
/* Finally, the shader may override and force early fragment tests */
late_update &= !s->fs.early_fragment_tests;
late_kill &= !s->fs.early_fragment_tests;
/* Collect results */
enum pan_earlyzs early_mode = best_early_mode(zs_always_passes);
return (struct pan_earlyzs_state){
.update = late_update ? PAN_EARLYZS_FORCE_LATE : early_mode,
.kill = late_kill ? PAN_EARLYZS_FORCE_LATE : early_mode,
.update = late_update
? PAN_EARLYZS_FORCE_LATE
: best_early_mode(zs_always_passes, force_early_update),
.kill = late_kill ? PAN_EARLYZS_FORCE_LATE
: best_early_mode(zs_always_passes, force_early_kill),
.shader_readonly_zs = optimize_shader_read_only_zs,
};
}
@ -108,14 +132,20 @@ analyze(const struct pan_shader_info *s, bool writes_zs_or_oq,
* Returns a lookup table of configurations indexed by the API state.
*/
struct pan_earlyzs_lut
pan_earlyzs_analyze(const struct pan_shader_info *s)
pan_earlyzs_analyze(const struct pan_shader_info *s, unsigned arch)
{
/* Shader read-only ZS optimization appeared in v10. */
bool can_optimize_shader_readonly_zs = arch >= 10;
struct pan_earlyzs_lut lut;
for (unsigned v0 = 0; v0 < 2; ++v0) {
for (unsigned v1 = 0; v1 < 2; ++v1) {
for (unsigned v2 = 0; v2 < 2; ++v2)
lut.states[v0][v1][v2] = analyze(s, v0, v1, v2);
for (unsigned v2 = 0; v2 < 2; ++v2) {
for (unsigned v3 = 0; v3 < 2; ++v3) {
lut.states[v0][v1][v2][v3] =
analyze(s, v0, v1, v2, v3, can_optimize_shader_readonly_zs);
}
}
}
}

View file

@ -45,8 +45,11 @@ struct pan_earlyzs_state {
/* Pixel kill */
enum pan_earlyzs kill : 2;
/* True if the shader read-only ZS optimization should be enabled */
bool shader_readonly_zs : 1;
/* So it fits in a byte */
unsigned padding : 4;
unsigned padding : 3;
};
/* Internal lookup table. Users should treat as an opaque structure and only
@ -54,7 +57,7 @@ struct pan_earlyzs_state {
* for definition of the arrays.
*/
struct pan_earlyzs_lut {
struct pan_earlyzs_state states[2][2][2];
struct pan_earlyzs_state states[2][2][2][2];
};
/*
@ -63,14 +66,17 @@ struct pan_earlyzs_lut {
*/
static inline struct pan_earlyzs_state
pan_earlyzs_get(struct pan_earlyzs_lut lut, bool writes_zs_or_oq,
bool alpha_to_coverage, bool zs_always_passes)
bool alpha_to_coverage, bool zs_always_passes,
bool shader_reads_zs)
{
return lut.states[writes_zs_or_oq][alpha_to_coverage][zs_always_passes];
return lut.states[writes_zs_or_oq][alpha_to_coverage][zs_always_passes]
[shader_reads_zs];
}
struct pan_shader_info;
struct pan_earlyzs_lut pan_earlyzs_analyze(const struct pan_shader_info *s);
struct pan_earlyzs_lut pan_earlyzs_analyze(const struct pan_shader_info *s,
unsigned arch);
#ifdef __cplusplus
} /* extern C */

View file

@ -34,15 +34,17 @@
* under test, only the external API. So we test only the composition.
*/
#define ZS_WRITEMASK BITFIELD_BIT(0)
#define ALPHA2COV BITFIELD_BIT(1)
#define ZS_ALWAYS_PASSES BITFIELD_BIT(2)
#define DISCARD BITFIELD_BIT(3)
#define WRITES_Z BITFIELD_BIT(4)
#define WRITES_S BITFIELD_BIT(5)
#define WRITES_COV BITFIELD_BIT(6)
#define SIDEFX BITFIELD_BIT(7)
#define API_EARLY BITFIELD_BIT(8)
#define ZS_WRITEMASK BITFIELD_BIT(0)
#define ALPHA2COV BITFIELD_BIT(1)
#define ZS_ALWAYS_PASSES BITFIELD_BIT(2)
#define DISCARD BITFIELD_BIT(3)
#define WRITES_Z BITFIELD_BIT(4)
#define WRITES_S BITFIELD_BIT(5)
#define WRITES_COV BITFIELD_BIT(6)
#define SIDEFX BITFIELD_BIT(7)
#define API_EARLY BITFIELD_BIT(8)
#define SHADER_READS_ZS BITFIELD_BIT(9)
#define ARCH_HAS_READONLY_ZS_OPT BITFIELD_BIT(10)
static void
test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill,
@ -56,9 +58,10 @@ test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill,
info.fs.early_fragment_tests = !!(flags & API_EARLY);
info.writes_global = !!(flags & SIDEFX);
struct pan_earlyzs_state result =
pan_earlyzs_get(pan_earlyzs_analyze(&info), !!(flags & ZS_WRITEMASK),
!!(flags & ALPHA2COV), !!(flags & ZS_ALWAYS_PASSES));
struct pan_earlyzs_state result = pan_earlyzs_get(
pan_earlyzs_analyze(&info, flags & ARCH_HAS_READONLY_ZS_OPT ? 10 : 9),
!!(flags & ZS_WRITEMASK), !!(flags & ALPHA2COV),
!!(flags & ZS_ALWAYS_PASSES), !!(flags & SHADER_READS_ZS));
ASSERT_EQ(result.update, expected_update);
ASSERT_EQ(result.kill, expected_kill);
@ -134,6 +137,15 @@ TEST(EarlyZS, NoSideFXNoShaderZS)
CASE(FORCE_EARLY, FORCE_EARLY, ZS_WRITEMASK);
}
TEST(EarlyZS, ShaderReadOnlyZS)
{
CASE(FORCE_LATE, FORCE_LATE, SIDEFX | SHADER_READS_ZS);
CASE(FORCE_EARLY, FORCE_LATE, SIDEFX | SHADER_READS_ZS | ARCH_HAS_READONLY_ZS_OPT);
CASE(FORCE_EARLY, FORCE_EARLY, SHADER_READS_ZS | ARCH_HAS_READONLY_ZS_OPT);
CASE(FORCE_LATE, WEAK_EARLY, SHADER_READS_ZS | ZS_ALWAYS_PASSES);
CASE(FORCE_LATE, FORCE_EARLY, SHADER_READS_ZS);
}
TEST(EarlyZS, NoSideFXNoShaderZSAlt)
{
CASE(WEAK_EARLY, WEAK_EARLY, ZS_ALWAYS_PASSES);

View file

@ -1534,7 +1534,7 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf)
struct pan_earlyzs_state earlyzs =
pan_earlyzs_get(fs->fs.earlyzs_lut, writes_zs || oq,
alpha_to_coverage, zs_always_passes);
alpha_to_coverage, zs_always_passes, false);
cfg.pixel_kill_operation = (enum mali_pixel_kill)earlyzs.kill;
cfg.zs_update_operation = (enum mali_pixel_kill)earlyzs.update;

View file

@ -296,7 +296,7 @@ panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf,
struct pan_earlyzs_state earlyzs =
pan_earlyzs_get(fs->fs.earlyzs_lut, writes_zs || oq,
alpha_to_coverage, zs_always_passes);
alpha_to_coverage, zs_always_passes, false);
cfg.properties.pixel_kill_operation = earlyzs.kill;
cfg.properties.zs_update_operation = earlyzs.update;

View file

@ -922,7 +922,7 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
break;
case MESA_SHADER_FRAGMENT:
shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info);
shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info, PAN_ARCH);
break;
default:
@ -1339,7 +1339,7 @@ panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
break;
case MESA_SHADER_FRAGMENT:
shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info);
shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info, PAN_ARCH);
break;
default: