diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index b3fdd51b5a4..b98594f827d 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -508,7 +508,7 @@ panfrost_prepare_fs_state(struct panfrost_context *ctx, uint64_t *blend_shaders, struct pan_earlyzs_state earlyzs = pan_earlyzs_get( fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, ctx->blend->base.alpha_to_coverage, - ctx->depth_stencil->zs_always_passes); + ctx->depth_stencil->zs_always_passes, false); cfg.properties.pixel_kill_operation = earlyzs.kill; cfg.properties.zs_update_operation = earlyzs.update; diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 0091eba77e6..13a7af48cbe 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -1208,7 +1208,7 @@ csf_emit_draw_state(struct panfrost_batch *batch, struct pan_earlyzs_state earlyzs = pan_earlyzs_get( fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, ctx->blend->base.alpha_to_coverage, - ctx->depth_stencil->zs_always_passes); + ctx->depth_stencil->zs_always_passes, false); cfg.pixel_kill_operation = (enum mali_pixel_kill)earlyzs.kill; cfg.zs_update_operation = (enum mali_pixel_kill)earlyzs.update; diff --git a/src/gallium/drivers/panfrost/pan_jm.c b/src/gallium/drivers/panfrost/pan_jm.c index 37091699d7f..f9c05ee1b61 100644 --- a/src/gallium/drivers/panfrost/pan_jm.c +++ b/src/gallium/drivers/panfrost/pan_jm.c @@ -581,7 +581,7 @@ jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch, struct pan_earlyzs_state earlyzs = pan_earlyzs_get( fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, ctx->blend->base.alpha_to_coverage, - ctx->depth_stencil->zs_always_passes); + ctx->depth_stencil->zs_always_passes, false); cfg.flags_0.pixel_kill_operation = earlyzs.kill; cfg.flags_0.zs_update_operation = earlyzs.update; diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index 9f2bed31b3b..3c805e42c4b 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -355,6 +355,7 @@ panfrost_new_variant_locked(struct panfrost_context *ctx, struct panfrost_uncompiled_shader *uncompiled, struct panfrost_shader_key *key) { + struct panfrost_device *dev = pan_device(ctx->base.screen); struct panfrost_compiled_shader *prog = panfrost_alloc_variant(uncompiled); *prog = (struct panfrost_compiled_shader){ @@ -365,7 +366,7 @@ panfrost_new_variant_locked(struct panfrost_context *ctx, panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, uncompiled, &ctx->base.debug, prog, 0); - prog->earlyzs = pan_earlyzs_analyze(&prog->info); + prog->earlyzs = pan_earlyzs_analyze(&prog->info, dev->arch); return prog; } diff --git a/src/panfrost/lib/pan_earlyzs.c b/src/panfrost/lib/pan_earlyzs.c index eb941847320..dee65aaa076 100644 --- a/src/panfrost/lib/pan_earlyzs.c +++ b/src/panfrost/lib/pan_earlyzs.c @@ -30,9 +30,9 @@ * force early. */ static enum pan_earlyzs -best_early_mode(bool zs_always_passes) +best_early_mode(bool zs_always_passes, bool force_early) { - if (zs_always_passes) + if (zs_always_passes && !force_early) return PAN_EARLYZS_WEAK_EARLY; else return PAN_EARLYZS_FORCE_EARLY; @@ -45,7 +45,8 @@ best_early_mode(bool zs_always_passes) */ static struct pan_earlyzs_state analyze(const struct pan_shader_info *s, bool writes_zs_or_oq, - bool alpha_to_coverage, bool zs_always_passes) + bool alpha_to_coverage, bool zs_always_passes, + bool shader_reads_zs, bool can_optimize_shader_readonly_zs) { /* If the shader writes depth or stencil, all depth/stencil tests must * be deferred until the value is known after the ZS_EMIT instruction, @@ -62,6 +63,8 @@ analyze(const struct pan_shader_info *s, bool writes_zs_or_oq, bool shader_writes_zs = (s->fs.writes_depth || s->fs.writes_stencil); bool late_update = shader_writes_zs || alpha_to_coverage; bool late_kill = shader_writes_zs; + bool force_early_update = s->fs.early_fragment_tests; + bool force_early_kill = s->fs.early_fragment_tests; /* Late coverage updates are required if the coverage mask depends on * the results of the shader. Discards are implemented as coverage mask @@ -90,16 +93,37 @@ analyze(const struct pan_shader_info *s, bool writes_zs_or_oq, */ late_kill |= s->writes_global; + /* Shader reads require late depth/stencil tests to ensure the shader + * isn't killed before the side effects execute, unless the HW supports + * read-only ZS optimization, in which case it can be lowered to + * force-early. */ + bool optimize_shader_read_only_zs = false; + if (shader_reads_zs) { + if (!late_update && can_optimize_shader_readonly_zs) { + optimize_shader_read_only_zs = true; + force_early_update |= true; + } else { + late_update |= true; + } + + if (!late_kill && can_optimize_shader_readonly_zs) { + optimize_shader_read_only_zs = true; + force_early_kill |= true; + } + } + /* Finally, the shader may override and force early fragment tests */ late_update &= !s->fs.early_fragment_tests; late_kill &= !s->fs.early_fragment_tests; /* Collect results */ - enum pan_earlyzs early_mode = best_early_mode(zs_always_passes); - return (struct pan_earlyzs_state){ - .update = late_update ? PAN_EARLYZS_FORCE_LATE : early_mode, - .kill = late_kill ? PAN_EARLYZS_FORCE_LATE : early_mode, + .update = late_update + ? PAN_EARLYZS_FORCE_LATE + : best_early_mode(zs_always_passes, force_early_update), + .kill = late_kill ? PAN_EARLYZS_FORCE_LATE + : best_early_mode(zs_always_passes, force_early_kill), + .shader_readonly_zs = optimize_shader_read_only_zs, }; } @@ -108,14 +132,20 @@ analyze(const struct pan_shader_info *s, bool writes_zs_or_oq, * Returns a lookup table of configurations indexed by the API state. */ struct pan_earlyzs_lut -pan_earlyzs_analyze(const struct pan_shader_info *s) +pan_earlyzs_analyze(const struct pan_shader_info *s, unsigned arch) { + /* Shader read-only ZS optimization appeared in v10. */ + bool can_optimize_shader_readonly_zs = arch >= 10; struct pan_earlyzs_lut lut; for (unsigned v0 = 0; v0 < 2; ++v0) { for (unsigned v1 = 0; v1 < 2; ++v1) { - for (unsigned v2 = 0; v2 < 2; ++v2) - lut.states[v0][v1][v2] = analyze(s, v0, v1, v2); + for (unsigned v2 = 0; v2 < 2; ++v2) { + for (unsigned v3 = 0; v3 < 2; ++v3) { + lut.states[v0][v1][v2][v3] = + analyze(s, v0, v1, v2, v3, can_optimize_shader_readonly_zs); + } + } } } diff --git a/src/panfrost/lib/pan_earlyzs.h b/src/panfrost/lib/pan_earlyzs.h index fdc626b00aa..7002f46dd36 100644 --- a/src/panfrost/lib/pan_earlyzs.h +++ b/src/panfrost/lib/pan_earlyzs.h @@ -45,8 +45,11 @@ struct pan_earlyzs_state { /* Pixel kill */ enum pan_earlyzs kill : 2; + /* True if the shader read-only ZS optimization should be enabled */ + bool shader_readonly_zs : 1; + /* So it fits in a byte */ - unsigned padding : 4; + unsigned padding : 3; }; /* Internal lookup table. Users should treat as an opaque structure and only @@ -54,7 +57,7 @@ struct pan_earlyzs_state { * for definition of the arrays. */ struct pan_earlyzs_lut { - struct pan_earlyzs_state states[2][2][2]; + struct pan_earlyzs_state states[2][2][2][2]; }; /* @@ -63,14 +66,17 @@ struct pan_earlyzs_lut { */ static inline struct pan_earlyzs_state pan_earlyzs_get(struct pan_earlyzs_lut lut, bool writes_zs_or_oq, - bool alpha_to_coverage, bool zs_always_passes) + bool alpha_to_coverage, bool zs_always_passes, + bool shader_reads_zs) { - return lut.states[writes_zs_or_oq][alpha_to_coverage][zs_always_passes]; + return lut.states[writes_zs_or_oq][alpha_to_coverage][zs_always_passes] + [shader_reads_zs]; } struct pan_shader_info; -struct pan_earlyzs_lut pan_earlyzs_analyze(const struct pan_shader_info *s); +struct pan_earlyzs_lut pan_earlyzs_analyze(const struct pan_shader_info *s, + unsigned arch); #ifdef __cplusplus } /* extern C */ diff --git a/src/panfrost/lib/tests/test-earlyzs.cpp b/src/panfrost/lib/tests/test-earlyzs.cpp index 23efd301a29..33234960b52 100644 --- a/src/panfrost/lib/tests/test-earlyzs.cpp +++ b/src/panfrost/lib/tests/test-earlyzs.cpp @@ -34,15 +34,17 @@ * under test, only the external API. So we test only the composition. */ -#define ZS_WRITEMASK BITFIELD_BIT(0) -#define ALPHA2COV BITFIELD_BIT(1) -#define ZS_ALWAYS_PASSES BITFIELD_BIT(2) -#define DISCARD BITFIELD_BIT(3) -#define WRITES_Z BITFIELD_BIT(4) -#define WRITES_S BITFIELD_BIT(5) -#define WRITES_COV BITFIELD_BIT(6) -#define SIDEFX BITFIELD_BIT(7) -#define API_EARLY BITFIELD_BIT(8) +#define ZS_WRITEMASK BITFIELD_BIT(0) +#define ALPHA2COV BITFIELD_BIT(1) +#define ZS_ALWAYS_PASSES BITFIELD_BIT(2) +#define DISCARD BITFIELD_BIT(3) +#define WRITES_Z BITFIELD_BIT(4) +#define WRITES_S BITFIELD_BIT(5) +#define WRITES_COV BITFIELD_BIT(6) +#define SIDEFX BITFIELD_BIT(7) +#define API_EARLY BITFIELD_BIT(8) +#define SHADER_READS_ZS BITFIELD_BIT(9) +#define ARCH_HAS_READONLY_ZS_OPT BITFIELD_BIT(10) static void test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill, @@ -56,9 +58,10 @@ test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill, info.fs.early_fragment_tests = !!(flags & API_EARLY); info.writes_global = !!(flags & SIDEFX); - struct pan_earlyzs_state result = - pan_earlyzs_get(pan_earlyzs_analyze(&info), !!(flags & ZS_WRITEMASK), - !!(flags & ALPHA2COV), !!(flags & ZS_ALWAYS_PASSES)); + struct pan_earlyzs_state result = pan_earlyzs_get( + pan_earlyzs_analyze(&info, flags & ARCH_HAS_READONLY_ZS_OPT ? 10 : 9), + !!(flags & ZS_WRITEMASK), !!(flags & ALPHA2COV), + !!(flags & ZS_ALWAYS_PASSES), !!(flags & SHADER_READS_ZS)); ASSERT_EQ(result.update, expected_update); ASSERT_EQ(result.kill, expected_kill); @@ -134,6 +137,15 @@ TEST(EarlyZS, NoSideFXNoShaderZS) CASE(FORCE_EARLY, FORCE_EARLY, ZS_WRITEMASK); } +TEST(EarlyZS, ShaderReadOnlyZS) +{ + CASE(FORCE_LATE, FORCE_LATE, SIDEFX | SHADER_READS_ZS); + CASE(FORCE_EARLY, FORCE_LATE, SIDEFX | SHADER_READS_ZS | ARCH_HAS_READONLY_ZS_OPT); + CASE(FORCE_EARLY, FORCE_EARLY, SHADER_READS_ZS | ARCH_HAS_READONLY_ZS_OPT); + CASE(FORCE_LATE, WEAK_EARLY, SHADER_READS_ZS | ZS_ALWAYS_PASSES); + CASE(FORCE_LATE, FORCE_EARLY, SHADER_READS_ZS); +} + TEST(EarlyZS, NoSideFXNoShaderZSAlt) { CASE(WEAK_EARLY, WEAK_EARLY, ZS_ALWAYS_PASSES); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 49f66a2eab7..a720f17c4aa 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -1534,7 +1534,7 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf) struct pan_earlyzs_state earlyzs = pan_earlyzs_get(fs->fs.earlyzs_lut, writes_zs || oq, - alpha_to_coverage, zs_always_passes); + alpha_to_coverage, zs_always_passes, false); cfg.pixel_kill_operation = (enum mali_pixel_kill)earlyzs.kill; cfg.zs_update_operation = (enum mali_pixel_kill)earlyzs.update; diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c index 235b0cd7255..25bc5ecaa09 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c @@ -296,7 +296,7 @@ panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf, struct pan_earlyzs_state earlyzs = pan_earlyzs_get(fs->fs.earlyzs_lut, writes_zs || oq, - alpha_to_coverage, zs_always_passes); + alpha_to_coverage, zs_always_passes, false); cfg.properties.pixel_kill_operation = earlyzs.kill; cfg.properties.zs_update_operation = earlyzs.update; diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index da473764c8e..6bd08b69a4c 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -922,7 +922,7 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir, break; case MESA_SHADER_FRAGMENT: - shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info); + shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info, PAN_ARCH); break; default: @@ -1339,7 +1339,7 @@ panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob, break; case MESA_SHADER_FRAGMENT: - shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info); + shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info, PAN_ARCH); break; default: