From 85e211efd88d6c9890356f3dfc6e512588ead90e Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 11 Feb 2026 15:41:35 +0100 Subject: [PATCH 01/49] pan/genxml: Add missing enum values on v9-v13 Note block-linear interleaved clump orderings are not supported on all v10 architectures. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/genxml/v10.xml | 10 ++++++++++ src/panfrost/genxml/v12.xml | 6 ++++++ src/panfrost/genxml/v13.xml | 6 ++++++ src/panfrost/genxml/v9.xml | 2 ++ 4 files changed, 24 insertions(+) diff --git a/src/panfrost/genxml/v10.xml b/src/panfrost/genxml/v10.xml index 2fd4bb86637..95204c4a496 100644 --- a/src/panfrost/genxml/v10.xml +++ b/src/panfrost/genxml/v10.xml @@ -1,5 +1,6 @@ @@ -84,6 +85,7 @@ + @@ -132,6 +134,7 @@ + @@ -1163,6 +1166,13 @@ + + + + + + diff --git a/src/panfrost/genxml/v12.xml b/src/panfrost/genxml/v12.xml index 0d651f01b0d..e3716030601 100644 --- a/src/panfrost/genxml/v12.xml +++ b/src/panfrost/genxml/v12.xml @@ -1,5 +1,6 @@ @@ -84,6 +85,7 @@ + @@ -132,6 +134,7 @@ + @@ -1426,6 +1429,9 @@ + + + diff --git a/src/panfrost/genxml/v13.xml b/src/panfrost/genxml/v13.xml index c644d2bd49c..30285e4c351 100644 --- a/src/panfrost/genxml/v13.xml +++ b/src/panfrost/genxml/v13.xml @@ -1,5 +1,6 @@ @@ -84,6 +85,7 @@ + @@ -132,6 +134,7 @@ + @@ -1728,6 +1731,9 @@ + + + diff --git a/src/panfrost/genxml/v9.xml b/src/panfrost/genxml/v9.xml index d5bc4c1e110..3935d4dea99 100644 --- a/src/panfrost/genxml/v9.xml +++ b/src/panfrost/genxml/v9.xml @@ -1,5 +1,6 @@ @@ -103,6 +104,7 @@ + From 661ef96526fd4ac1892e2dd2f107d92041ea13a1 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 10 Apr 2026 12:10:31 +0200 Subject: [PATCH 02/49] pan/genxml: Add base v14 definition This is just a copy of v13.xml to help spot any missing changes while working on v14. 
Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/genxml/v14.xml | 2759 +++++++++++++++++++++++++++++++++++ 1 file changed, 2759 insertions(+) create mode 100644 src/panfrost/genxml/v14.xml diff --git a/src/panfrost/genxml/v14.xml b/src/panfrost/genxml/v14.xml new file mode 100644 index 00000000000..30285e4c351 --- /dev/null +++ b/src/panfrost/genxml/v14.xml @@ -0,0 +1,2759 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 96cec69ce88179fd6aeb0ae07609d230fb45eb6a Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 11 Feb 2026 15:44:53 +0100 Subject: [PATCH 03/49] pan/genxml: Add v14 definition Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/genxml/gen_macros.h | 3 + src/panfrost/genxml/meson.build | 2 +- src/panfrost/genxml/v14.xml | 344 +++++++++++++++---------------- 3 files changed, 173 insertions(+), 176 deletions(-) diff --git a/src/panfrost/genxml/gen_macros.h b/src/panfrost/genxml/gen_macros.h index b9e856f8533..c1e8ab1fbae 100644 --- a/src/panfrost/genxml/gen_macros.h +++ b/src/panfrost/genxml/gen_macros.h @@ -61,6 +61,9 @@ #elif (PAN_ARCH == 13) #define GENX(X) X##_v13 #include "genxml/v13_pack.h" +#elif (PAN_ARCH == 14) +#define GENX(X) X##_v14 +#include "genxml/v14_pack.h" #else #error "Need to add suffixing macro for this architecture" #endif diff --git a/src/panfrost/genxml/meson.build b/src/panfrost/genxml/meson.build index 3712b84822d..c60cc0c777d 100644 --- a/src/panfrost/genxml/meson.build +++ b/src/panfrost/genxml/meson.build @@ -3,7 +3,7 @@ # SPDX-License-Identifier: MIT pan_packers = [] -foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13'] +foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14'] pan_packers += custom_target( packer + '_pack.h', input : ['gen_pack.py', packer + '.xml'], diff --git a/src/panfrost/genxml/v14.xml b/src/panfrost/genxml/v14.xml index 30285e4c351..fe340dcfb16 100644 --- a/src/panfrost/genxml/v14.xml +++ b/src/panfrost/genxml/v14.xml @@ -4,7 +4,7 @@ SPDX-License-Identifier: MIT --> - + @@ -122,30 +122,18 @@ - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + @@ 
-161,10 +149,12 @@ + + @@ -326,15 +316,17 @@ - - + + + + @@ -342,8 +334,7 @@ - - + @@ -448,6 +439,11 @@ + + + + + @@ -467,6 +463,13 @@ + + + + + + + @@ -517,11 +520,11 @@ - + @@ -530,7 +533,6 @@ - @@ -546,8 +548,6 @@ - - @@ -557,6 +557,7 @@ + @@ -601,11 +602,6 @@ - - - - - @@ -802,6 +798,11 @@ + + + + + @@ -817,7 +818,6 @@ - @@ -876,22 +876,6 @@ - - - - - - - - - - - - - - - - @@ -910,7 +894,6 @@ - @@ -922,7 +905,6 @@ - @@ -931,16 +913,17 @@ - + - - + + + + - @@ -1056,7 +1039,6 @@ - @@ -1151,14 +1133,12 @@ - - @@ -1166,7 +1146,6 @@ - @@ -1468,13 +1447,36 @@ - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1779,18 +1781,11 @@ - - - - - - - - - + + + - @@ -1818,21 +1813,18 @@ - - - + + + - - - - - - - - - - - + + + + + + + + @@ -1851,33 +1843,15 @@ + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -1885,7 +1859,6 @@ - @@ -2251,7 +2224,14 @@ - + + + + + + + + @@ -2260,44 +2240,73 @@ + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2612,21 +2621,6 @@ - - - - -
-
- - - - - - - - - From 8c744c5dc0464b55a7680d48b20d51152243949f Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 11:16:05 +0200 Subject: [PATCH 04/49] pan/genxml: Implement RUN_FRAGMENT2 Add support for emitting and decoding RUN_FRAGMENT2 instructions. Some existing decoding logic from decode.c is modified to be reusable by the new RUN_FRAGMENT2 decoding logic. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/genxml/cs_builder.h | 69 +++++++++++++++ src/panfrost/genxml/decode.c | 111 +++++++++++++----------- src/panfrost/genxml/decode.h | 18 ++++ src/panfrost/genxml/decode_csf.c | 143 +++++++++++++++++++++++++++++++ 4 files changed, 293 insertions(+), 48 deletions(-) diff --git a/src/panfrost/genxml/cs_builder.h b/src/panfrost/genxml/cs_builder.h index a109f4d113b..ae0653a1f84 100644 --- a/src/panfrost/genxml/cs_builder.h +++ b/src/panfrost/genxml/cs_builder.h @@ -824,7 +824,11 @@ cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask) case MALI_CS_OPCODE_STORE_MULTIPLE: case MALI_CS_OPCODE_RUN_COMPUTE: case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: +#else case MALI_CS_OPCODE_RUN_FRAGMENT: +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: #if PAN_ARCH >= 12 case MALI_CS_OPCODE_RUN_IDVS2: @@ -1614,6 +1618,22 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable, } #endif +#if PAN_ARCH >= 14 +static inline void +cs_run_fragment2(struct cs_builder *b, bool enable_tem, + enum mali_tile_render_order tile_order) +{ + /* Staging regs */ + cs_flush_loads(b); + + b->req_resource_mask |= CS_FRAG_RES; + + cs_emit(b, RUN_FRAGMENT2, I) { + I.enable_tem = enable_tem; + I.tile_order = tile_order; + } +} +#else static inline void cs_run_fragment(struct cs_builder *b, bool enable_tem, enum mali_tile_render_order tile_order) @@ -1628,6 +1648,7 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem, I.tile_order = tile_order; } } +#endif static inline void 
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override, @@ -2469,6 +2490,53 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx, (int16_t)(offsetof(struct cs_##__type##_trace, __field) - \ sizeof(struct cs_##__type##_trace)) +#if PAN_ARCH >= 14 +#define CS_RUN_FRAGMENT2_SR_COUNT 56 +#define CS_RUN_FRAGMENT2_SR_MASK BITFIELD64_RANGE(0, CS_RUN_FRAGMENT2_SR_COUNT) +struct cs_run_fragment2_trace { + uint64_t ip; + uint32_t sr[CS_RUN_FRAGMENT2_SR_COUNT]; +} __attribute__((aligned(64))); + +static inline void +cs_trace_run_fragment2(struct cs_builder *b, const struct cs_tracing_ctx *ctx, + struct cs_index scratch_regs, bool enable_tem, + enum mali_tile_render_order tile_order) +{ + if (likely(!ctx->enabled)) { + cs_run_fragment2(b, enable_tem, tile_order); + return; + } + + struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); + struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); + + cs_trace_preamble(b, ctx, scratch_regs, + sizeof(struct cs_run_fragment2_trace)); + + /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP + * won't point to the right instruction. 
*/ + cs_load_ip_to(b, data); + cs_run_fragment2(b, enable_tem, tile_order); + cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment2, ip)); + + ASSERTED unsigned sr_count = 0; + unsigned sr_offset = cs_trace_field_offset(run_fragment2, sr); + for (unsigned i = 0; i < CS_RUN_FRAGMENT2_SR_COUNT; i += 16) { + unsigned mask = (CS_RUN_FRAGMENT2_SR_MASK >> i) & BITFIELD_MASK(16); + if (!mask) + continue; + + cs_store(b, cs_reg_tuple(b, i, util_last_bit(mask)), tracebuf_addr, mask, + sr_offset); + sr_offset += util_bitcount(mask) * sizeof(uint32_t); + sr_count += util_bitcount(mask); + } + assert(sr_count == CS_RUN_FRAGMENT2_SR_COUNT); + + cs_flush_stores(b); +} +#else struct cs_run_fragment_trace { uint64_t ip; uint32_t sr[7]; @@ -2500,6 +2568,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx, cs_trace_field_offset(run_fragment, sr)); cs_flush_stores(b); } +#endif #if PAN_ARCH >= 13 #define CS_RUN_FULLSCREEN_SR_MASK \ diff --git a/src/panfrost/genxml/decode.c b/src/panfrost/genxml/decode.c index 38a2e696e4d..bda0431d33e 100644 --- a/src/panfrost/genxml/decode.c +++ b/src/panfrost/genxml/decode.c @@ -152,22 +152,22 @@ pandecode_rt(struct pandecode_context *ctx, unsigned index, uint64_t gpu_va) } -static void -pandecode_rts(struct pandecode_context *ctx, uint64_t gpu_va, - const struct MALI_FRAMEBUFFER_PARAMETERS *fb) +void +GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va, + uint32_t render_target_count) { pandecode_log(ctx, "Color Render Targets @%" PRIx64 ":\n", gpu_va); ctx->indent++; - for (int i = 0; i < (fb->render_target_count); i++) + for (int i = 0; i < render_target_count; i++) pandecode_rt(ctx, i, gpu_va); ctx->indent--; pandecode_log(ctx, "\n"); } -static void -pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va) +void +GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va) { const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR( ctx, zs_crc_packed, 
(uint64_t)gpu_va); @@ -223,22 +223,65 @@ pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va) #if PAN_ARCH >= 6 -static void -pandecode_sample_locations(struct pandecode_context *ctx, const void *fb) +void +GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx, + uint64_t dcd_pointer, unsigned pre_frame_0, + unsigned pre_frame_1, unsigned post_frame, + unsigned job_type_param, uint64_t gpu_id) { - pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params); + const unsigned dcd_size = pan_size(DRAW); - const uint16_t *PANDECODE_PTR_VAR(ctx, samples, params.sample_locations); + if (pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const struct mali_draw_packed *PANDECODE_PTR_VAR( + ctx, dcd, dcd_pointer + (0 * dcd_size)); + pan_unpack(dcd, DRAW, draw) + ; + pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", dcd_pointer, + pre_frame_0); + ctx->indent++; + GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); + ctx->indent--; + } - pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", - params.sample_locations); + if (pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const struct mali_draw_packed *PANDECODE_PTR_VAR( + ctx, dcd, dcd_pointer + (1 * dcd_size)); + pan_unpack(dcd, DRAW, draw) + ; + pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n", + dcd_pointer + (1 * dcd_size)); + ctx->indent++; + GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); + ctx->indent--; + } + + if (post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const struct mali_draw_packed *PANDECODE_PTR_VAR( + ctx, dcd, dcd_pointer + (2 * dcd_size)); + pan_unpack(dcd, DRAW, draw) + ; + pandecode_log(ctx, "Post frame:\n"); + ctx->indent++; + GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); + ctx->indent--; + } +} + +void +GENX(pandecode_sample_locations)(struct pandecode_context *ctx, + uint64_t sample_locations) +{ + const uint16_t *PANDECODE_PTR_VAR(ctx, samples, sample_locations); + + pandecode_log(ctx, "Sample locations @%" PRIx64 
":\n", sample_locations); for (int i = 0; i < 33; i++) { pandecode_log(ctx, " (%d, %d),\n", samples[2 * i] - 128, samples[2 * i + 1] - 128); } } -#endif +#endif /* PAN_ARCH >= 6 */ +#if PAN_ARCH < 14 struct pandecode_fbd GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, bool is_fragment, uint64_t gpu_id) @@ -248,46 +291,17 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, DUMP_UNPACKED(ctx, FRAMEBUFFER_PARAMETERS, params, "Parameters:\n"); #if PAN_ARCH >= 6 - pandecode_sample_locations(ctx, fb); + GENX(pandecode_sample_locations)(ctx, params.sample_locations); - unsigned dcd_size = pan_size(DRAW); unsigned job_type_param = 0; #if PAN_ARCH <= 9 job_type_param = MALI_JOB_TYPE_FRAGMENT; #endif - if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const struct mali_draw_packed *PANDECODE_PTR_VAR( - ctx, dcd, params.frame_shader_dcds + (0 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", - params.frame_shader_dcds, params.pre_frame_0); - ctx->indent++; - GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); - ctx->indent--; - } - - if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const struct mali_draw_packed *PANDECODE_PTR_VAR( - ctx, dcd, params.frame_shader_dcds + (1 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n", - params.frame_shader_dcds + (1 * dcd_size)); - ctx->indent++; - GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); - ctx->indent--; - } - - if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const struct mali_draw_packed *PANDECODE_PTR_VAR( - ctx, dcd, params.frame_shader_dcds + (2 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log(ctx, "Post frame:\n"); - ctx->indent++; - GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); - ctx->indent--; - } + GENX(pandecode_frame_shader_dcds) + (ctx, params.frame_shader_dcds, params.pre_frame_0, 
params.pre_frame_1, + params.post_frame, job_type_param, gpu_id); #else DUMP_SECTION(ctx, FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); @@ -312,13 +326,13 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, gpu_va += pan_size(FRAMEBUFFER); if (params.has_zs_crc_extension) { - pandecode_zs_crc_ext(ctx, gpu_va); + GENX(pandecode_zs_crc_ext)(ctx, gpu_va); gpu_va += pan_size(ZS_CRC_EXTENSION); } if (is_fragment) - pandecode_rts(ctx, gpu_va, ¶ms); + GENX(pandecode_rts)(ctx, gpu_va, params.render_target_count); return (struct pandecode_fbd){ .rt_count = params.render_target_count, @@ -336,6 +350,7 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, }; #endif } +#endif /* PAN_ARCH < 14 */ #if PAN_ARCH >= 5 uint64_t diff --git a/src/panfrost/genxml/decode.h b/src/panfrost/genxml/decode.h index f7d83ca5525..bc9f743f9b7 100644 --- a/src/panfrost/genxml/decode.h +++ b/src/panfrost/genxml/decode.h @@ -275,4 +275,22 @@ void GENX(pandecode_depth_stencil)(struct pandecode_context *ctx, #endif +#if PAN_ARCH >= 6 +void GENX(pandecode_sample_locations)(struct pandecode_context *ctx, + uint64_t sample_locations); + +void + GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx, + uint64_t dcd_pointer, unsigned pre_frame_0, + unsigned pre_frame_1, unsigned post_frame, + unsigned job_type_param, uint64_t gpu_id); +#endif + +#if PAN_ARCH >= 5 +void GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va, + uint32_t render_target_count); + +void GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va); +#endif + #endif /* __MMAP_TRACE_H__ */ diff --git a/src/panfrost/genxml/decode_csf.c b/src/panfrost/genxml/decode_csf.c index ca3b4807950..efb8be00544 100644 --- a/src/panfrost/genxml/decode_csf.c +++ b/src/panfrost/genxml/decode_csf.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2022-2023 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
* SPDX-License-Identifier: MIT */ @@ -343,6 +344,23 @@ print_cs_instr(FILE *fp, const uint64_t *instr) } #endif +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: { + static const char *tile_order[] = { + "zorder", "horizontal", "vertical", "unknown", + "unknown", "rev_horizontal", "rev_vertical", "unknown", + "unknown", "unknown", "unknown", "unknown", + "unknown", "unknown", "unknown", "unknown", + }; + + cs_unpack(instr, CS_RUN_FRAGMENT2, I); + + fprintf(fp, "RUN_FRAGMENT2%s.tile_order=%s", + I.enable_tem ? ".tile_enable_map_enable" : "", + tile_order[I.tile_order]); + break; + } +#else case MALI_CS_OPCODE_RUN_FRAGMENT: { static const char *tile_order[] = { "zorder", "horizontal", "vertical", "unknown", @@ -350,6 +368,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", }; + cs_unpack(instr, CS_RUN_FRAGMENT, I); fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s", @@ -358,6 +377,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) tile_order[I.tile_order]); break; } +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: { cs_unpack(instr, CS_RUN_FULLSCREEN, I); @@ -1097,6 +1117,101 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp, } #endif +#if PAN_ARCH >= 14 +static void +pandecode_run_fragment2(struct pandecode_context *ctx, FILE *fp, + struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT2 *I) +{ + if (qctx->in_exception_handler) + return; + + ctx->indent++; + + pandecode_log(ctx, "Iter trace ID0: %" PRIu32 "\n", + cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID0)); + pandecode_log(ctx, "Iter trace ID1: %" PRIu32 "\n", + cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID1)); + pandecode_log(ctx, "TEM pointer: %" PRIx64 "\n", + cs_get_u64(qctx, MALI_FRAGMENT_SR_TEM_POINTER)); + pandecode_log(ctx, "TEM row stride: %" PRIu32 "\n", + cs_get_u32(qctx, MALI_FRAGMENT_SR_TEM_ROW_STRIDE)); + + for (unsigned i = 0; i < 11; ++i) { + const unsigned reg = 
MALI_FRAGMENT_SR_IRD_BUFFER_POINTER_0 + (i * 2); + pandecode_log(ctx, "IRD buffer pointer %u: %" PRIx64 "\n", i, + cs_get_u64(qctx, reg)); + } + + DUMP_CL(ctx, FRAGMENT_FLAGS_3, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_3], + "Flags 3:\n"); + DUMP_CL(ctx, FRAGMENT_BOUNDING_BOX, + &qctx->regs[MALI_FRAGMENT_SR_BOUNDING_BOX], "Bounding Box:\n"); + DUMP_CL(ctx, FRAME_SIZE, &qctx->regs[MALI_FRAGMENT_SR_FRAME_SIZE], + "Frame size:\n"); + + pan_unpack((const struct mali_fragment_flags_0_packed *)&qctx + ->regs[MALI_FRAGMENT_SR_FLAGS_0], + FRAGMENT_FLAGS_0, flags0_unpacked) + ; + DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_0, flags0_unpacked, "Flags 0:\n"); + + pan_unpack((const struct mali_fragment_flags_1_packed *)&qctx + ->regs[MALI_FRAGMENT_SR_FLAGS_1], + FRAGMENT_FLAGS_1, flags1_unpacked) + ; + DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_1, flags1_unpacked, "Flags 1:\n"); + + DUMP_CL(ctx, FRAGMENT_FLAGS_2, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_2], + "Flags 2:\n"); + pandecode_log(ctx, "Z clear: %f\n", + uif(cs_get_u32(qctx, MALI_FRAGMENT_SR_Z_CLEAR))); + + const uint64_t tiler_pointer = + cs_get_u64(qctx, MALI_FRAGMENT_SR_TILER_DESCRIPTOR_POINTER); + pandecode_log(ctx, "Tiler descriptor pointer: 0x%" PRIx64 "\n", + tiler_pointer); + + const uint64_t rtd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_RTD_POINTER); + pandecode_log(ctx, "RTD pointer: 0x%" PRIx64 "\n", rtd_pointer); + + const uint64_t dbd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_DBD_POINTER); + pandecode_log(ctx, "DBD pointer: 0x%" PRIx64 "\n", dbd_pointer); + + pandecode_log(ctx, "Frame argument: %" PRIx64 "\n", + cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_ARG)); + + const uint64_t sample_locations = + cs_get_u64(qctx, MALI_FRAGMENT_SR_SAMPLE_POSITION_ARRAY_POINTER); + pandecode_log(ctx, "Sample locations: 0x%" PRIx64 "\n", sample_locations); + + const uint64_t dcd_pointer = + cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_SHADER_DCD_POINTER); + pandecode_log(ctx, "Frame shader DCD pointer: 0x%" PRIx64 "\n", dcd_pointer); + + DUMP_CL(ctx, 
VRS_IMAGE, &qctx->regs[MALI_FRAGMENT_SR_VRS_IMAGE], + "VRS image:\n"); + + GENX(pandecode_sample_locations) + (ctx, sample_locations); + + const unsigned job_type_param = 0; + GENX(pandecode_frame_shader_dcds) + (ctx, dcd_pointer, flags0_unpacked.pre_frame_0, flags0_unpacked.pre_frame_1, + flags0_unpacked.post_frame, job_type_param, qctx->gpu_id); + + if (tiler_pointer) + GENX(pandecode_tiler)(ctx, tiler_pointer); + + if (dbd_pointer) + GENX(pandecode_zs_crc_ext)(ctx, dbd_pointer); + + if (rtd_pointer) + GENX(pandecode_rts) + (ctx, rtd_pointer, flags1_unpacked.render_target_count); + + ctx->indent--; +} +#else static void pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp, struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I) @@ -1115,6 +1230,7 @@ pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp, ctx->indent--; } +#endif /* PAN_ARCH >= 14 */ static void pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp, @@ -1261,11 +1377,19 @@ interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx) } #endif +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: { + cs_unpack(bytes, CS_RUN_FRAGMENT2, I); + pandecode_run_fragment2(ctx, fp, qctx, &I); + break; + } +#else case MALI_CS_OPCODE_RUN_FRAGMENT: { cs_unpack(bytes, CS_RUN_FRAGMENT, I); pandecode_run_fragment(ctx, fp, qctx, &I); break; } +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: { cs_unpack(bytes, CS_RUN_FULLSCREEN, I); @@ -2430,7 +2554,12 @@ print_cs_binary(struct pandecode_context *ctx, uint64_t bin, #else case MALI_CS_OPCODE_RUN_IDVS: #endif + +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: +#else case MALI_CS_OPCODE_RUN_FRAGMENT: +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: case MALI_CS_OPCODE_RUN_COMPUTE: case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: @@ -2539,6 +2668,19 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace, } #endif +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: { + struct cs_run_fragment2_trace *frag_trace = 
trace_data; + + assert(trace_size >= sizeof(*frag_trace)); + cs_unpack(instr, CS_RUN_FRAGMENT2, I); + memcpy(®s[0], frag_trace->sr, sizeof(frag_trace->sr)); + pandecode_run_fragment2(ctx, ctx->dump_stream, &qctx, &I); + trace_data = frag_trace + 1; + trace_size -= sizeof(*frag_trace); + break; + } +#else case MALI_CS_OPCODE_RUN_FRAGMENT: { struct cs_run_fragment_trace *frag_trace = trace_data; @@ -2550,6 +2692,7 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace, trace_size -= sizeof(*frag_trace); break; } +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: { struct cs_run_fullscreen_trace *fs_trace = trace_data; From cb6e788548df661dbfa26ae51f053ef3bddbe61d Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 22 Apr 2026 10:24:36 +0200 Subject: [PATCH 05/49] pan/decode: Remove progress-related decoding logic Progress is no longer encoded by the CS builder. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/genxml/decode_csf.c | 97 ++++++++------------------------ 1 file changed, 22 insertions(+), 75 deletions(-) diff --git a/src/panfrost/genxml/decode_csf.c b/src/panfrost/genxml/decode_csf.c index efb8be00544..10f062cebda 100644 --- a/src/panfrost/genxml/decode_csf.c +++ b/src/panfrost/genxml/decode_csf.c @@ -118,8 +118,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) case MALI_CS_OPCODE_WAIT: { cs_unpack(instr, CS_WAIT, I); - fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "", - I.wait_mask); + fprintf(fp, "WAIT #%x", I.wait_mask); break; } @@ -131,15 +130,13 @@ print_cs_instr(FILE *fp, const uint64_t *instr) * since we'll print them implicitly later. */ #if PAN_ARCH >= 12 - fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u", - I.progress_increment ? 
".progress_inc" : "", axes[I.task_axis], - I.srt_select, I.spd_select, I.tsd_select, I.fau_select, - I.task_increment, I.ep_limit); + fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u", + axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select, + I.fau_select, I.task_increment, I.ep_limit); #else - fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u", - I.progress_increment ? ".progress_inc" : "", axes[I.task_axis], - I.srt_select, I.spd_select, I.tsd_select, I.fau_select, - I.task_increment); + fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u", + axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select, + I.fau_select, I.task_increment); #endif break; } @@ -147,8 +144,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) #if PAN_ARCH == 10 case MALI_CS_OPCODE_RUN_TILING: { cs_unpack(instr, CS_RUN_TILING, I); - fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d", - I.progress_increment ? ".progress_inc" : "", I.srt_select, + fprintf(fp, "RUN_TILING.srt%d.spd%d.tsd%d.fau%d", I.srt_select, I.spd_select, I.tsd_select, I.fau_select); break; } @@ -159,8 +155,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) cs_unpack(instr, CS_RUN_IDVS, I); fprintf( fp, - "RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64, - I.progress_increment ? ".progress_inc" : "", + "RUN_IDVS%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64, I.malloc_enable ? "" : ".no_malloc", I.draw_id_register_enable ? ".draw_id_enable" : "", I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select, @@ -179,8 +174,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) ".INVALID", }; - fprintf(fp, "RUN_IDVS2%s%s%s%s r%u, #%" PRIx64, - I.progress_increment ? ".progress_inc" : "", + fprintf(fp, "RUN_IDVS2%s%s%s r%u, #%" PRIx64, I.malloc_enable ? "" : ".no_malloc", I.draw_id_register_enable ? 
".draw_id_enable" : "", vertex_shading_str[I.vertex_shading_mode], I.draw_id, @@ -319,27 +313,15 @@ print_cs_instr(FILE *fp, const uint64_t *instr) case MALI_CS_OPCODE_SHARED_SB_INC: { cs_unpack(instr, CS_SHARED_SB_INC, I); - const char *progress_increment_name[] = { - ".no_increment", - ".increment", - }; - - fprintf(fp, "SHARED_SB_INC%s%s #%u, #%u", - progress_increment_name[I.progress_increment], - defer_mode_str(I), I.sb_mask, I.shared_entry); + fprintf(fp, "SHARED_SB_INC%s #%u, #%u", defer_mode_str(I), I.sb_mask, + I.shared_entry); break; } case MALI_CS_OPCODE_SHARED_SB_DEC: { cs_unpack(instr, CS_SHARED_SB_DEC, I); - const char *progress_increment_name[] = { - ".no_increment", - ".increment", - }; - - fprintf(fp, "SHARED_SB_DEC%s #%u", - progress_increment_name[I.progress_increment], I.shared_entry); + fprintf(fp, "SHARED_SB_DEC #%u", I.shared_entry); break; } #endif @@ -371,8 +353,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) cs_unpack(instr, CS_RUN_FRAGMENT, I); - fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s", - I.progress_increment ? ".progress_inc" : "", + fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s", I.enable_tem ? ".tile_enable_map_enable" : "", tile_order[I.tile_order]); break; @@ -381,16 +362,13 @@ print_cs_instr(FILE *fp, const uint64_t *instr) case MALI_CS_OPCODE_RUN_FULLSCREEN: { cs_unpack(instr, CS_RUN_FULLSCREEN, I); - fprintf(fp, "RUN_FULLSCREEN%s r%u, #%" PRIx64, - I.progress_increment ? ".progress_inc" : "", I.dcd, - I.flags_override); + fprintf(fp, "RUN_FULLSCREEN r%u, #%" PRIx64, I.dcd, I.flags_override); break; } case MALI_CS_OPCODE_FINISH_TILING: { cs_unpack(instr, CS_FINISH_TILING, I); - fprintf(fp, "FINISH_TILING%s", - I.progress_increment ? 
".progress_inc" : ""); + fprintf(fp, "FINISH_TILING"); break; } @@ -463,12 +441,6 @@ print_cs_instr(FILE *fp, const uint64_t *instr) break; } - case MALI_CS_OPCODE_PROGRESS_WAIT: { - cs_unpack(instr, CS_PROGRESS_WAIT, I); - fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue); - break; - } - case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: { cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I); fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length); @@ -567,29 +539,16 @@ print_cs_instr(FILE *fp, const uint64_t *instr) break; } - case MALI_CS_OPCODE_PROGRESS_STORE: { - cs_unpack(instr, CS_PROGRESS_STORE, I); - fprintf(fp, "PROGRESS_STORE d%u", I.source); - break; - } - - case MALI_CS_OPCODE_PROGRESS_LOAD: { - cs_unpack(instr, CS_PROGRESS_LOAD, I); - fprintf(fp, "PROGRESS_LOAD d%u", I.destination); - break; - } - case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: { cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I); #if PAN_ARCH >= 12 - fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u", - I.progress_increment ? ".progress_inc" : "", I.srt_select, - I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task, - I.ep_limit); + fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u, #%u", + I.srt_select, I.spd_select, I.tsd_select, I.fau_select, + I.workgroups_per_task, I.ep_limit); #else - fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u", - I.progress_increment ? 
".progress_inc" : "", I.srt_select, - I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task); + fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u", + I.srt_select, I.spd_select, I.tsd_select, I.fau_select, + I.workgroups_per_task); #endif break; @@ -2316,18 +2275,6 @@ collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg, break; } - case MALI_CS_OPCODE_PROGRESS_LOAD: { - cs_unpack(instr, CS_PROGRESS_LOAD, I); - for (unsigned i = 0; i < 16; i++) { - if (BITSET_TEST(track_map, I.destination) || - BITSET_TEST(track_map, I.destination + 1)) { - ibranch->has_unknown_targets = true; - return; - } - } - break; - } - default: break; } From 3687fc515b2f9555bc994cbe3891a56cab0b0f77 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 11:01:07 +0200 Subject: [PATCH 06/49] pan/genxml: Build libpanfrost_decode for v14 Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/genxml/decode.h | 7 +++++++ src/panfrost/genxml/decode_common.c | 9 +++++++++ src/panfrost/genxml/meson.build | 2 +- src/panfrost/lib/pan_format.h | 3 +++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/panfrost/genxml/decode.h b/src/panfrost/genxml/decode.h index bc9f743f9b7..47fe28f798f 100644 --- a/src/panfrost/genxml/decode.h +++ b/src/panfrost/genxml/decode.h @@ -132,6 +132,13 @@ void pandecode_cs_binary_v13(struct pandecode_context *ctx, uint64_t bin, void pandecode_cs_trace_v13(struct pandecode_context *ctx, uint64_t trace, uint32_t trace_size, uint64_t gpu_id); +void pandecode_interpret_cs_v14(struct pandecode_context *ctx, uint64_t queue, + uint32_t size, uint64_t gpu_id, uint32_t *regs); +void pandecode_cs_binary_v14(struct pandecode_context *ctx, uint64_t bin, + uint32_t bin_size); +void pandecode_cs_trace_v14(struct pandecode_context *ctx, uint64_t trace, + uint32_t trace_size, uint64_t gpu_id); + /* Logging infrastructure */ static void pandecode_make_indent(struct pandecode_context *ctx) diff --git 
a/src/panfrost/genxml/decode_common.c b/src/panfrost/genxml/decode_common.c index 208d28a8cb5..399fec9f335 100644 --- a/src/panfrost/genxml/decode_common.c +++ b/src/panfrost/genxml/decode_common.c @@ -423,6 +423,9 @@ pandecode_interpret_cs(struct pandecode_context *ctx, uint64_t queue_gpu_va, case 13: pandecode_interpret_cs_v13(ctx, queue_gpu_va, size, gpu_id, regs); break; + case 14: + pandecode_interpret_cs_v14(ctx, queue_gpu_va, size, gpu_id, regs); + break; default: UNREACHABLE("Unsupported architecture"); } @@ -446,6 +449,9 @@ pandecode_cs_binary(struct pandecode_context *ctx, uint64_t bin_gpu_va, case 13: pandecode_cs_binary_v13(ctx, bin_gpu_va, size); break; + case 14: + pandecode_cs_binary_v14(ctx, bin_gpu_va, size); + break; default: UNREACHABLE("Unsupported architecture"); } @@ -469,6 +475,9 @@ pandecode_cs_trace(struct pandecode_context *ctx, uint64_t trace_gpu_va, case 13: pandecode_cs_trace_v13(ctx, trace_gpu_va, size, gpu_id); break; + case 14: + pandecode_cs_trace_v14(ctx, trace_gpu_va, size, gpu_id); + break; default: UNREACHABLE("Unsupported architecture"); } diff --git a/src/panfrost/genxml/meson.build b/src/panfrost/genxml/meson.build index c60cc0c777d..ee4b4adea3f 100644 --- a/src/panfrost/genxml/meson.build +++ b/src/panfrost/genxml/meson.build @@ -20,7 +20,7 @@ idep_pan_packers = declare_dependency( libpanfrost_decode_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_decode_per_arch += static_library( 'pandecode-arch-v' + ver, ['decode.c', 'decode_jm.c', 'decode_csf.c', pan_packers], diff --git a/src/panfrost/lib/pan_format.h b/src/panfrost/lib/pan_format.h index 7c641c24105..08e26f79000 100644 --- a/src/panfrost/lib/pan_format.h +++ b/src/panfrost/lib/pan_format.h @@ -168,6 +168,8 @@ extern const struct pan_blendable_format pan_blendable_formats_v12[PIPE_FORMAT_COUNT]; extern const struct pan_blendable_format 
pan_blendable_formats_v13[PIPE_FORMAT_COUNT]; +extern const struct pan_blendable_format + pan_blendable_formats_v14[PIPE_FORMAT_COUNT]; uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats); @@ -199,6 +201,7 @@ extern const struct pan_format pan_pipe_format_v9[PIPE_FORMAT_COUNT]; extern const struct pan_format pan_pipe_format_v10[PIPE_FORMAT_COUNT]; extern const struct pan_format pan_pipe_format_v12[PIPE_FORMAT_COUNT]; extern const struct pan_format pan_pipe_format_v13[PIPE_FORMAT_COUNT]; +extern const struct pan_format pan_pipe_format_v14[PIPE_FORMAT_COUNT]; static inline const struct pan_format * pan_format_table(unsigned arch) From 6c89a14e1b998f39775ddc61eb077b9cf0f58beb Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 11 Feb 2026 15:48:47 +0100 Subject: [PATCH 07/49] pan/clc: Build for v14 Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/clc/pan_compile.c | 2 +- src/panfrost/libpan/libpan.h | 2 ++ src/panfrost/libpan/libpan_shaders.h | 2 ++ src/panfrost/libpan/meson.build | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/panfrost/clc/pan_compile.c b/src/panfrost/clc/pan_compile.c index b2e25e7c53b..3a34897c21b 100644 --- a/src/panfrost/clc/pan_compile.c +++ b/src/panfrost/clc/pan_compile.c @@ -275,7 +275,7 @@ main(int argc, const char **argv) unsigned target_arch = atoi(target_arch_str); - if (target_arch < 4 || target_arch > 13) { + if (target_arch < 4 || target_arch > 14) { fprintf(stderr, "Unsupported target arch %d\n", target_arch); return 1; } diff --git a/src/panfrost/libpan/libpan.h b/src/panfrost/libpan/libpan.h index ed7c5c66f29..cc79ea92b74 100644 --- a/src/panfrost/libpan/libpan.h +++ b/src/panfrost/libpan/libpan.h @@ -28,6 +28,8 @@ #include "libpan_v12.h" #elif (PAN_ARCH == 13) #include "libpan_v13.h" +#elif (PAN_ARCH == 14) +#include "libpan_v14.h" #else #error "Unsupported architecture for libpan" #endif diff --git a/src/panfrost/libpan/libpan_shaders.h b/src/panfrost/libpan/libpan_shaders.h 
index 5154cef68d7..d51761abf64 100644 --- a/src/panfrost/libpan/libpan_shaders.h +++ b/src/panfrost/libpan/libpan_shaders.h @@ -26,6 +26,8 @@ #include "libpan_shaders_v12.h" #elif (PAN_ARCH == 13) #include "libpan_shaders_v13.h" +#elif (PAN_ARCH == 14) +#include "libpan_shaders_v14.h" #else #error "Unsupported architecture for libpan" #endif diff --git a/src/panfrost/libpan/meson.build b/src/panfrost/libpan/meson.build index 734660b5735..dfe40fff9c1 100644 --- a/src/panfrost/libpan/meson.build +++ b/src/panfrost/libpan/meson.build @@ -11,7 +11,7 @@ libpan_shader_files = files( idep_libpan_per_arch = {} -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpan_spv = custom_target( input : libpan_shader_files, output : 'libpan_v' + ver + '.spv', From 1527d88bc15b9838831e6e2b204349b9f7f859b1 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 14:33:42 +0200 Subject: [PATCH 08/49] pan/fb: Implement pan_emit_fb_desc for RUN_FRAGMENT2 Add a new structure that is used to store per-layer RUN_FRAGMENT2 state. Any other state will be emitted directly to registers. Also, modify pan_emit_fb_desc's signature to take a pan_ptr to the framebuffer memory instead of the CPU-mapped pointer. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/lib/pan_fb.c | 121 ++++++++++++++++++- src/panfrost/lib/pan_fb.h | 41 ++++++- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 7 +- src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c | 2 +- 4 files changed, 162 insertions(+), 9 deletions(-) diff --git a/src/panfrost/lib/pan_fb.c b/src/panfrost/lib/pan_fb.c index f9b6c85b2ce..3b3c6c86c5f 100644 --- a/src/panfrost/lib/pan_fb.c +++ b/src/panfrost/lib/pan_fb.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2026 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
* SPDX-License-Identifier: MIT */ #include "pan_fb.h" @@ -669,9 +670,124 @@ pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode, } #endif +#if PAN_ARCH >= 14 uint32_t -GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out) +GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, + const struct pan_ptr framebuffer) { + /* Emit the dynamic framebuffer state. That is, state that may change per-layer. */ + + void *out = framebuffer.cpu; + const struct pan_fb_layout *fb = info->fb; + const struct pan_fb_load *load = info->load; + const struct pan_fb_store *store = info->store; + const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info); + const bool has_zs_crc_ext = pan_fb_has_zs(fb); + + struct pan_fbd_layer fbd_data = {0}; + fbd_data.tiler = info->tiler_ctx->valhall.desc; + + /* layer_index in flags0 is used to select the right primitive list in + * the tiler context, and frame_arg is the value that's passed to the + * fragment shader through r62-r63, which we use to pass gl_Layer. Since + * the layer_idx only takes 8-bits, we might use the extra 56-bits we + * have in frame_argument to pass other information to the fragment + * shader at some point. + */ + assert(info->layer >= info->tiler_ctx->valhall.layer_offset); + fbd_data.frame_argument = info->layer; + + pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) { + cfg.pre_frame_0 = pan_fix_frame_shader_mode(info->frame_shaders.modes[0], + ct.rts || ct.zs || ct.s); + cfg.pre_frame_1 = pan_fix_frame_shader_mode(info->frame_shaders.modes[1], + ct.rts || ct.zs || ct.s); + cfg.post_frame = info->frame_shaders.modes[2]; + + /* Enabling prepass without pipelineing is generally not good for + * performance, so disable HSR in that case. 
+ */ + cfg.hsr_prepass_enable = info->allow_hsr_prepass && + pan_fb_can_pipeline_zs(fb); + cfg.hsr_prepass_interleaving_enable = pan_fb_can_pipeline_zs(fb); + cfg.hsr_prepass_filter_enable = true; + cfg.hsr_hierarchical_optimizations_enable = true; + + cfg.internal_layer_index = + info->layer - info->tiler_ctx->valhall.layer_offset; + } + + pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) { + if (fb->s_format != PIPE_FORMAT_NONE) { + cfg.s_clear = load && target_has_clear(&load->s) ? + load->s.clear.stencil : 0; + cfg.s_write_enable = store && store->s.store; + } + + if (fb->z_format != PIPE_FORMAT_NONE) { + cfg.z_internal_format = pan_get_z_internal_format(fb->z_format); + cfg.z_write_enable = store && store->zs.store; + } else { + cfg.z_internal_format = MALI_Z_INTERNAL_FORMAT_D24; + assert(!store || !store->zs.store); + } + } + + fbd_data.z_clear = + util_bitpack_float(fb->z_format != PIPE_FORMAT_NONE && load && load && + target_has_clear(&load->z) + ? load->z.clear.depth + : 0); + + fbd_data.dcd_pointer = info->frame_shaders.dcd_pointer; + + { + /* Set the DBD and RTD pointers. Both must be 64-bytes aligned. 
*/ + uint64_t out_gpu_addr = + framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64); + + if (has_zs_crc_ext) { + fbd_data.dbd_pointer = out_gpu_addr; + assert(fbd_data.dbd_pointer % 64 == 0); + out_gpu_addr += pan_size(ZS_CRC_EXTENSION); + } + + fbd_data.rtd_pointer = out_gpu_addr; + assert(fbd_data.rtd_pointer % 64 == 0); + } + + memcpy(out, &fbd_data, sizeof(fbd_data)); + out += ALIGN_POT(sizeof(fbd_data), 64); + + if (has_zs_crc_ext) { + struct mali_zs_crc_extension_packed zs_crc; + emit_zs_crc_desc(info, ct, &zs_crc); + memcpy(out, &zs_crc, sizeof(zs_crc)); + out += sizeof(zs_crc); + } + + uint32_t tile_rt_offset_B = 0; + for (unsigned rt = 0; rt < fb->rt_count; rt++) { + struct mali_rgb_render_target_packed rgb_rt; + emit_rgb_rt_desc(info, ct, rt, tile_rt_offset_B, &rgb_rt); + memcpy(out, &rgb_rt, sizeof(rgb_rt)); + out += sizeof(rgb_rt); + + if (fb->rt_formats[rt] != PIPE_FORMAT_NONE) { + tile_rt_offset_B += pan_bytes_per_pixel_tib(fb->rt_formats[rt]) * + fb->tile_size_px * fb->sample_count; + } + } + assert(tile_rt_offset_B <= fb->tile_rt_alloc_B); + + return 0; +} +#else /* PAN_ARCH < 14 */ +uint32_t +GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, + const struct pan_ptr framebuffer) +{ + void *out = framebuffer.cpu; const struct pan_fb_layout *fb = info->fb; const struct pan_fb_load *load = info->load; const struct pan_fb_store *store = info->store; @@ -823,4 +939,5 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out) } return tag.opaque[0]; } -#endif +#endif /* PAN_ARCH >= 14 */ +#endif /* PAN_ARCH >= 5 */ diff --git a/src/panfrost/lib/pan_fb.h b/src/panfrost/lib/pan_fb.h index c4635f3f4c2..48bfc888b1c 100644 --- a/src/panfrost/lib/pan_fb.h +++ b/src/panfrost/lib/pan_fb.h @@ -1,14 +1,20 @@ /* * Copyright (C) 2026 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
* SPDX-License-Identifier: MIT */ #ifndef __PAN_FB_H #define __PAN_FB_H +#if PAN_ARCH >= 14 +#include "genxml/cs_builder.h" +#endif + +#include "compiler/shader_enums.h" #include "genxml/gen_macros.h" #include "util/format/u_formats.h" -#include "compiler/shader_enums.h" +#include "pan_pool.h" struct nir_shader; struct nir_shader_compiler_options; @@ -481,7 +487,7 @@ void GENX(pan_fill_fb_info)(const struct pan_fb_desc_info *info, struct pan_fb_info *fbinfo); uint32_t GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, - void *out); + const struct pan_ptr framebuffer); #endif enum ENUM_PACKED pan_fb_shader_op { @@ -620,4 +626,35 @@ GENX(pan_get_fb_shader)(const struct pan_fb_shader_key *key, const struct nir_shader_compiler_options *nir_options); #endif +#if PAN_ARCH >= 14 +/* Framebuffer per-layer state. Keep this structure 64-byte aligned, since + * we want the adjacent ZS_CRC_EXTENSION and RENDER_TARGET descriptors + * aligned. */ +struct pan_fbd_layer { + /** GPU address to the tiler descriptor. */ + uint64_t tiler; + + /** Frame argument. */ + uint64_t frame_argument; + + /** An instance of Fragment Flags 0. */ + struct mali_fragment_flags_0_packed flags0; + + /** An instance of Fragment Flags 2. */ + struct mali_fragment_flags_2_packed flags2; + + /** Z clear value. */ + uint32_t z_clear; + + /** GPU address to the draw call descriptors. It may be 0. */ + uint64_t dcd_pointer; + + /** GPU address to the ZS_CRC_EXTENSION descriptor. It may be 0. */ + uint64_t dbd_pointer; + + /** GPU address to the RENDER_TARGET descriptors. 
*/ + uint64_t rtd_pointer; +} __attribute__((aligned(64))); +#endif /* PAN_ARCH >= 14 */ + #endif /* __PAN_FB_H */ diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 87e7b647df7..55069924624 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -1316,7 +1316,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) tiler_ctx = get_tiler_context(cmdbuf, layer_idx); uint32_t new_fbd_flags = - GENX(pan_emit_fb_desc)(&fbd_info, fbds.cpu + fbd_sz * i); + GENX(pan_emit_fb_desc)(&fbd_info, pan_ptr_offset(fbds, fbd_sz * i)); /* Make sure all FBDs have the same flags. */ assert(i == 0 || new_fbd_flags == fbd_flags); @@ -1335,7 +1335,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) for (uint32_t i = 0; i < enabled_layer_count; i++) { uint32_t layer_idx = multiview ? u_bit_scan(&ir_view_mask_temp) : i; - void *ir_fbd = (void *)((uint8_t *)ir_fbds.cpu + (i * fbd_sz)); fbd_info.layer = layer_idx; tiler_ctx = get_tiler_context(cmdbuf, layer_idx); @@ -1353,8 +1352,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) if (result != VK_SUCCESS) return result; - ASSERTED uint32_t new_fbd_flags = - GENX(pan_emit_fb_desc)(&fbd_info, ir_fbd); + ASSERTED uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)( + &fbd_info, pan_ptr_offset(ir_fbds, fbd_sz * i)); /* Make sure all FBDs have the same flags. 
*/ assert(new_fbd_flags == fbd_flags); diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c index 0579034aea2..9879ca8b112 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c @@ -181,7 +181,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf) fbd_info.layer = layer_id; fbd_info.frame_shaders = fs; fbd_info.frame_shaders.dcd_pointer += layer_id * 3 * pan_size(DRAW); - tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd.cpu); + tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd); result = panvk_cmd_prepare_fragment_job(cmdbuf, tagged_fbd_ptr); if (result != VK_SUCCESS) From 589dedf2f21734f186fdc4f84572f4b7f05edbf9 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 15:28:51 +0200 Subject: [PATCH 09/49] pan/desc: Implement pan_emit_fbd for RUN_FRAGMENT2 Reuses the same structure that is used by pan_emit_fb_desc. Also, modify pan_emit_fbd's signature to take a pan_ptr to the framebuffer memory instead of the CPU-mapped pointer. 
Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/gallium/drivers/panfrost/pan_csf.c | 4 +- src/gallium/drivers/panfrost/pan_jm.c | 4 +- src/panfrost/lib/pan_desc.c | 155 ++++++++++++++++++++++++- src/panfrost/lib/pan_desc.h | 2 +- 4 files changed, 158 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 2246804b85c..357d4cfeff0 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -758,7 +758,7 @@ GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb) (_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS] #define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx) \ GET_FBD(_ctx, _pass).gpu |= \ - GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu) + GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass)) void GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb, @@ -771,7 +771,7 @@ GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb, /* Default framebuffer descriptor */ batch->framebuffer.gpu |= - GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu); + GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer); if (batch->draw_count == 0) return; diff --git a/src/gallium/drivers/panfrost/pan_jm.c b/src/gallium/drivers/panfrost/pan_jm.c index 845c238853e..818846927fd 100644 --- a/src/gallium/drivers/panfrost/pan_jm.c +++ b/src/gallium/drivers/panfrost/pan_jm.c @@ -257,8 +257,8 @@ GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb, { PAN_TRACE_FUNC(PAN_TRACE_GL_JM); - batch->framebuffer.gpu |= GENX(pan_emit_fbd)( - fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu); + batch->framebuffer.gpu |= + GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer); } void diff --git a/src/panfrost/lib/pan_desc.c b/src/panfrost/lib/pan_desc.c index 3df01de0090..cf9f08aae5b 100644 --- 
a/src/panfrost/lib/pan_desc.c +++ b/src/panfrost/lib/pan_desc.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2021 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. * SPDX-License-Identifier: MIT */ @@ -11,6 +12,7 @@ #include "pan_afrc.h" #include "pan_desc.h" #include "pan_encoder.h" +#include "pan_fb.h" #include "pan_props.h" #include "pan_texture.h" #include "pan_trace.h" @@ -1172,11 +1174,156 @@ check_fb_attachments(const struct pan_fb_info *fb) #endif } +#if PAN_ARCH >= 14 unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, const struct pan_tls_info *tls, - const struct pan_tiler_context *tiler_ctx, void *out) + const struct pan_tiler_context *tiler_ctx, + const struct pan_ptr framebuffer) { + void *out = framebuffer.cpu; + + PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC); + + check_fb_attachments(fb); + + const int crc_rt = GENX(pan_select_crc_rt)(fb, fb->tile_size); + const bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0); + const struct pan_clean_tile clean_tile = pan_get_clean_tile_info(fb); + + /* Emit to memory the state that might change per-layer. The static + * state is emitted directly to CSF registers by + * cs_emit_static_fragment_state(). + */ + + struct pan_fbd_layer fbd_data = {0}; + fbd_data.tiler = tiler_ctx->valhall.desc; + + /* internal_layer_index in flags0 is used to select the right + * primitive list in the tiler context, and frame_arg is the value + * that's passed to the fragment shader through r62-r63, which we use + * to pass gl_Layer. Since the layer_idx only takes 8-bits, we might + * use the extra 56-bits we have in frame_argument to pass other + * information to the fragment shader at some point. 
+ */ + assert(layer_idx >= tiler_ctx->valhall.layer_offset); + fbd_data.frame_argument = layer_idx; + + pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) { + cfg.pre_frame_0 = + pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0], + pan_clean_tile_write_any_set(clean_tile)); + cfg.pre_frame_1 = + pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1], + pan_clean_tile_write_any_set(clean_tile)); + cfg.post_frame = fb->bifrost.pre_post.modes[2]; + + const unsigned zs_bytes_per_pixel = pan_zsbuf_bytes_per_pixel(fb); + /* We can interleave HSR if we have space for two ZS tiles in + * the tile buffer. */ + const unsigned max_zs_tile_size_interleave = + fb->z_tile_buf_budget >> util_logbase2_ceil(zs_bytes_per_pixel); + const bool hsr_can_interleave = + fb->tile_size <= max_zs_tile_size_interleave; + + /* Enabling prepass without interleave is generally not good for + * performance, so disable HSR in that case. */ + cfg.hsr_prepass_enable = fb->allow_hsr_prepass && hsr_can_interleave; + cfg.hsr_prepass_interleaving_enable = hsr_can_interleave; + cfg.hsr_prepass_filter_enable = true; + cfg.hsr_hierarchical_optimizations_enable = true; + + cfg.internal_layer_index = layer_idx - tiler_ctx->valhall.layer_offset; + } + + fbd_data.dcd_pointer = fb->bifrost.pre_post.dcds.gpu; + + pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) { + cfg.s_clear = fb->zs.clear_value.stencil; + cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s); + + /* Default to 24 bit depth if there's no surface. */ + cfg.z_internal_format = + fb->zs.view.zs ? 
pan_get_z_internal_format(fb->zs.view.zs->format) + : MALI_Z_INTERNAL_FORMAT_D24; + cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z); + + if (crc_rt >= 0) { + bool *valid = fb->rts[crc_rt].crc_valid; + bool full = !fb->draw_extent.minx && !fb->draw_extent.miny && + fb->draw_extent.maxx == (fb->width - 1) && + fb->draw_extent.maxy == (fb->height - 1); + + /* If the CRC was valid it stays valid, if it wasn't, we must + * ensure the render operation covers the full frame, and + * clean tiles are pushed to memory. */ + bool new_valid = *valid | (full && pan_clean_tile_write_rt_enabled( + clean_tile, crc_rt)); + + cfg.crc_read_enable = *valid; + + /* If the data is currently invalid, still write CRC + * data if we are doing a full write, so that it is + * valid for next time. */ + cfg.crc_write_enable = new_valid; + + *valid = new_valid; + } + } + + fbd_data.z_clear = util_bitpack_float(fb->zs.clear_value.depth); + + { + /* Set the DBD and RTD pointers. Both must be 64-bytes aligned. 
*/ + uint64_t out_gpu_addr = + framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64); + + if (has_zs_crc_ext) { + fbd_data.dbd_pointer = out_gpu_addr; + assert(fbd_data.dbd_pointer % 64 == 0); + out_gpu_addr += pan_size(ZS_CRC_EXTENSION); + } + + fbd_data.rtd_pointer = out_gpu_addr; + assert(fbd_data.rtd_pointer % 64 == 0); + } + + memcpy(out, &fbd_data, sizeof(fbd_data)); + out += ALIGN_POT(sizeof(fbd_data), 64); + + if (has_zs_crc_ext) { + struct mali_zs_crc_extension_packed *zs_crc_ext = out; + pan_emit_zs_crc_ext(fb, layer_idx, crc_rt, zs_crc_ext, clean_tile); + out += pan_size(ZS_CRC_EXTENSION); + } + + const unsigned rt_count = MAX2(fb->rt_count, 1); + unsigned cbuf_offset = 0; + for (unsigned i = 0; i < rt_count; i++) { + pan_emit_rt(fb, layer_idx, i, cbuf_offset, out, clean_tile); + out += pan_size(RENDER_TARGET); + if (!fb->rts[i].view) + continue; + + cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) * + fb->tile_size * + pan_image_view_get_nr_samples(fb->rts[i].view); + + if (i != crc_rt && fb->rts[i].crc_valid != NULL) + *(fb->rts[i].crc_valid) = false; + } + + return 0; +} +#else +unsigned +GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, + const struct pan_tls_info *tls, + const struct pan_tiler_context *tiler_ctx, + const struct pan_ptr framebuffer) +{ + void *out = framebuffer.cpu; + PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC); check_fb_attachments(fb); @@ -1351,6 +1498,7 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, } return tag.opaque[0]; } +#endif /* PAN_ARCH >= 14 */ #else /* PAN_ARCH == 4 */ static enum mali_color_format pan_sfbd_raw_format(unsigned bits) @@ -1378,8 +1526,11 @@ GENX(pan_select_tile_size)(struct pan_fb_info *fb) unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, const struct pan_tls_info *tls, - const struct pan_tiler_context *tiler_ctx, void *fbd) + const struct pan_tiler_context *tiler_ctx, + const struct pan_ptr framebuffer) { + void *fbd = 
framebuffer.cpu; + PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC); assert(fb->rt_count <= 1); diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h index db5b6588ad3..7cc7639c897 100644 --- a/src/panfrost/lib/pan_desc.h +++ b/src/panfrost/lib/pan_desc.h @@ -341,7 +341,7 @@ void GENX(pan_emit_afrc_color_attachment)(const struct pan_attachment_info *att, unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx, const struct pan_tls_info *tls, const struct pan_tiler_context *tiler_ctx, - void *out); + const struct pan_ptr framebuffer); #if PAN_ARCH >= 6 unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height, From 59c6549fc47737843a46aca82814bc6841504588 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 22 Apr 2026 09:13:14 +0200 Subject: [PATCH 10/49] pan/texture: Add v14+ YUV pipe format mappings v14+ no longer uses specific clump formats for YUV. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/lib/pan_texture.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/panfrost/lib/pan_texture.c b/src/panfrost/lib/pan_texture.c index 286b5c18b67..58a413278cf 100644 --- a/src/panfrost/lib/pan_texture.c +++ b/src/panfrost/lib/pan_texture.c @@ -223,6 +223,25 @@ pan_clump_format(enum pipe_format format) /* YUV-sampling has special cases */ if (pan_format_is_yuv(format)) { switch (format) { +#if PAN_ARCH >= 14 + case PIPE_FORMAT_R8G8_R8B8_UNORM: + case PIPE_FORMAT_G8R8_B8R8_UNORM: + case PIPE_FORMAT_R8B8_R8G8_UNORM: + case PIPE_FORMAT_B8R8_G8R8_UNORM: + case PIPE_FORMAT_R8_G8B8_422_UNORM: + case PIPE_FORMAT_R8_B8G8_422_UNORM: + case PIPE_FORMAT_R8_G8B8_420_UNORM: + case PIPE_FORMAT_R8_B8G8_420_UNORM: + case PIPE_FORMAT_R8_G8_B8_420_UNORM: + case PIPE_FORMAT_R8_B8_G8_420_UNORM: + case PIPE_FORMAT_R8G8B8_420_UNORM_PACKED: + return MALI_CLUMP_FORMAT_RAW8; + case PIPE_FORMAT_R10_G10B10_420_UNORM: + case PIPE_FORMAT_R10G10B10_420_UNORM_PACKED: + case PIPE_FORMAT_R10_G10B10_422_UNORM: + 
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM: + return MALI_CLUMP_FORMAT_R10_PACKED; +#else case PIPE_FORMAT_R8G8_R8B8_UNORM: case PIPE_FORMAT_G8R8_B8R8_UNORM: case PIPE_FORMAT_R8B8_R8G8_UNORM: @@ -242,6 +261,7 @@ pan_clump_format(enum pipe_format format) case PIPE_FORMAT_R10_G10B10_422_UNORM: case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM: return MALI_CLUMP_FORMAT_Y10_UV10_422; +#endif /* PAN_ARCH >= 14 */ default: UNREACHABLE("unhandled clump format"); } From 198d3855353199243ba160212a40626a2d0e8f05 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 22 Apr 2026 09:19:22 +0200 Subject: [PATCH 11/49] pan/format: Add v14+ YUV pipe format mappings Map the multiplane and special internal formats to the new v14+ YUV formats. Note v14+ has a much simplified list of formats. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/lib/pan_format.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/panfrost/lib/pan_format.c b/src/panfrost/lib/pan_format.c index f67a3528ebb..7db35f5ac78 100644 --- a/src/panfrost/lib/pan_format.c +++ b/src/panfrost/lib/pan_format.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2019 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
* SPDX-License-Identifier: MIT */ @@ -184,7 +185,27 @@ const struct pan_blendable_format const struct pan_format GENX(pan_pipe_format)[PIPE_FORMAT_COUNT] = { FMT(NONE, CONSTANT, 0000, L, VTR_IB), -#if PAN_ARCH >= 7 +#if PAN_ARCH >= 14 + /* Multiplane formats */ + FMT_YUV(R8G8_R8B8_UNORM, Y8U8Y8V8_422, UVYA, NO_SWAP, CENTER_422, _T____), + FMT_YUV(G8R8_B8R8_UNORM, U8Y8V8Y8_422, UYVA, SWAP, CENTER_422, _T____), + FMT_YUV(R8B8_R8G8_UNORM, Y8U8Y8V8_422, VYUA, NO_SWAP, CENTER_422, _T____), + FMT_YUV(B8R8_G8R8_UNORM, U8Y8V8Y8_422, VUYA, SWAP, CENTER_422, _T____), + FMT_YUV(R8_G8B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____), + FMT_YUV(R8_B8G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____), + FMT_YUV(R8_G8_B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____), + FMT_YUV(R8_B8_G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____), + + FMT_YUV(R8_G8B8_422_UNORM, Y8U8Y8V8_422, YUVA, NO_SWAP, CENTER_422, _T____), + FMT_YUV(R8_B8G8_422_UNORM, U8Y8V8Y8_422, YVUA, NO_SWAP, CENTER_422, _T____), + + FMT_YUV(R10_G10B10_420_UNORM, YUYAAYVYAA_420, YUVA, NO_SWAP, CENTER, _T____), + FMT_YUV(R10_G10B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, YUVA, NO_SWAP, CENTER_422, _T____), + /* special internal formats */ + FMT_YUV(R8G8B8_420_UNORM_PACKED, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____), + FMT_YUV(R10G10B10_420_UNORM_PACKED, Y10U10V10_420, YUVA, NO_SWAP, CENTER, _T____), + FMT_YUV(X6R10X6G10_X6R10X6B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, UVYA, NO_SWAP, CENTER_422, _T____), +#elif PAN_ARCH >= 7 /* Multiplane formats */ FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_422, _T____), FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP, CENTER_422, _T____), From 0c162269c3a294821f73bc3004e990e67b9d2f31 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 22 Apr 2026 09:29:48 +0200 Subject: [PATCH 12/49] pan/afbc: Add v14+ AFBC YUV compression mappings On v14+, many AFBC YUV modes map to generic RGB compression modes. 
Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/lib/pan_afbc.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/panfrost/lib/pan_afbc.h b/src/panfrost/lib/pan_afbc.h index 035b77011b5..f0328a0ba44 100644 --- a/src/panfrost/lib/pan_afbc.h +++ b/src/panfrost/lib/pan_afbc.h @@ -3,6 +3,7 @@ * Copyright (C) 2014 Broadcom * Copyright (C) 2018-2019 Alyssa Rosenzweig * Copyright (C) 2019-2020 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. * SPDX-License-Identifier: MIT */ @@ -711,6 +712,32 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode) case PAN_AFBC_MODE_R16G16B16A16: return MALI_AFBC_COMPRESSION_MODE_R16G16B16A16; #endif +#if PAN_ARCH >= 14 + case PAN_AFBC_MODE_YUV420_6C8: + return MALI_AFBC_COMPRESSION_MODE_Y8U8V8_420; + case PAN_AFBC_MODE_YUV420_2C8: + return MALI_AFBC_COMPRESSION_MODE_R8G8; + case PAN_AFBC_MODE_YUV420_1C8: + return MALI_AFBC_COMPRESSION_MODE_R8; + case PAN_AFBC_MODE_YUV420_6C10: + return MALI_AFBC_COMPRESSION_MODE_Y10U10V10_420; + case PAN_AFBC_MODE_YUV420_2C10: + return MALI_AFBC_COMPRESSION_MODE_R10G10; + case PAN_AFBC_MODE_YUV420_1C10: + return MALI_AFBC_COMPRESSION_MODE_R10; + case PAN_AFBC_MODE_YUV422_4C8: + return MALI_AFBC_COMPRESSION_MODE_Y8U8Y8V8_422; + case PAN_AFBC_MODE_YUV422_2C8: + return MALI_AFBC_COMPRESSION_MODE_R8G8; + case PAN_AFBC_MODE_YUV422_1C8: + return MALI_AFBC_COMPRESSION_MODE_R8; + case PAN_AFBC_MODE_YUV422_4C10: + return MALI_AFBC_COMPRESSION_MODE_Y10U10Y10V10_422; + case PAN_AFBC_MODE_YUV422_2C10: + return MALI_AFBC_COMPRESSION_MODE_R10G10; + case PAN_AFBC_MODE_YUV422_1C10: + return MALI_AFBC_COMPRESSION_MODE_R10; +#else case PAN_AFBC_MODE_YUV420_6C8: return MALI_AFBC_COMPRESSION_MODE_YUV420_6C8; case PAN_AFBC_MODE_YUV420_2C8: @@ -735,6 +762,7 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode) return MALI_AFBC_COMPRESSION_MODE_YUV422_2C10; case PAN_AFBC_MODE_YUV422_1C10: return MALI_AFBC_COMPRESSION_MODE_YUV422_1C10; +#endif /* PAN_ARCH >= 14 */ #if PAN_ARCH == 9 case 
PAN_AFBC_MODE_R16: case PAN_AFBC_MODE_R16G16: From 1e350ef79c2b53225006242d31fc3b6a8a208106 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 22 Apr 2026 09:38:01 +0200 Subject: [PATCH 13/49] pan/afrc: Add v14+ AFRC YUV compression mappings v14+ no longer uses specific AFRC compression formats for YUV. Instead, generic R8/R8G8 and R10/R10G10 formats are used. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/lib/pan_afrc.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/panfrost/lib/pan_afrc.h b/src/panfrost/lib/pan_afrc.h index 4a96eb374ea..306e48fb55e 100644 --- a/src/panfrost/lib/pan_afrc.h +++ b/src/panfrost/lib/pan_afrc.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2023 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. * SPDX-License-Identifier: MIT */ @@ -347,6 +348,25 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier, return (scan ? MALI_AFRC_FORMAT_R10G10B10A10_SCAN : MALI_AFRC_FORMAT_R10G10B10A10_ROT); +#if PAN_ARCH >= 14 + case PAN_AFRC_ICHANGE_FORMAT_YUV444: + case PAN_AFRC_ICHANGE_FORMAT_YUV422: + case PAN_AFRC_ICHANGE_FORMAT_YUV420: + if (info.bpc == 8) { + if (plane == 0 || info.num_planes == 3) + return (scan ? MALI_AFRC_FORMAT_R8_SCAN : MALI_AFRC_FORMAT_R8_ROT); + + return (scan ? MALI_AFRC_FORMAT_R8G8_SCAN : MALI_AFRC_FORMAT_R8G8_ROT); + } + + if (plane == 0 || info.num_planes == 3) + return (scan ? MALI_AFRC_FORMAT_R10_SCAN : MALI_AFRC_FORMAT_R10_ROT); + + assert(info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV422 || + info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV420); + return (scan ? MALI_AFRC_FORMAT_R10G10_SCAN + : MALI_AFRC_FORMAT_R10G10_ROT); +#else case PAN_AFRC_ICHANGE_FORMAT_YUV444: if (info.bpc == 8) { if (plane == 0 || info.num_planes == 3) @@ -394,6 +414,7 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier, return (scan ? 
MALI_AFRC_FORMAT_R10G10_420_SCAN : MALI_AFRC_FORMAT_R10G10_420_ROT); +#endif /* PAN_ARCH >= 14 */ default: return MALI_AFRC_FORMAT_INVALID; From 52d6c192931b23d8603926c0e4ab4e6f936775fe Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 11:40:11 +0200 Subject: [PATCH 14/49] pan/lib: Build for v14 Enable building libpanfrost for v14. Also, modify format mappings to account for the new architecture specification. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/lib/meson.build | 4 ++-- src/panfrost/lib/pan_format.h | 2 ++ src/panfrost/lib/pan_mod.h | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build index 8c5b3d5537d..a4572db619c 100644 --- a/src/panfrost/lib/meson.build +++ b/src/panfrost/lib/meson.build @@ -4,7 +4,7 @@ subdir('kmod') -pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13'] +pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_pixel_format = [] deps_for_libpanfrost = [dep_libdrm, idep_pan_packers, idep_mesautil, libpanfrost_model_dep] @@ -22,7 +22,7 @@ endforeach libpanfrost_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_per_arch += static_library( 'pan-arch-v' + ver, [ diff --git a/src/panfrost/lib/pan_format.h b/src/panfrost/lib/pan_format.h index 08e26f79000..770d8a1bf56 100644 --- a/src/panfrost/lib/pan_format.h +++ b/src/panfrost/lib/pan_format.h @@ -186,6 +186,7 @@ pan_blendable_format_table(unsigned arch) FMT_TABLE(10); FMT_TABLE(12); FMT_TABLE(13); + FMT_TABLE(14); #undef FMT_TABLE default: assert(!"Unsupported architecture"); @@ -216,6 +217,7 @@ pan_format_table(unsigned arch) FMT_TABLE(10); FMT_TABLE(12); FMT_TABLE(13); + FMT_TABLE(14); #undef FMT_TABLE default: assert(!"Unsupported architecture"); diff --git a/src/panfrost/lib/pan_mod.h b/src/panfrost/lib/pan_mod.h index 
25ecaa25d50..1bd9a759a44 100644 --- a/src/panfrost/lib/pan_mod.h +++ b/src/panfrost/lib/pan_mod.h @@ -84,6 +84,7 @@ const struct pan_mod_handler *pan_mod_get_handler_v9(uint64_t modifier); const struct pan_mod_handler *pan_mod_get_handler_v10(uint64_t modifier); const struct pan_mod_handler *pan_mod_get_handler_v12(uint64_t modifier); const struct pan_mod_handler *pan_mod_get_handler_v13(uint64_t modifier); +const struct pan_mod_handler *pan_mod_get_handler_v14(uint64_t modifier); static inline const struct pan_mod_handler * pan_mod_get_handler(unsigned arch, uint64_t modifier) @@ -105,6 +106,8 @@ pan_mod_get_handler(unsigned arch, uint64_t modifier) return pan_mod_get_handler_v12(modifier); case 13: return pan_mod_get_handler_v13(modifier); + case 14: + return pan_mod_get_handler_v14(modifier); default: UNREACHABLE("Unsupported arch"); } From d425c52a8ab664004f809099efd5c06de706bbf9 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 14:45:56 +0200 Subject: [PATCH 15/49] panvk: Hook up RUN_FRAGMENT2 Set the FBD size/alignment correctly and emit the fragment staging registers before issuing fragment commands. Also, move some temporary registers to non-conflicting ones. Incremental rendering is left as TODO for later. 
Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/vulkan/csf/panvk_cmd_buffer.h | 47 +++++- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 147 ++++++++++++++++-- .../vulkan/csf/panvk_vX_exception_handler.c | 32 +++- src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c | 7 +- 4 files changed, 212 insertions(+), 21 deletions(-) diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index 7e7e8922c88..ad2aa434d48 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -74,7 +74,11 @@ static inline uint32_t get_fbd_size(bool has_zs_ext, uint32_t rt_count) { assert(rt_count >= 1 && rt_count <= MAX_RTS); +#if PAN_ARCH >= 14 + uint32_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64); +#else uint32_t fbd_size = pan_size(FRAMEBUFFER); +#endif if (has_zs_ext) fbd_size += pan_size(ZS_CRC_EXTENSION); fbd_size += pan_size(RENDER_TARGET) * rt_count; @@ -209,13 +213,27 @@ enum panvk_cs_regs { PANVK_CS_REG_RUN_IDVS_SR_END = 60, #endif +#if PAN_ARCH >= 14 + /* RUN_FRAGMENT2 staging regs. + * SW ABI: + * - r54:55 contain the pointer to the current FBD layer state. + * - r58:59 contain the pointer to the first tiler descriptor. This is + * needed to gather completed heap chunks after a run_fragment2. + */ + PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0, + PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55, + PANVK_CS_REG_FBD_LAYER_PTR = 54, + PANVK_CS_REG_TILER_DESC_PTR = 58, +#else /* RUN_FRAGMENT staging regs. * SW ABI: - * - r38:39 contain the pointer to the first tiler descriptor. This is + * - r58:59 contain the pointer to the first tiler descriptor. This is * needed to gather completed heap chunks after a run_fragment. */ PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38, PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46, + PANVK_CS_REG_TILER_DESC_PTR = 58, +#endif /* RUN_COMPUTE staging regs. 
*/ PANVK_CS_REG_RUN_COMPUTE_SR_START = 0, @@ -870,4 +888,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages, void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf, struct panvk_cs_deps deps); +#if PAN_ARCH >= 14 +static inline void +cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr) +{ + /* Emit the dynamic fragment state. This state may change per-layer. */ + + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr, + offsetof(struct pan_fbd_layer, flags0)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr, + offsetof(struct pan_fbd_layer, flags2)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr, + offsetof(struct pan_fbd_layer, z_clear)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, tiler)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, rtd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, dbd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr, + offsetof(struct pan_fbd_layer, frame_argument)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, dcd_pointer)); + + cs_flush_loads(b); +} +#endif /* PAN_ARCH >= 14 */ + #endif /* PANVK_CMD_BUFFER_H */ diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 55069924624..f8ce561acb2 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -1245,8 +1245,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) uint32_t fbd_sz = calc_fbd_size(cmdbuf); uint32_t fbds_sz = enabled_layer_count * fbd_sz; - cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem( - cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); +#if PAN_ARCH >= 14 + const unsigned fbds_alignment = alignof(struct 
pan_fbd_layer); +#else + const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER); +#endif + cmdbuf->state.gfx.render.fbds = + panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment); if (!cmdbuf->state.gfx.render.fbds.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; @@ -1323,7 +1328,16 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) fbd_flags = new_fbd_flags; } +#if PAN_ARCH >= 14 + /* fbd_flags is unused on v14+. */ + assert(!fbd_flags); +#endif + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. +#else for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) { struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem( cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); @@ -1366,16 +1380,21 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) /* Wait for IR info push to complete */ cs_wait_slot(b, SB_ID(LS)); +#endif /* PAN_ARCH >= 14 */ bool unset_provoking_vertex = cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET; if (copy_fbds) { - struct cs_index cur_tiler = cs_reg64(b, 38); + struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR); +#if PAN_ARCH >= 14 + struct cs_index dst_fbd_ptr = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR); +#else struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); - struct cs_index fbd_idx = cs_reg32(b, 47); - struct cs_index src_fbd_ptr = cs_reg64(b, 48); - struct cs_index remaining_layers_in_td = cs_reg32(b, 50); +#endif + struct cs_index fbd_idx = cs_reg32(b, 60); + struct cs_index src_fbd_ptr = cs_reg64(b, 64); + struct cs_index remaining_layers_in_td = cs_reg32(b, 61); uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); @@ -1455,10 +1474,16 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) -(full_td_count * pan_size(TILER_CONTEXT))); } } else { +#if PAN_ARCH >= 14 + struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR); +#else + struct 
cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); +#endif + cs_update_frag_ctx(b) { - cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - fbds.gpu | fbd_flags); - cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler); + cs_move64_to(b, fbd_pointer, fbds.gpu | fbd_flags); + cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR), + cmdbuf->state.gfx.render.tiler); } /* If we don't know what provoking vertex mode the application wants yet, @@ -3295,6 +3320,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf) static void setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) { +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. +#else struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; const bool has_zs_ext = pan_fb_has_zs(fb); @@ -3339,6 +3367,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) TILER_OOM_CTX_FIELD_OFFSET(layer_count)); cs_flush_stores(b); +#endif /* PAN_ARCH >= 14 */ } static uint32_t @@ -3347,17 +3376,87 @@ pack_32_2x16(uint16_t lo, uint16_t hi) return (((uint32_t)hi) << 16) | (uint32_t)lo; } +#if PAN_ARCH >= 14 +static void +cs_emit_static_fragment_state(struct cs_builder *b, + struct panvk_cmd_buffer *cmdbuf) +{ + /* Emit the static fragment staging registers. These don't change per-layer. 
*/ + + const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render; + const struct pan_fb_layout *fb = &render->fb.layout; + + const uint8_t sample_count = render->fb.layout.sample_count; + + const struct pan_fb_bbox fb_area_px = + pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px); + const struct pan_fb_bbox bbox_px = + pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px); + + assert(pan_fb_bbox_is_valid(fb->tiling_area_px)); + + struct mali_fragment_bounding_box_packed bbox; + pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) { + cfg.bound_min_x = bbox_px.min_x; + cfg.bound_min_y = bbox_px.min_y; + cfg.bound_max_x = bbox_px.max_x; + cfg.bound_max_y = bbox_px.max_y; + } + + struct mali_frame_size_packed frame_size; + pan_pack(&frame_size, FRAME_SIZE, cfg) { + cfg.width = fb->width_px; + cfg.height = fb->height_px; + } + + cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX), + bbox.opaque[0] | (uint64_t)bbox.opaque[1] << 32); + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]); + cs_move64_to( + b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER), + dev->sample_positions->addr.dev + + pan_sample_positions_offset(pan_sample_pattern(sample_count))); + + /* Flags 1 */ + struct mali_fragment_flags_1_packed flags1; + pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) { + cfg.sample_count = fb->sample_count; + cfg.sample_pattern = pan_sample_pattern(fb->sample_count); + cfg.effective_tile_size = fb->tile_size_px; + cfg.point_sprite_coord_origin_max_y = false; + cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf); + + assert(fb->rt_count > 0); + cfg.render_target_count = fb->rt_count; + cfg.color_buffer_allocation = fb->tile_rt_alloc_B; + } + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]); + + /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. 
*/ +} +#endif /* PAN_ARCH >= 14 */ + static VkResult issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) { +#if PAN_ARCH < 14 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); +#endif const struct cs_tracing_ctx *tracing_ctx = &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing; - const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; /* Now initialize the fragment bits. */ +#if PAN_ARCH >= 14 + struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR); + cs_update_frag_ctx(b) { + cs_emit_static_fragment_state(b, cmdbuf); + cs_emit_layer_fragment_state(b, fbd_pointer); + } +#else + const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout; cs_update_frag_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), pack_32_2x16(fb->tiling_area_px.min_x, @@ -3366,6 +3465,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) pack_32_2x16(fb->tiling_area_px.max_x, fb->tiling_area_px.max_y)); } +#endif bool simul_use = cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; @@ -3397,6 +3497,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) * state for this renderpass, so it's safe to enable. */ struct cs_index addr_reg = cs_scratch_reg64(b, 0); struct cs_index length_reg = cs_scratch_reg32(b, 2); +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. +#else uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf); uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev + handler_idx * dev->tiler_oom.handler_stride; @@ -3404,6 +3507,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride); cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, length_reg); +#endif /* Wait for the tiling to be done before submitting the fragment job. 
*/ wait_finish_tiling(cmdbuf); @@ -3418,8 +3522,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) * up. */ cs_move64_to(b, addr_reg, 0); cs_move32_to(b, length_reg, 0); +#if PAN_ARCH >= 14 + // TODO: Implement IR support for v14. +#else cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, length_reg); +#endif /* Applications tend to forget to describe subpass dependencies, especially * when it comes to write -> read dependencies on attachments. The @@ -3435,8 +3543,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) } if (cmdbuf->state.gfx.render.layer_count <= 1) { +#if PAN_ARCH >= 14 + cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), + false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif } else { struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4); struct cs_index remaining_layers = cs_scratch_reg32(b, 4); @@ -3445,12 +3558,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) { cs_add32(b, remaining_layers, remaining_layers, -1); +#if PAN_ARCH >= 14 + cs_emit_layer_fragment_state(b, fbd_pointer); + cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false, + MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false, MALI_TILE_RENDER_ORDER_Z_ORDER); + struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); +#endif cs_update_frag_ctx(b) - cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz); + cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz); } } @@ -3464,8 +3583,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4); struct cs_index completed_top = cs_scratch_reg64(b, 10); struct cs_index completed_bottom = cs_scratch_reg64(b, 12); - struct cs_index 
cur_tiler = cs_reg64(b, 38); - struct cs_index tiler_count = cs_reg32(b, 47); + struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR); + struct cs_index tiler_count = cs_reg32(b, 60); struct cs_index oq_chain = cs_scratch_reg64(b, 10); struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10); struct cs_index oq_syncobj = cs_scratch_reg64(b, 12); diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c index b4cf6855184..f85c21a0eea 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -13,8 +13,13 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg) { switch (reg) { /* The bbox is set up by the fragment subqueue, we should not modify it. */ +#if PAN_ARCH >= 14 + case 28: + case 29: +#else case 42: case 43: +#endif /* We should only load from the subqueue context. */ case PANVK_CS_REG_SUBQUEUE_CTX_START: case PANVK_CS_REG_SUBQUEUE_CTX_END: @@ -42,8 +47,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8), 8 * sizeof(uint32_t)); +#if PAN_ARCH >= 14 + const size_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64); +#else + const size_t fbd_size = sizeof(struct mali_framebuffer_packed); +#endif + if (has_zs_ext) { - const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed); + const uint16_t dbd_offset = fbd_size; /* Copy the whole DBD. */ cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other, @@ -57,8 +68,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count, } const uint16_t rts_offset = - sizeof(struct mali_framebuffer_packed) + - (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0); + fbd_size + (has_zs_ext ? 
sizeof(struct mali_zs_crc_extension_packed) : 0); for (uint32_t rt = 0; rt < rt_count; rt++) { const uint16_t rt_offset = @@ -110,12 +120,14 @@ generate_tiler_oom_handler(struct panvk_device *dev, .tracebuf_addr_offset = offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs), }; - struct mali_framebuffer_pointer_packed fb_tag; +#if PAN_ARCH < 14 + struct mali_framebuffer_pointer_packed fb_tag; pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) { cfg.zs_crc_extension_present = has_zs_ext; cfg.render_target_count = rt_count; } +#endif cs_function_def(&b, &handler, handler_ctx) { struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b); @@ -140,7 +152,7 @@ generate_tiler_oom_handler(struct panvk_device *dev, struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4); /* The tiler pointer is pre-filled. */ - struct cs_index tiler_ptr = cs_reg64(&b, 38); + struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR); cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr)); @@ -175,12 +187,22 @@ generate_tiler_oom_handler(struct panvk_device *dev, /* Flush copies before the RUN_FRAGMENT. 
*/ cs_wait_slot(&b, SB_ID(LS)); +#if PAN_ARCH >= 14 + /* Set FBD pointer to the scratch fbd */ + struct cs_index fbd_pointer = cs_reg64(&b, PANVK_CS_REG_FBD_LAYER_PTR); + cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0); + cs_emit_layer_fragment_state(&b, fbd_pointer); + + cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false, + MALI_TILE_RENDER_ORDER_Z_ORDER); +#else /* Set FBD pointer to the scratch fbd */ cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), scratch_fbd_ptr_reg, fb_tag.opaque[0]); cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif /* Serialize run fragments since we reuse FBD for the runs */ cs_wait_slots(&b, dev->csf.sb.all_iters_mask); diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c index c4848fe575b..b738be274d3 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c @@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue) tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size; alloc_info.size = get_fbd_size(true, MAX_RTS); - alloc_info.alignment = pan_alignment(FRAMEBUFFER); +#if PAN_ARCH >= 14 + const unsigned fbds_alignment = alignof(struct pan_fbd_layer); +#else + const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER); +#endif + alloc_info.alignment = fbds_alignment; tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) { result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, From fab9558ab81b1c0aac6ff3cfe09961b0e9a3dc7d Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 11 Feb 2026 15:49:39 +0100 Subject: [PATCH 16/49] panvk: Handle provoking vertex and simultaneous reuse on v14 The provoking vertex bit in RUN_FRAGMENT2 is located in a register instead of a descriptor stored in memory. 
That means we don't need to patch memory, resulting in a much leaner implementation compared to RUN_FRAGMENT. Also, implement the simultaneous reuse copy path with the corresponding tiler pointer patching. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 36 +++++++++++++++++++-- src/panfrost/vulkan/panvk_cmd_draw.h | 5 ++- src/panfrost/vulkan/panvk_vX_device.c | 6 ++-- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index f8ce561acb2..7d430591aa6 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -51,6 +51,7 @@ #include "vk_render_pass.h" #include "poly/geometry.h" +#if PAN_ARCH < 14 static enum cs_reg_perm provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg) { @@ -202,6 +203,7 @@ panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev) panvk_priv_bo_unref(dev->draw_ctx->fns_bo); vk_free(&dev->vk.alloc, dev->draw_ctx); } +#endif /* PAN_ARCH < 14 */ static void emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf, @@ -1382,9 +1384,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_wait_slot(b, SB_ID(LS)); #endif /* PAN_ARCH >= 14 */ - bool unset_provoking_vertex = - cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET; - if (copy_fbds) { struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR); #if PAN_ARCH >= 14 @@ -1418,10 +1417,27 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) * framebuffer size is aligned on 64-bytes. */ assert(fbd_sz == ALIGN_POT(fbd_sz, 64)); +#if PAN_ARCH >= 14 + for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) { + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr, + BITFIELD_MASK(16), fbd_off); + + /* Patch the Tiler pointer. 
*/ + if (fbd_off == 0) + cs_add64(b, cs_scratch_reg64(b, 0), cur_tiler, 0); + + cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr, + BITFIELD_MASK(16), fbd_off); + } +#else + bool unset_provoking_vertex = + cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET; for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) { if (fbd_off == 0) { cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr, BITFIELD_MASK(14), fbd_off); + + /* Patch the Tiler pointer. */ cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0); /* If we don't know what provoking vertex mode the @@ -1441,6 +1457,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr, BITFIELD_MASK(16), fbd_off); } +#endif /* Finish stores to pass_dst_fbd_ptr. */ cs_flush_stores(b); @@ -1486,6 +1503,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cmdbuf->state.gfx.render.tiler); } +#if PAN_ARCH < 14 /* If we don't know what provoking vertex mode the application wants yet, * leave space to patch it later */ if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) { @@ -1507,6 +1525,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex) cs_call(b, addr_reg, length_reg); } +#endif } return VK_SUCCESS; @@ -3433,6 +3452,17 @@ cs_emit_static_fragment_state(struct cs_builder *b, } cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]); + /* If we don't know what provoking vertex mode the application wants yet, + * leave space to patch it later */ + if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) { + cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex) + { + /* provoking_vertex flag is bit 14 of Fragment Flags 1. */ + cs_add32(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), + cs_sr_reg32(b, FRAGMENT, FLAGS_1), -(1 << 14)); + } + } + /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. 
*/ } #endif /* PAN_ARCH >= 14 */ diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 8de69cfdb42..7c11787fd44 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -243,7 +243,7 @@ struct panvk_cmd_graphics_state { } \ } while (0) -#if PAN_ARCH >= 10 +#if PAN_ARCH >= 10 && PAN_ARCH < 14 struct panvk_device_draw_context { struct panvk_priv_bo *fns_bo; uint64_t fn_set_fbds_provoking_vertex_stride; @@ -376,8 +376,7 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state, gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \ } while (0) - -#if PAN_ARCH >= 10 +#if PAN_ARCH >= 10 && PAN_ARCH < 14 VkResult panvk_per_arch(device_draw_context_init)(struct panvk_device *dev); diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c index c32d2f279e8..93b8a8e21af 100644 --- a/src/panfrost/vulkan/panvk_vX_device.c +++ b/src/panfrost/vulkan/panvk_vX_device.c @@ -550,7 +550,7 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device, goto err_free_precomp; } -#if PAN_ARCH >= 10 +#if PAN_ARCH >= 10 && PAN_ARCH < 14 result = panvk_per_arch(device_draw_context_init)(device); if (result != VK_SUCCESS) goto err_free_mem_cache; @@ -616,7 +616,7 @@ err_finish_queues: panvk_meta_cleanup(device); err_free_draw_ctx: -#if PAN_ARCH >= 10 +#if PAN_ARCH >= 10 && PAN_ARCH < 14 panvk_per_arch(device_draw_context_cleanup)(device); err_free_mem_cache: #endif @@ -679,7 +679,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device, } panvk_precomp_cleanup(device); -#if PAN_ARCH >= 10 +#if PAN_ARCH >= 10 && PAN_ARCH < 14 panvk_per_arch(device_draw_context_cleanup)(device); #endif panvk_meta_cleanup(device); From 74c0426ae75d32f385cdf765882173cb2ae27970 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 14:04:44 +0200 Subject: [PATCH 17/49] panvk: Build for v14 Enable building panvk for v14. 
Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/vulkan/meson.build | 7 ++++--- src/panfrost/vulkan/panvk_macros.h | 8 ++++++++ src/panfrost/vulkan/panvk_physical_device.c | 2 ++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build index d79bcf885a7..ce06192d50a 100644 --- a/src/panfrost/vulkan/meson.build +++ b/src/panfrost/vulkan/meson.build @@ -14,6 +14,7 @@ panvk_entrypoints = custom_target( '--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7', '--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10', '--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13', + '--device-prefix', 'panvk_v14', '--beta', with_vulkan_beta.to_string() ], depend_files : vk_entrypoints_gen_depend_files, @@ -65,7 +66,7 @@ valhall_archs = [9, 10] valhall_inc_dir = ['valhall'] valhall_files = [] -fifthgen_archs = [12, 13] +fifthgen_archs = [12, 13, 14] fifthgen_inc_dir = ['fifthgen'] fifthgen_files = [] @@ -83,7 +84,7 @@ jm_files = [ 'jm/panvk_vX_gpu_queue.c', ] -csf_archs = [10, 12, 13] +csf_archs = [10, 12, 13, 14] csf_inc_dir = ['csf'] csf_files = [ 'csf/panvk_vX_bind_queue.c', @@ -126,7 +127,7 @@ common_per_arch_files = [ sha1_h, ] -foreach arch : [6, 7, 10, 12, 13] +foreach arch : [6, 7, 10, 12, 13, 14] per_arch_files = common_per_arch_files inc_panvk_per_arch = [] diff --git a/src/panfrost/vulkan/panvk_macros.h b/src/panfrost/vulkan/panvk_macros.h index 940d00522bb..09253ffdb93 100644 --- a/src/panfrost/vulkan/panvk_macros.h +++ b/src/panfrost/vulkan/panvk_macros.h @@ -61,6 +61,9 @@ panvk_catch_indirect_alloc_failure(VkResult error) case 13: \ panvk_arch_name(name, v13)(__VA_ARGS__); \ break; \ + case 14: \ + panvk_arch_name(name, v14)(__VA_ARGS__); \ + break; \ default: \ UNREACHABLE("Unsupported architecture"); \ } \ @@ -84,6 +87,9 @@ panvk_catch_indirect_alloc_failure(VkResult error) case 13: \ ret = panvk_arch_name(name, v13)(__VA_ARGS__); \ break; \ + case 14: \ + ret = 
panvk_arch_name(name, v14)(__VA_ARGS__); \ + break; \ default: \ UNREACHABLE("Unsupported architecture"); \ } \ @@ -102,6 +108,8 @@ panvk_catch_indirect_alloc_failure(VkResult error) #define panvk_per_arch(name) panvk_arch_name(name, v12) #elif PAN_ARCH == 13 #define panvk_per_arch(name) panvk_arch_name(name, v13) +#elif PAN_ARCH == 14 +#define panvk_per_arch(name) panvk_arch_name(name, v14) #else #error "Unsupported arch" #endif diff --git a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c index 1e95c5c9390..bb18df6b49a 100644 --- a/src/panfrost/vulkan/panvk_physical_device.c +++ b/src/panfrost/vulkan/panvk_physical_device.c @@ -64,6 +64,7 @@ PER_ARCH_FUNCS(7); PER_ARCH_FUNCS(10); PER_ARCH_FUNCS(12); PER_ARCH_FUNCS(13); +PER_ARCH_FUNCS(14); static VkResult create_kmod_dev(struct panvk_physical_device *device, @@ -411,6 +412,7 @@ panvk_physical_device_init(struct panvk_physical_device *device, switch (arch) { case 6: case 7: + case 14: if (!os_get_option("PAN_I_WANT_A_BROKEN_VULKAN_DRIVER")) { result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, "WARNING: panvk is not well-tested on v%d, " From f11725a21963d5242461ea09bb6a2b8c688d246d Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 11 Feb 2026 15:50:45 +0100 Subject: [PATCH 18/49] pan: Add v14 support Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/compiler/pan_compiler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/panfrost/compiler/pan_compiler.c b/src/panfrost/compiler/pan_compiler.c index 7325c16f8b6..d1a4dc08eed 100644 --- a/src/panfrost/compiler/pan_compiler.c +++ b/src/panfrost/compiler/pan_compiler.c @@ -52,6 +52,7 @@ pan_get_nir_shader_compiler_options(unsigned arch, bool merge_wg) case 11: case 12: case 13: + case 14: return merge_wg ? 
&bifrost_nir_options_v11_merge_wg : &bifrost_nir_options_v11; default: From 6dedfd66a446e49694bbc32d5e6a01c456c16d06 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Tue, 14 Apr 2026 10:29:54 +0200 Subject: [PATCH 19/49] pan/va: Fix packing test for LdVarBufImmF16 on v11 Encoding for LdVarBufImmF16 on v11 changed compared to v10. Updated the test to check for the right encoding. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- .../bifrost/valhall/test/test-packing.cpp | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp index 0b0a7654437..96e11ee180a 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2021 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. * SPDX-License-Identifier: MIT */ @@ -9,9 +10,9 @@ #include -#define CASE(instr, expected) \ +#define CASE_ARCH(instr, arch, expected) \ do { \ - uint64_t _value = va_pack_instr(instr, 10); \ + uint64_t _value = va_pack_instr(instr, arch); \ if (_value != expected) { \ fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, \ (uint64_t)expected); \ @@ -21,6 +22,8 @@ } \ } while (0) +#define CASE(instr, expected) CASE_ARCH(instr, 10, expected) + class ValhallPacking : public testing::Test { protected: ValhallPacking() @@ -278,11 +281,17 @@ TEST_F(ValhallPacking, LdVarBufImmF16) BI_VECSIZE_V4, 0), 0x005d80843300003d); - CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID, - BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, - BI_VECSIZE_V4, 8), - 0x005d80443308003d); + CASE_ARCH(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, + BI_SAMPLE_CENTROID, BI_SOURCE_FORMAT_F16, + BI_UPDATE_STORE, BI_VECSIZE_V4, 8), + 10, 0x005d80443308003d); + + 
CASE_ARCH(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, + BI_SAMPLE_CENTROID, BI_SOURCE_FORMAT_F16, + BI_UPDATE_STORE, BI_VECSIZE_V4, 8), + 11, 0x005d80443300083d); } TEST_F(ValhallPacking, LeaBufImm) From 95596dbc0cc0437042ec188ab43052f471bd822f Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Tue, 31 Mar 2026 13:05:04 +0200 Subject: [PATCH 20/49] pan/bi,va: Use dedicated LD_VAR_BUF_FLAT* opcodes on v14+ On v14+, flat source formats are no longer supported by LD_VAR_BUF and LD_VAR_BUF_IMM opcodes. This patch makes the compiler emit the dedicated LD_VAR_BUF_FLAT* opcodes instead. Add the ISA definitions, handle the new opcodes, and add packing tests for both immediate and indirect forms. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- .../compiler/bifrost/bifrost_compile.c | 29 +++++++++++++++---- src/panfrost/compiler/bifrost/valhall/ISA.xml | 26 +++++++++++++++++ .../bifrost/valhall/test/test-packing.cpp | 24 +++++++++++++++ .../bifrost/valhall/va_gather_hsr_info.c | 2 ++ .../compiler/bifrost/valhall/va_pack.c | 4 +++ 5 files changed, 80 insertions(+), 5 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 2f08cddc49e..989a36b7046 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -703,8 +703,10 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr) assert(intr->intrinsic == nir_intrinsic_load_var_buf_pan || intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan); + const unsigned arch = b->shader->arch; + /* These are only available on Valhall+ */ - assert(b->shader->arch >= 9); + assert(arch >= 9); const bool flat = intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan; const nir_alu_type src_type = nir_intrinsic_src_type(intr); @@ -757,19 +759,36 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr) bool use_imm_form = false; if 
(nir_src_is_const(intr->src[0])) { imm_offset = nir_src_as_uint(intr->src[0]); - assert(imm_offset < pan_ld_var_buf_off_size(b->shader->arch)); + assert(imm_offset < pan_ld_var_buf_off_size(arch)); use_imm_form = true; } + /* On v14+, flat source formats are removed from LD_VAR_BUF/LD_VAR_BUF_IMM, + * so flat buffer varyings must use the dedicated LD_VAR_BUF_FLAT*. + */ if (use_imm_form) { - bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format, + if (arch >= 14 && flat) { + bi_ld_var_buf_flat_imm_to(b, dest, regfmt, vecsize, imm_offset); + } else { + bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format, BI_UPDATE_STORE, vecsize, imm_offset); + } } else { bi_index offset = bi_src_index(&intr->src[0]); - bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample, - source_format, BI_UPDATE_STORE, vecsize); + if (arch >= 14 && flat) { + bi_ld_var_buf_flat_to(b, dest, offset, regfmt, vecsize); + } else { + bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample, + source_format, BI_UPDATE_STORE, vecsize); + } } + + /* LD_VAR_BUF_FLAT* only support register formats F16 and F32. 
*/ + assert( + arch < 14 || !flat || + (regfmt == BI_REGISTER_FORMAT_F16 || regfmt == BI_REGISTER_FORMAT_F32)); + bi_split_def(b, &intr->def); } diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 43b292f2c57..92d7ecbdc11 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -940,6 +940,32 @@ + + + + + Fetches a given flat varying from hardware buffer + + + + + + + + + + + + + Fetches a given flat varying from hardware buffer + + + + + + + + Interpolates a given varying from hardware buffer diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp index 96e11ee180a..0ac71cc2f4f 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp @@ -294,6 +294,30 @@ TEST_F(ValhallPacking, LdVarBufImmF16) 11, 0x005d80443300083d); } +TEST_F(ValhallPacking, LdVarBufFlatImmFormat) +{ + CASE_ARCH(bi_ld_var_buf_flat_imm_to(b, bi_register(0), + BI_REGISTER_FORMAT_F32, + BI_VECSIZE_V4, 0x12), + 14, 0x0040800832001200); + + CASE_ARCH(bi_ld_var_buf_flat_imm_to(b, bi_register(0), + BI_REGISTER_FORMAT_F16, + BI_VECSIZE_V4, 0x12), + 14, 0x0040800433001200); +} + +TEST_F(ValhallPacking, LdVarBufFlat) +{ + CASE_ARCH(bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4), + 14, 0x005f80083200003d); + + CASE_ARCH(bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4), + 14, 0x005f80043300003d); +} + TEST_F(ValhallPacking, LeaBufImm) { CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))), diff --git a/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c b/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c index 6fc81ebbb12..2d5ca159bd3 100644 --- 
a/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c +++ b/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c @@ -77,6 +77,8 @@ walk_bir_shader(bi_context *ctx, struct pan_shader_info *info) if (instr->sample == BI_SAMPLE_CENTROID) info->fs.hsr.centroid_interpolation = true; FALLTHROUGH; + case BI_OPCODE_LD_VAR_BUF_FLAT: + case BI_OPCODE_LD_VAR_BUF_FLAT_IMM: case BI_OPCODE_LD_VAR_FLAT: case BI_OPCODE_LD_VAR_FLAT_IMM: if (!found_atest) diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index 9665cc1cfd5..fc7ac40ccb8 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -566,6 +566,10 @@ va_pack_alu(const bi_instr *I, unsigned arch) hex |= ((uint64_t)I->sample) << 38; break; + case BI_OPCODE_LD_VAR_BUF_FLAT_IMM: + hex |= ((uint64_t)I->index) << 8; + break; + case BI_OPCODE_LD_ATTR_IMM: hex |= ((uint64_t)I->table) << 16; hex |= ((uint64_t)I->attribute_index) << 20; From 94ec179b55124ec27ca41a86f3fc39ea5108d500 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Wed, 11 Feb 2026 15:52:20 +0100 Subject: [PATCH 21/49] panfrost: Hook up RUN_FRAGMENT2 on the Gallium driver Set the FBD size/alignment correctly and emit the fragment staging registers before issuing fragment commands. Also, move some temporary registers to non-conflicting ones. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/gallium/drivers/panfrost/pan_csf.c | 169 ++++++++++++++++++++++--- 1 file changed, 154 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 357d4cfeff0..d6e143de182 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2023 Collabora Ltd. + * Copyright (C) 2026 Arm Ltd. 
* SPDX-License-Identifier: MIT */ @@ -13,6 +14,7 @@ #include "pan_cmdstream.h" #include "pan_context.h" #include "pan_csf.h" +#include "pan_fb.h" #include "pan_fb_preload.h" #include "pan_job.h" #include "pan_trace.h" @@ -75,6 +77,99 @@ csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr) (PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct pan_ptr)) + \ offsetof(struct pan_ptr, gpu)) +#if PAN_ARCH >= 14 +static void +cs_emit_static_fragment_state(struct cs_builder *b, + struct panfrost_batch *batch, + const struct pan_fb_info *fb) +{ + struct mali_fragment_bounding_box_packed bbox; + pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) { + cfg.bound_min_x = batch->minx; + cfg.bound_min_y = batch->miny; + cfg.bound_max_x = batch->maxx - 1; + cfg.bound_max_y = batch->maxy - 1; + } + + struct mali_frame_size_packed frame_size; + pan_pack(&frame_size, FRAME_SIZE, cfg) { + cfg.width = fb->width; + cfg.height = fb->height; + } + + cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX), + bbox.opaque[0] | ((uint64_t)bbox.opaque[1] << 32)); + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]); + cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER), + fb->sample_positions); + + struct mali_fragment_flags_1_packed flags1; + pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) { + /* The force_samples setting dictates the sample-count that is used + * for rasterization, and works like D3D11's ForcedSampleCount + * feature: + * + * - If force_samples == 0: Let nr_samples dictate sample count + * - If force_samples == 1: force single-sampled rasterization + * - If force_samples > 1: force multi-sampled rasterization + * + * This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the + * fragment shader, even when performing single-sampled rendering.
+ */ + if (fb->pls_enabled) { + cfg.sample_count = 4; + cfg.sample_pattern = pan_sample_pattern(1); + } else if (!fb->force_samples) { + cfg.sample_count = fb->nr_samples; + cfg.sample_pattern = pan_sample_pattern(fb->nr_samples); + } else if (fb->force_samples == 1) { + cfg.sample_count = fb->nr_samples; + cfg.sample_pattern = pan_sample_pattern(1); + } else { + cfg.sample_count = 1; + cfg.sample_pattern = pan_sample_pattern(fb->force_samples); + } + + cfg.effective_tile_size = fb->tile_size; + cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin; + cfg.first_provoking_vertex = fb->first_provoking_vertex; + cfg.render_target_count = MAX2(fb->rt_count, 1); + cfg.color_buffer_allocation = fb->cbuf_allocation; + } + + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]); + + /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */ +} + +#define PAN_CS_REG_FBD_LAYER_PTR 54 + +static inline void +cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr) +{ + /* Emit the dynamic fragment state. This state may change per-layer. 
*/ + + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr, + offsetof(struct pan_fbd_layer, flags0)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr, + offsetof(struct pan_fbd_layer, flags2)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr, + offsetof(struct pan_fbd_layer, z_clear)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, tiler)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, rtd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, dbd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr, + offsetof(struct pan_fbd_layer, frame_argument)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, dcd_pointer)); + + cs_flush_loads(b); +} +#endif /* PAN_ARCH >= 14 */ + static int csf_oom_handler_init(struct panfrost_context *ctx) { @@ -113,13 +208,18 @@ csf_oom_handler_init(struct panfrost_context *ctx) cs_function_def(&b, &handler, handler_ctx) { struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG); - struct cs_index counter = cs_reg32(&b, 47); - struct cs_index zero = cs_reg64(&b, 48); - struct cs_index flush_id = cs_reg32(&b, 48); - struct cs_index tiler_ctx = cs_reg64(&b, 50); - struct cs_index completed_top = cs_reg64(&b, 52); - struct cs_index completed_bottom = cs_reg64(&b, 54); - struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4); + struct cs_index counter = cs_reg32(&b, 31); + struct cs_index zero = cs_reg64(&b, 56); + struct cs_index flush_id = cs_reg32(&b, 58); + struct cs_index tiler_ctx = cs_reg64(&b, 60); + struct cs_index completed_top = cs_reg64(&b, 64); + struct cs_index completed_bottom = cs_reg64(&b, 66); + struct cs_index completed_chunks = cs_reg_tuple(&b, 64, 4); +#if PAN_ARCH >= 14 + struct cs_index fbd_pointer = cs_reg64(&b, 
PAN_CS_REG_FBD_LAYER_PTR); +#else + struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER); +#endif /* Ensure that the OTHER endpoint is valid */ #if PAN_ARCH >= 11 @@ -133,25 +233,31 @@ csf_oom_handler_init(struct panfrost_context *ctx) cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter)); cs_wait_slot(&b, 0); cs_if(&b, MALI_CS_CONDITION_GREATER, counter) { - cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx, - FBD_OFFSET(MIDDLE)); + cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(MIDDLE)); } cs_else(&b) { - cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx, - FBD_OFFSET(FIRST)); + cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(FIRST)); } +#if PAN_ARCH >= 14 + cs_emit_layer_fragment_state(&b, fbd_pointer); +#else cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MIN), tiler_oom_ctx, FIELD_OFFSET(bbox_min)); cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MAX), tiler_oom_ctx, FIELD_OFFSET(bbox_max)); cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0); cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0); +#endif cs_wait_slot(&b, 0); /* Run the fragment job and wait */ cs_select_endpoint_sb(&b, 3); +#if PAN_ARCH >= 14 + cs_run_fragment2(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif cs_wait_slot(&b, 3); /* Increment counter */ @@ -218,6 +324,21 @@ GENX(csf_cleanup_batch)(struct panfrost_batch *batch) panfrost_pool_cleanup(&batch->csf.cs_chunk_pool); } +#if PAN_ARCH >= 14 +static inline struct pan_ptr +alloc_fbd(struct panfrost_batch *batch) +{ + const struct pan_desc_alloc_info fbd_layer = { + .size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64), + .align = alignof(struct pan_fbd_layer), + .nelems = 1, + }; + + return pan_pool_alloc_desc_aggregate( + &batch->pool.base, fbd_layer, PAN_DESC(ZS_CRC_EXTENSION), + PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); +} +#else static inline 
struct pan_ptr alloc_fbd(struct panfrost_batch *batch) { @@ -225,6 +346,7 @@ alloc_fbd(struct panfrost_batch *batch) &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION), PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); } +#endif /* PAN_ARCH >= 14 */ int GENX(csf_init_batch)(struct panfrost_batch *batch) @@ -854,15 +976,26 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, cs_vt_end(b, cs_now()); } +#if PAN_ARCH >= 14 + struct cs_index fbd_pointer = cs_reg64(b, PAN_CS_REG_FBD_LAYER_PTR); +#else + struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); +#endif + /* Set up the fragment job */ - cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - batch->framebuffer.gpu); + cs_move64_to(b, fbd_pointer, batch->framebuffer.gpu); + +#if PAN_ARCH >= 14 + cs_emit_static_fragment_state(b, batch, pfb); + cs_emit_layer_fragment_state(b, fbd_pointer); +#else cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), (batch->miny << 16) | batch->minx); cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX), ((batch->maxy - 1) << 16) | (batch->maxx - 1)); cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, TEM_POINTER), 0); cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, TEM_ROW_STRIDE), 0); +#endif /* Use different framebuffer descriptor if incremental rendering was * triggered while tiling */ @@ -871,13 +1004,19 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0); cs_wait_slot(b, 0); cs_if(b, MALI_CS_CONDITION_GREATER, counter) { - cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - GET_FBD(oom_ctx, LAST).gpu); + cs_move64_to(b, fbd_pointer, GET_FBD(oom_ctx, LAST).gpu); +#if PAN_ARCH >= 14 + cs_emit_layer_fragment_state(b, fbd_pointer); +#endif } } /* Run the fragment job and wait */ +#if PAN_ARCH >= 14 + cs_run_fragment2(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif cs_wait_slot(b, 2); /* Gather freed heap 
chunks and add them to the heap context free list From 4abd3ce7447d840fbce0bfd2fd7b9200800e8711 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 15:35:38 +0200 Subject: [PATCH 22/49] panfrost: Build the Gallium driver for v14 Enable building panfrost for v14. Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/gallium/drivers/panfrost/meson.build | 4 ++-- src/gallium/drivers/panfrost/pan_cmdstream.c | 2 +- src/gallium/drivers/panfrost/pan_screen.c | 3 +++ src/gallium/drivers/panfrost/pan_screen.h | 1 + 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index 5b3e5e41d97..ba243f5a4ed 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -41,7 +41,7 @@ compile_args_panfrost = [ '-Wno-pointer-arith' ] -panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13'] +panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_versions = [] foreach ver : panfrost_versions @@ -54,7 +54,7 @@ foreach ver : panfrost_versions ] if ver in ['4', '5', '6', '7', '9'] files_panfrost_vx += ['pan_jm.c'] - elif ver in ['10', '12', '13'] + elif ver in ['10', '12', '13', '14'] files_panfrost_vx += ['pan_csf.c'] endif libpanfrost_versions += static_library( diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 87a3cbbe7ea..aa32944195f 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -49,7 +49,7 @@ * functions. 
*/ #if PAN_ARCH <= 9 #define JOBX(__suffix) GENX(jm_##__suffix) -#elif PAN_ARCH <= 13 +#elif PAN_ARCH <= 14 #define JOBX(__suffix) GENX(csf_##__suffix) #else #error "Unsupported arch" diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index 86d28d2de7a..ede056ba82f 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -1175,6 +1175,9 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config, case 13: panfrost_cmdstream_screen_init_v13(screen); break; + case 14: + panfrost_cmdstream_screen_init_v14(screen); + break; default: debug_printf("panfrost: Unhandled architecture major %d", dev->arch); panfrost_destroy_screen(&(screen->base)); diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index 14eb7ea59fd..9e6b95d008d 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -155,6 +155,7 @@ void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v13(struct panfrost_screen *screen); +void panfrost_cmdstream_screen_init_v14(struct panfrost_screen *screen); #define perf_debug(ctx, ...) 
\ do { \ From c9e740a80e791354fcd1514cb7f9158286b70d19 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 15:40:58 +0200 Subject: [PATCH 23/49] panfrost: Advertise Mali-G1-Pro support Reviewed-by: Lars-Ivar Hesselberg Simonsen --- src/panfrost/model/pan_model.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/panfrost/model/pan_model.c b/src/panfrost/model/pan_model.c index f9861ace8dc..4b28c4067fb 100644 --- a/src/panfrost/model/pan_model.c +++ b/src/panfrost/model/pan_model.c @@ -95,6 +95,10 @@ const struct pan_model pan_model_list[] = { MODEL_RATES(4, 8, 128)), FIFTHGEN_MODEL(PAN_PROD_ID(13, 8, 0), 4, "G725", "TKRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536), MODEL_RATES(4, 8, 128)), + FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 1, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536), + MODEL_RATES(4, 8, 64)), + FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 4, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536), + MODEL_RATES(4, 8, 128)), }; /* clang-format on */ From 2f6a4e76928d88d7f908aa1a65069f97f195ae76 Mon Sep 17 00:00:00 2001 From: Marc Alcala Prieto Date: Fri, 17 Apr 2026 15:41:20 +0200 Subject: [PATCH 24/49] docs/panfrost: Advertise Mali-G1-Pro support Reviewed-by: Lars-Ivar Hesselberg Simonsen --- docs/drivers/panfrost.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/drivers/panfrost.rst b/docs/drivers/panfrost.rst index 2e214ded1e9..d9e3a618128 100644 --- a/docs/drivers/panfrost.rst +++ b/docs/drivers/panfrost.rst @@ -34,6 +34,8 @@ The following hardware is currently supported: +--------------------+---------------+-----------+--------+--------+ | G725 | 5th Gen (v13) | 3.1 | 3.1 | 1.4 | +--------------------+---------------+-----------+--------+--------+ +| G1-Pro | 5th Gen (v14) | 3.1 | 3.1 | 1.4 | ++--------------------+---------------+-----------+--------+--------+ Other Midgard and Bifrost chips (e.g. G71) are not yet supported.
From 769eddfecacf6951ca7513004e7d897ff67ce235 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 20 Apr 2026 15:03:01 +0200 Subject: [PATCH 25/49] pan/va: Use preload abstraction for blend shader regs A couple of preloads were missed when implementing the preload register abstraction. This fix is not required prior to v15, but marking it as a bug fix for consistency. Fixes: 1f0370616a6 ("pan: Centralize preload registers") --- src/panfrost/compiler/bifrost/bifrost_compile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 989a36b7046..8f459a1d8e6 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -4698,7 +4698,7 @@ bi_compile_variant(nir_shader *nir, uint64_t preload = first_block->reg_live_in; /* If multisampling is used with a blend shader, the blend shader needs - * to access the sample coverage mask in r60 and the sample ID in r61. + * to access the sample coverage mask and the sample ID. * Blend shaders run in the same context as fragment shaders, so if a * blend shader could run, we need to preload these registers * conservatively. There is believed to be little cost to doing so, so @@ -4709,7 +4709,10 @@ bi_compile_variant(nir_shader *nir, * driver. We could unify the paths if the cost is acceptable. 
*/ if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9) - preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61); + preload |= + BITFIELD64_BIT( + bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch)) | + BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, ctx->arch)); info->ubo_mask |= ctx->ubo_mask; info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); From 1a374e1f04c8b9e20478290589effa2823bf5eda Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 27 Apr 2026 12:51:06 +0200 Subject: [PATCH 26/49] pan/va/ISA: Remove non-existent register_type Register_type does not exist in Valhall and was not actually being packed. --- src/panfrost/compiler/bifrost/valhall/ISA.xml | 21 ------------------- .../bifrost/valhall/test/assembler-cases.txt | 8 +++---- .../compiler/bifrost/valhall/va_pack.c | 21 ------------------- .../compiler/bifrost/valhall/valhall.py | 1 - 4 files changed, 4 insertions(+), 47 deletions(-) diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 92d7ecbdc11..9c412ebf469 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -556,14 +556,6 @@ gather4_a - - Unsized type, part of a register format. - - f - u - s - - Untyped size, part of a register format. 16
- @@ -3717,7 +3704,6 @@ - @@ -3742,7 +3728,6 @@ - @@ -3768,7 +3753,6 @@ - @@ -3794,7 +3778,6 @@ - @@ -3819,7 +3802,6 @@ - @@ -3844,7 +3826,6 @@ - @@ -3870,7 +3851,6 @@ - @@ -3896,7 +3876,6 @@ - diff --git a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt index d7eca3c60de..ac40685c05c 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt +++ b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt @@ -213,7 +213,7 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0 f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0 c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0 c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1 -80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0 +80 00 c0 13 34 7c 25 01 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0 80 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w0 81 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w1 8a 00 00 00 00 c1 91 02 MOV.i32 r1, u37.w0 @@ -221,9 +221,9 @@ c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1 32 00 80 18 02 4c 68 08 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0 32 00 00 18 02 8c 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0 32 00 00 18 00 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @, [r50:r51], offset:0x0 -82 00 80 15 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0 -82 20 80 15 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0 -82 20 80 1d 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.s.32.2d.computed.wait0126 @r0, u1, u0.w0 +82 00 80 11 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0 +82 20 80 11 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0 +82 20 80 11 84 80 38 41 
VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0 40 c0 c0 80 03 c0 f0 10 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0 42 43 40 01 01 c0 f8 00 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^ 42 c0 c0 c2 03 c0 f0 10 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0 diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index fc7ac40ccb8..d7f42168c7a 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -800,27 +800,6 @@ va_pack_lod_mode(const bi_instr *I) invalid_instruction(I, "LOD mode"); } -static enum va_register_type -va_pack_register_type(const bi_instr *I) -{ - switch (I->register_format) { - case BI_REGISTER_FORMAT_F16: - case BI_REGISTER_FORMAT_F32: - return VA_REGISTER_TYPE_F; - - case BI_REGISTER_FORMAT_U16: - case BI_REGISTER_FORMAT_U32: - return VA_REGISTER_TYPE_U; - - case BI_REGISTER_FORMAT_S16: - case BI_REGISTER_FORMAT_S32: - return VA_REGISTER_TYPE_S; - - default: - invalid_instruction(I, "register type"); - } -} - static enum va_register_format va_pack_register_format(const bi_instr *I) { diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py index 7bfaa1489d5..7cae9521b87 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.py @@ -404,7 +404,6 @@ def valhall_parse_isa(xmlfile): "lod_bias_disable": Modifier("lod_mode", 13, 1), "lod_clamp_disable": Modifier("lod_mode", 14, 1), "write_mask": Modifier("write_mask", 22, 4), - "register_type": Modifier("register_type", 26, 2), "dimension": Modifier("dimension", 28, 2), "skip": Flag("skip", 39), "register_width": Modifier("register_width", 46, 1, force_enum = "register_width"), From 6e8b73ca767bba8eaed30da1cf9f2b0cdee6a4b6 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 27 Apr 2026 14:06:42 +0200 Subject: [PATCH 27/49] pan/va/compiler: 
Fix broken ATOM1_RETURN asm/disasm test --- src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt index ac40685c05c..26e389697f4 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt +++ b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt @@ -220,7 +220,7 @@ c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1 30 00 f7 1b 02 cc 20 09 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0 32 00 80 18 02 4c 68 08 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0 32 00 00 18 02 8c 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0 -32 00 00 18 00 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @, [r50:r51], offset:0x0 +32 00 00 18 02 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0 82 00 80 11 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0 82 20 80 11 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0 82 20 80 11 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0 From df8f2d8896acb02b6fecb746d62c2879ccdb6a42 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 4 May 2026 14:25:55 +0200 Subject: [PATCH 28/49] drm-uapi: Add panthor v15 uapi changes This is currently based on the uapi in the following MR: https://gitlab.freedesktop.org/panfrost/linux/-/merge_requests/65 --- include/drm-uapi/panthor_drm.h | 85 ++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/include/drm-uapi/panthor_drm.h b/include/drm-uapi/panthor_drm.h index e238c6264fa..2ecc50eade2 100644 --- a/include/drm-uapi/panthor_drm.h +++ b/include/drm-uapi/panthor_drm.h @@ -350,7 +350,7 @@ struct 
drm_panthor_gpu_info { __u32 as_present; /** - * @select_coherency: Coherency selected for this device. + * @selected_coherency: Coherency selected for this device. * * One of drm_panthor_gpu_coherency. */ @@ -368,11 +368,27 @@ struct drm_panthor_gpu_info { /** @core_features: Used to discriminate core variants when they exist. */ __u32 core_features; - /** @pad: MBZ. */ - __u32 pad; + /** @thread_num_active_granularity: Granularity of number of active threads */ + __u32 thread_num_active_granularity; /** @gpu_features: Bitmask describing supported GPU-wide features */ __u64 gpu_features; + + /** @gpu_wide_id: 64-bit GPU_ID for v15 onwards. */ + __u64 gpu_wide_id; +#define DRM_PANTHOR_WIDE_ARCH_MAJOR(x) (((x) >> 56) & 0xff) +#define DRM_PANTHOR_WIDE_ARCH_MINOR(x) (((x) >> 48) & 0xff) +#define DRM_PANTHOR_WIDE_ARCH_REV(x) (((x) >> 40) & 0xff) +#define DRM_PANTHOR_WIDE_PRODUCT_MAJOR(x) (((x) >> 32) & 0xff) +#define DRM_PANTHOR_WIDE_VERSION_MAJOR(x) (((x) >> 16) & 0xff) +#define DRM_PANTHOR_WIDE_VERSION_MINOR(x) (((x) >> 8) & 0xff) +#define DRM_PANTHOR_WIDE_VERSION_STATUS(x) ((x) & 0xff) + + /** @gpu_rev_wide: 64-bit GPU revision for v15 onwards */ + __u64 gpu_rev_wide; + + /** @l2_features_wide: 64-bit L2_FEATURES for v15 onwards */ + __u64 l2_features_wide; }; /** @@ -409,6 +425,38 @@ struct drm_panthor_csif_info { __u32 pad; }; +/** + * enum drm_panthor_timestamp_info_flags - drm_panthor_timestamp_info.flags + */ +enum drm_panthor_timestamp_info_flags { + /** @DRM_PANTHOR_TIMESTAMP_GPU: Query GPU time. */ + DRM_PANTHOR_TIMESTAMP_GPU = 1 << 0, + + /** @DRM_PANTHOR_TIMESTAMP_CPU_NONE: Don't query CPU time. */ + DRM_PANTHOR_TIMESTAMP_CPU_NONE = 0 << 1, + + /** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC: Query CPU time using CLOCK_MONOTONIC. */ + DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC = 1 << 1, + + /** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW: Query CPU time using CLOCK_MONOTONIC_RAW. 
*/ + DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW = 2 << 1, + + /** @DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK: Space reserved for CPU clock type. */ + DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK = 7 << 1, + + /** @DRM_PANTHOR_TIMESTAMP_GPU_OFFSET: Query GPU offset. */ + DRM_PANTHOR_TIMESTAMP_GPU_OFFSET = 1 << 4, + + /** @DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT: Query GPU cycle count. */ + DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT = 1 << 5, + + /** @DRM_PANTHOR_TIMESTAMP_FREQ: Query timestamp frequency. */ + DRM_PANTHOR_TIMESTAMP_FREQ = 1 << 6, + + /** @DRM_PANTHOR_TIMESTAMP_DURATION: Return duration of time query. */ + DRM_PANTHOR_TIMESTAMP_DURATION = 1 << 7, +}; + /** * struct drm_panthor_timestamp_info - Timestamp information * * @@ -421,11 +469,38 @@ struct drm_panthor_timestamp_info { */ __u64 timestamp_frequency; - /** @current_timestamp: The current timestamp. */ + /** @current_timestamp: The current GPU timestamp. */ __u64 current_timestamp; - /** @timestamp_offset: The offset of the timestamp timer. */ + /** @timestamp_offset: The offset of the GPU timestamp timer. */ __u64 timestamp_offset; + + /** + * @flags: Bitmask of drm_panthor_timestamp_info_flags. + * + * If set to 0, then it is interpreted as: + * DRM_PANTHOR_TIMESTAMP_GPU | + * DRM_PANTHOR_TIMESTAMP_GPU_OFFSET | + * DRM_PANTHOR_TIMESTAMP_FREQ + * + * Note: these flags are exclusive to each other (only one can be used): + * - DRM_PANTHOR_TIMESTAMP_CPU_NONE + * - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC + * - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW + */ + __u32 flags; + + /** @duration_nsec: Duration of time query. */ + __u32 duration_nsec; + + /** @cycle_count: Value of GPU_CYCLE_COUNT. */ + __u64 cycle_count; + + /** @cpu_timestamp_sec: Seconds part of CPU timestamp. */ + __u64 cpu_timestamp_sec; + + /** @cpu_timestamp_nsec: Nanoseconds part of CPU timestamp. 
*/ + __u64 cpu_timestamp_nsec; }; /** From 003becf081659535452c23505a3167330c4e4ec8 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 5 May 2026 11:08:29 +0200 Subject: [PATCH 29/49] pan: Add handling for v15+ uapi gpu_id Since v15, gpu_ids are 64 bit, so they need to be handled differently. To ease this, a compat value of 0xF is found in what previously used to be ARCH_MAJOR, which we can use to decide whether to read information from the full 64 bits. Since we now cannot pass gpu_id directly as deviceID, align with the DDK on what fields to expose. --- src/panfrost/clc/pan_compile.c | 8 ++++- src/panfrost/lib/kmod/panthor_kmod.c | 6 +++- src/panfrost/model/pan_model.h | 30 +++++++++++++++---- .../vulkan/panvk_vX_physical_device.c | 14 ++++++++- 4 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/panfrost/clc/pan_compile.c b/src/panfrost/clc/pan_compile.c index 3a34897c21b..593bfe2de07 100644 --- a/src/panfrost/clc/pan_compile.c +++ b/src/panfrost/clc/pan_compile.c @@ -10,6 +10,7 @@ #include "panfrost/compiler/bifrost/bifrost_compile.h" #include "panfrost/compiler/pan_compiler.h" #include "panfrost/compiler/pan_nir.h" +#include "panfrost/model/pan_model.h" #include "nir.h" #include "nir_builder.h" #include "nir_builder_opcodes.h" @@ -353,7 +354,12 @@ main(int argc, const char **argv) libfunc, MESA_SHADER_COMPUTE, v, get_compiler_options(target_arch), &opt, load_kernel_input); - uint64_t target_gpu_id = (target_arch & 0xf) << 28; + uint64_t target_gpu_id; + if (target_arch >= PAN_ID64_COMPAT) + target_gpu_id = + ((uint64_t)(target_arch & 0xff) << 56) | (PAN_ID64_COMPAT << 28); + else + target_gpu_id = (target_arch & 0xf) << 28; struct pan_compile_inputs inputs = { .gpu_id = target_gpu_id, diff --git a/src/panfrost/lib/kmod/panthor_kmod.c b/src/panfrost/lib/kmod/panthor_kmod.c index c2e774eab87..8900ea5ac5e 100644 --- a/src/panfrost/lib/kmod/panthor_kmod.c +++ b/src/panfrost/lib/kmod/panthor_kmod.c @@ -153,8 +153,12 @@ 
panthor_dev_query_props(struct panthor_kmod_dev *panthor_dev) { struct pan_kmod_dev_props *props = &panthor_dev->base.props; + bool is_gpu_wide = panthor_dev->props.gpu.gpu_id == 0; + assert(!is_gpu_wide || panthor_dev->props.gpu.gpu_wide_id); + *props = (struct pan_kmod_dev_props){ - .gpu_id = panthor_dev->props.gpu.gpu_id, + .gpu_id = is_gpu_wide ? panthor_dev->props.gpu.gpu_wide_id + : panthor_dev->props.gpu.gpu_id, .gpu_variant = panthor_dev->props.gpu.core_features & 0xff, .shader_present = panthor_dev->props.gpu.shader_present, .tiler_features = panthor_dev->props.gpu.tiler_features, diff --git a/src/panfrost/model/pan_model.h b/src/panfrost/model/pan_model.h index 8eb7980b633..d1edf9a7eb6 100644 --- a/src/panfrost/model/pan_model.h +++ b/src/panfrost/model/pan_model.h @@ -31,6 +31,15 @@ struct pan_tiler_features { #define PAN_VERSION_MINOR(x) (((x) & BITFIELD_RANGE(4, 8)) >> 4) #define PAN_VERSION_STATUS(x) ((x) & BITFIELD_RANGE(0, 4)) +#define PAN_ID64_COMPAT 0xFull +#define PAN_ID64_ARCH_MAJOR(x) (((x) & BITFIELD64_RANGE(56, 8)) >> 56) +#define PAN_ID64_ARCH_MINOR(x) (((x) & BITFIELD64_RANGE(48, 8)) >> 48) +#define PAN_ID64_ARCH_REV(x) (((x) & BITFIELD64_RANGE(40, 8)) >> 40) +#define PAN_ID64_PRODUCT_MAJOR(x) (((x) & BITFIELD64_RANGE(32, 8)) >> 32) +#define PAN_ID64_VERSION_MAJOR(x) (((x) & BITFIELD64_RANGE(16, 8)) >> 16) +#define PAN_ID64_VERSION_MINOR(x) (((x) & BITFIELD64_RANGE(8, 8)) >> 8) +#define PAN_ID64_VERSION_STATUS(x) ((x) & BITFIELD64_RANGE(0, 8)) + /* GPU product id for Midgard */ #define MIDGARD_PROD_ID(x) (((x) & BITFIELD_RANGE(16, 16)) >> 16) @@ -108,8 +117,12 @@ pan_arch(uint64_t gpu_id) case 0x860: case 0x880: return 5; - default: - return PAN_ARCH_MAJOR(gpu_id); + default: { + unsigned gpu_arch = PAN_ARCH_MAJOR(gpu_id); + if (gpu_arch == PAN_ID64_COMPAT) + return PAN_ID64_ARCH_MAJOR(gpu_id); + return gpu_arch; + } } } @@ -119,14 +132,21 @@ pan_prod_id(uint64_t gpu_id) unsigned arch = pan_arch(gpu_id); if (arch < 6) return 
MIDGARD_PROD_ID(gpu_id); - return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id), - PAN_PRODUCT_MAJOR(gpu_id)); + else if (arch < PAN_ID64_COMPAT) + return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id), + PAN_PRODUCT_MAJOR(gpu_id)); + return PAN_PROD_ID(PAN_ID64_ARCH_MAJOR(gpu_id), PAN_ID64_ARCH_MINOR(gpu_id), + PAN_ID64_PRODUCT_MAJOR(gpu_id)); } static inline uint32_t pan_rev(uint64_t gpu_id) { - return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id)); + unsigned arch = pan_arch(gpu_id); + if (arch < PAN_ID64_COMPAT) + return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id)); + return PAN_REV(PAN_ID64_VERSION_MAJOR(gpu_id), + PAN_ID64_VERSION_MINOR(gpu_id)); } #endif diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c index a0845b6987e..0db46da50f6 100644 --- a/src/panfrost/vulkan/panvk_vX_physical_device.c +++ b/src/panfrost/vulkan/panvk_vX_physical_device.c @@ -698,6 +698,18 @@ get_conformance_version() return (VkConformanceVersion){0, 0, 0, 0}; } +static uint32_t +get_device_id(uint64_t gpu_id) +{ + if (PAN_ARCH >= PAN_ID64_COMPAT) + return ((PAN_ID64_COMPAT << 28) | (PAN_ID64_ARCH_MAJOR(gpu_id) << 20) | + (PAN_ID64_ARCH_MINOR(gpu_id) << 12) | + ((PAN_ID64_PRODUCT_MAJOR(gpu_id) & 0xF) << 8) | + ((PAN_ID64_VERSION_MAJOR(gpu_id) & 0xF) << 4) | + (PAN_ID64_VERSION_MINOR(gpu_id) & 0xF)); + return (gpu_id & 0xFFFFFFFF); +} + void panvk_per_arch(get_physical_device_properties)( const struct panvk_instance *instance, @@ -736,7 +748,7 @@ panvk_per_arch(get_physical_device_properties)( .driverVersion = vk_get_driver_version(), .vendorID = instance->force_vk_vendor ? 
instance->force_vk_vendor : ARM_VENDOR_ID, - .deviceID = device->kmod.dev->props.gpu_id, + .deviceID = get_device_id(device->kmod.dev->props.gpu_id), .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, /* Vulkan 1.0 limits */ From 1f1282801170170bd225fb4794071c29ec7f11ca Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 23 Mar 2026 14:05:55 +0100 Subject: [PATCH 30/49] pan: Add handling for v15+ uapi thread_max_wg_size thread_max_workgroup_size has been replaced with thread_num_active_granularity in v15, which requires updated handling for calculating the max number of threads in a workgroup --- src/gallium/drivers/panfrost/pan_cmdstream.c | 3 ++- src/gallium/drivers/panfrost/pan_precomp.c | 5 +++-- src/panfrost/lib/kmod/pan_kmod.h | 3 +++ src/panfrost/lib/kmod/panthor_kmod.c | 8 +++++-- src/panfrost/lib/pan_desc.h | 13 +++++++---- src/panfrost/lib/pan_props.c | 9 ++++++++ .../vulkan/csf/panvk_vX_cmd_dispatch.c | 8 ++++--- .../vulkan/csf/panvk_vX_cmd_precomp.c | 3 ++- .../vulkan/jm/panvk_vX_cmd_dispatch.c | 5 +++-- .../vulkan/panvk_vX_physical_device.c | 22 ++++++++++++------- 10 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index aa32944195f..5294b5831da 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, .tls.size = ss->info.tls_size, .wls.size = ss->info.wls_size + grid->variable_shared_mem, .wls.instances = pan_calc_wls_instances( - &local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim), + &local_size, &dev->kmod.dev->props, grid->indirect ? 
NULL : &dim, + ss->info.work_reg_count), }; if (ss->info.tls_size) { diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index c9b0c9b62fe..da2d3f51f7d 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -197,8 +197,9 @@ emit_tls(struct panfrost_batch *batch, struct pan_tls_info info = { .tls.size = shader->info.tls_size, .wls.size = shader->info.wls_size, - .wls.instances = pan_calc_wls_instances(&shader->local_size, - &dev->kmod.dev->props, dim), + .wls.instances = + pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim, + shader->info.work_reg_count), }; if (info.tls.size) { diff --git a/src/panfrost/lib/kmod/pan_kmod.h b/src/panfrost/lib/kmod/pan_kmod.h index e7356330e7d..a876afeafe3 100644 --- a/src/panfrost/lib/kmod/pan_kmod.h +++ b/src/panfrost/lib/kmod/pan_kmod.h @@ -206,6 +206,9 @@ struct pan_kmod_dev_props { /* Maximum number of threads per workgroup. */ uint32_t max_threads_per_wg; + /* Granularity of number of active threads. */ + uint32_t num_threads_active_granularity; + /* Number of registers per core. Can be used to determine the maximum * number of threads that can be allocated for a specific shader based on * the number of registers assigned to this shader. diff --git a/src/panfrost/lib/kmod/panthor_kmod.c b/src/panfrost/lib/kmod/panthor_kmod.c index 8900ea5ac5e..2c723f7d506 100644 --- a/src/panfrost/lib/kmod/panthor_kmod.c +++ b/src/panfrost/lib/kmod/panthor_kmod.c @@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev) props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24; props->num_registers_per_core = panthor_dev->props.gpu.thread_features & 0x3fffff; + props->num_threads_active_granularity = + panthor_dev->props.gpu.thread_num_active_granularity; /* We assume that all thread properties are populated. 
If we ever have a GPU * that have one of the THREAD_xxx register that's zero, we can always add a * quirk here. */ - assert(props->max_threads_per_wg && props->max_threads_per_core && - props->max_tasks_per_core && props->num_registers_per_core); + assert( + (props->max_threads_per_wg || props->num_threads_active_granularity) && + props->max_threads_per_core && props->max_tasks_per_core && + props->num_registers_per_core); /* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number * of TLS instance per core is assumed to be the maximum number of threads diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h index 7cc7639c897..bdb19976977 100644 --- a/src/panfrost/lib/pan_desc.h +++ b/src/panfrost/lib/pan_desc.h @@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size) static inline unsigned pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size, - const struct pan_kmod_dev_props *props) + const struct pan_kmod_dev_props *props, + unsigned work_reg_count) { /* Each shader core can run N tasks and a total of M threads at any single * time, thus each task should ideally have no more than M/N threads. 
*/ unsigned max_threads_per_task = props->max_threads_per_core / props->max_tasks_per_core; + ASSERTED unsigned max_threads_per_wg = + pan_compute_max_thread_count(props, work_reg_count); + /* To achieve the best utilization, we should aim for as many workgroups * per tasks as we can fit without exceeding the above thread limit */ unsigned threads_per_wg = shader_local_size->x * shader_local_size->y * shader_local_size->z; - assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg); + assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg); unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg); assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task); @@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size, static inline unsigned pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size, const struct pan_kmod_dev_props *props, - const struct pan_compute_dim *dim) + const struct pan_compute_dim *dim, + unsigned work_reg_count) { /* NOTE: If the instance count is lower than the number of workgroups * being dispatched, the HW will hold back workgroups until instances * can be reused. */ unsigned instances; unsigned wg_per_task = - pan_calc_workgroups_per_task(shader_local_size, props); + pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count); unsigned max_instances_per_core = util_next_power_of_two(wg_per_task * props->max_tasks_per_core); diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c index 056bd48d4a2..b125dad4947 100644 --- a/src/panfrost/lib/pan_props.c +++ b/src/panfrost/lib/pan_props.c @@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props, aligned_reg_count = work_reg_count <= 32 ? 
32 : 64; } + if (pan_arch(props->gpu_id) >= 15) { + assert(props->num_threads_active_granularity); + unsigned max_threads_per_wg = + ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count, + props->num_threads_active_granularity); + return MIN2(max_threads_per_wg, props->max_threads_per_core); + } + + assert(props->max_threads_per_wg); return MIN3(props->max_threads_per_wg, props->max_threads_per_core, props->num_registers_per_core / aligned_reg_count); } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index b1bf45483ee..1fd8e437d49 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)( unsigned core_id_range; pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range); - tlsinfo.wls.instances = pan_calc_wls_instances( - &cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim); + tlsinfo.wls.instances = + pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props, + indirect ? 
NULL : dim, cs->info.work_reg_count); unsigned wls_total_size = pan_calc_total_wls_size( tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range); @@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) unsigned wg_per_task = 0; if (indirect) wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size, - &phys_dev->kmod.dev->props); + &phys_dev->kmod.dev->props, + cs->info.work_reg_count); if (compute_state_dirty(cmdbuf, DESC_STATE) || compute_state_dirty(cmdbuf, CS)) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index bd302847aec..56f6c546217 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -155,7 +155,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, * increment/axis parameters requires knowledge of job dimensions, but * this is somewhat offset by run_compute being a native instruction. */ task_increment = pan_calc_workgroups_per_task( - &shader->cs.local_size, &phys_dev->kmod.dev->props); + &shader->cs.local_size, &phys_dev->kmod.dev->props, + shader->info.work_reg_count); } else { panvk_per_arch(calculate_task_axis_and_increment)( shader, phys_dev, &dim, &task_axis, &task_increment); diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c index 57b48e69f15..fb5782a141f 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c @@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)( unsigned core_id_range; pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range); - batch->tlsinfo.wls.instances = pan_calc_wls_instances( - &cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim); + batch->tlsinfo.wls.instances = + pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props, + indirect ? 
NULL : dim, cs->info.work_reg_count); batch->wls_total_size = pan_calc_total_wls_size( batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range); } diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c index 0db46da50f6..333a5abdaa0 100644 --- a/src/panfrost/vulkan/panvk_vX_physical_device.c +++ b/src/panfrost/vulkan/panvk_vX_physical_device.c @@ -728,8 +728,17 @@ panvk_per_arch(get_physical_device_properties)( const bool has_disk_cache = device->vk.disk_cache != NULL; + /* Calculate the value using register count on v15+. + * TODO: As this requires register allocation changes ensuring we don't + * violate the limits based on the workgroup size, clamp the value to half of + * the max threads value (always safe and matches previous GPUs) for now. */ + unsigned max_threads_per_wg = + (PAN_ARCH >= 15) + ? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32), + device->kmod.dev->props.max_threads_per_core / 2) + : device->kmod.dev->props.max_threads_per_wg; /* Ensure that the max threads count per workgroup is valid for Bifrost */ - assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024); + assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024); float pointSizeRangeMin; float pointSizeRangeMax; @@ -858,11 +867,9 @@ panvk_per_arch(get_physical_device_properties)( /* We could also split into serveral jobs but this has many limitations. * As such we limit to the max threads per workgroup supported by the GPU. */ - .maxComputeWorkGroupInvocations = - device->kmod.dev->props.max_threads_per_wg, - .maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg, - device->kmod.dev->props.max_threads_per_wg, - device->kmod.dev->props.max_threads_per_wg}, + .maxComputeWorkGroupInvocations = max_threads_per_wg, + .maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg, + max_threads_per_wg}, /* 8-bit subpixel precision. 
*/ .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 8, @@ -1053,8 +1060,7 @@ panvk_per_arch(get_physical_device_properties)( .minSubgroupSize = pan_subgroup_size(PAN_ARCH), .maxSubgroupSize = pan_subgroup_size(PAN_ARCH), .maxComputeWorkgroupSubgroups = - device->kmod.dev->props.max_threads_per_wg / - pan_subgroup_size(PAN_ARCH), + max_threads_per_wg / pan_subgroup_size(PAN_ARCH), .requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT, .maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE, .maxPerStageDescriptorInlineUniformBlocks = From b2671ddcee912ba0b074f3e88d59a308a48cef8a Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:06:23 +0100 Subject: [PATCH 31/49] pan/genxml: Add base v15 definition This is currently just a copy of v14 except for "arch" being changed to "15". --- src/panfrost/genxml/gen_macros.h | 3 + src/panfrost/genxml/meson.build | 2 +- src/panfrost/genxml/v15.xml | 2753 ++++++++++++++++++++++++++++++ 3 files changed, 2757 insertions(+), 1 deletion(-) create mode 100644 src/panfrost/genxml/v15.xml diff --git a/src/panfrost/genxml/gen_macros.h b/src/panfrost/genxml/gen_macros.h index c1e8ab1fbae..19305dd0854 100644 --- a/src/panfrost/genxml/gen_macros.h +++ b/src/panfrost/genxml/gen_macros.h @@ -64,6 +64,9 @@ #elif (PAN_ARCH == 14) #define GENX(X) X##_v14 #include "genxml/v14_pack.h" +#elif (PAN_ARCH == 15) +#define GENX(X) X##_v15 +#include "genxml/v15_pack.h" #else #error "Need to add suffixing macro for this architecture" #endif diff --git a/src/panfrost/genxml/meson.build b/src/panfrost/genxml/meson.build index ee4b4adea3f..c23e3c0d17e 100644 --- a/src/panfrost/genxml/meson.build +++ b/src/panfrost/genxml/meson.build @@ -3,7 +3,7 @@ # SPDX-License-Identifier: MIT pan_packers = [] -foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14'] +foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14', 'v15'] pan_packers += custom_target( packer + 
'_pack.h', input : ['gen_pack.py', packer + '.xml'], diff --git a/src/panfrost/genxml/v15.xml b/src/panfrost/genxml/v15.xml new file mode 100644 index 00000000000..2b35043f964 --- /dev/null +++ b/src/panfrost/genxml/v15.xml @@ -0,0 +1,2753 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From fb3dd5f938f67a87c2a325224208521de47a3624 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:11:16 +0100 Subject: [PATCH 32/49] pan/genxml: Build libpanfrost_decode for v15 --- src/panfrost/genxml/decode.h | 7 +++++++ 
src/panfrost/genxml/decode_common.c | 9 +++++++++ src/panfrost/genxml/meson.build | 2 +- src/panfrost/lib/pan_format.h | 3 +++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/panfrost/genxml/decode.h b/src/panfrost/genxml/decode.h index 47fe28f798f..85d43cd9472 100644 --- a/src/panfrost/genxml/decode.h +++ b/src/panfrost/genxml/decode.h @@ -139,6 +139,13 @@ void pandecode_cs_binary_v14(struct pandecode_context *ctx, uint64_t bin, void pandecode_cs_trace_v14(struct pandecode_context *ctx, uint64_t trace, uint32_t trace_size, uint64_t gpu_id); +void pandecode_interpret_cs_v15(struct pandecode_context *ctx, uint64_t queue, + uint32_t size, uint64_t gpu_id, uint32_t *regs); +void pandecode_cs_binary_v15(struct pandecode_context *ctx, uint64_t bin, + uint32_t bin_size); +void pandecode_cs_trace_v15(struct pandecode_context *ctx, uint64_t trace, + uint32_t trace_size, uint64_t gpu_id); + /* Logging infrastructure */ static void pandecode_make_indent(struct pandecode_context *ctx) diff --git a/src/panfrost/genxml/decode_common.c b/src/panfrost/genxml/decode_common.c index 399fec9f335..1c5c3b4a46f 100644 --- a/src/panfrost/genxml/decode_common.c +++ b/src/panfrost/genxml/decode_common.c @@ -426,6 +426,9 @@ pandecode_interpret_cs(struct pandecode_context *ctx, uint64_t queue_gpu_va, case 14: pandecode_interpret_cs_v14(ctx, queue_gpu_va, size, gpu_id, regs); break; + case 15: + pandecode_interpret_cs_v15(ctx, queue_gpu_va, size, gpu_id, regs); + break; default: UNREACHABLE("Unsupported architecture"); } @@ -452,6 +455,9 @@ pandecode_cs_binary(struct pandecode_context *ctx, uint64_t bin_gpu_va, case 14: pandecode_cs_binary_v14(ctx, bin_gpu_va, size); break; + case 15: + pandecode_cs_binary_v15(ctx, bin_gpu_va, size); + break; default: UNREACHABLE("Unsupported architecture"); } @@ -478,6 +484,9 @@ pandecode_cs_trace(struct pandecode_context *ctx, uint64_t trace_gpu_va, case 14: pandecode_cs_trace_v14(ctx, trace_gpu_va, size, gpu_id); break; + case 15: + 
pandecode_cs_trace_v15(ctx, trace_gpu_va, size, gpu_id); + break; default: UNREACHABLE("Unsupported architecture"); } diff --git a/src/panfrost/genxml/meson.build b/src/panfrost/genxml/meson.build index c23e3c0d17e..ab8e0a37d34 100644 --- a/src/panfrost/genxml/meson.build +++ b/src/panfrost/genxml/meson.build @@ -20,7 +20,7 @@ idep_pan_packers = declare_dependency( libpanfrost_decode_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15'] libpanfrost_decode_per_arch += static_library( 'pandecode-arch-v' + ver, ['decode.c', 'decode_jm.c', 'decode_csf.c', pan_packers], diff --git a/src/panfrost/lib/pan_format.h b/src/panfrost/lib/pan_format.h index 770d8a1bf56..b426ee5c866 100644 --- a/src/panfrost/lib/pan_format.h +++ b/src/panfrost/lib/pan_format.h @@ -170,6 +170,8 @@ extern const struct pan_blendable_format pan_blendable_formats_v13[PIPE_FORMAT_COUNT]; extern const struct pan_blendable_format pan_blendable_formats_v14[PIPE_FORMAT_COUNT]; +extern const struct pan_blendable_format + pan_blendable_formats_v15[PIPE_FORMAT_COUNT]; uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats); @@ -203,6 +205,7 @@ extern const struct pan_format pan_pipe_format_v10[PIPE_FORMAT_COUNT]; extern const struct pan_format pan_pipe_format_v12[PIPE_FORMAT_COUNT]; extern const struct pan_format pan_pipe_format_v13[PIPE_FORMAT_COUNT]; extern const struct pan_format pan_pipe_format_v14[PIPE_FORMAT_COUNT]; +extern const struct pan_format pan_pipe_format_v15[PIPE_FORMAT_COUNT]; static inline const struct pan_format * pan_format_table(unsigned arch) From fb50cac9c62176523740f23991465c6c380045ed Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:14:01 +0100 Subject: [PATCH 33/49] pan/lib: Build for v15 --- src/panfrost/lib/meson.build | 4 ++-- src/panfrost/lib/pan_format.h | 2 ++ src/panfrost/lib/pan_mod.h | 3 +++ 3 files changed, 7 insertions(+), 2 
deletions(-) diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build index a4572db619c..30498f42dca 100644 --- a/src/panfrost/lib/meson.build +++ b/src/panfrost/lib/meson.build @@ -4,7 +4,7 @@ subdir('kmod') -pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14'] +pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14', '15'] libpanfrost_pixel_format = [] deps_for_libpanfrost = [dep_libdrm, idep_pan_packers, idep_mesautil, libpanfrost_model_dep] @@ -22,7 +22,7 @@ endforeach libpanfrost_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15'] libpanfrost_per_arch += static_library( 'pan-arch-v' + ver, [ diff --git a/src/panfrost/lib/pan_format.h b/src/panfrost/lib/pan_format.h index b426ee5c866..4ac2f1e78f6 100644 --- a/src/panfrost/lib/pan_format.h +++ b/src/panfrost/lib/pan_format.h @@ -189,6 +189,7 @@ pan_blendable_format_table(unsigned arch) FMT_TABLE(12); FMT_TABLE(13); FMT_TABLE(14); + FMT_TABLE(15); #undef FMT_TABLE default: assert(!"Unsupported architecture"); @@ -221,6 +222,7 @@ pan_format_table(unsigned arch) FMT_TABLE(12); FMT_TABLE(13); FMT_TABLE(14); + FMT_TABLE(15); #undef FMT_TABLE default: assert(!"Unsupported architecture"); diff --git a/src/panfrost/lib/pan_mod.h b/src/panfrost/lib/pan_mod.h index 1bd9a759a44..e5b22c7741a 100644 --- a/src/panfrost/lib/pan_mod.h +++ b/src/panfrost/lib/pan_mod.h @@ -85,6 +85,7 @@ const struct pan_mod_handler *pan_mod_get_handler_v10(uint64_t modifier); const struct pan_mod_handler *pan_mod_get_handler_v12(uint64_t modifier); const struct pan_mod_handler *pan_mod_get_handler_v13(uint64_t modifier); const struct pan_mod_handler *pan_mod_get_handler_v14(uint64_t modifier); +const struct pan_mod_handler *pan_mod_get_handler_v15(uint64_t modifier); static inline const struct pan_mod_handler * pan_mod_get_handler(unsigned arch, uint64_t modifier) @@ -108,6 +109,8 @@ 
pan_mod_get_handler(unsigned arch, uint64_t modifier) return pan_mod_get_handler_v13(modifier); case 14: return pan_mod_get_handler_v14(modifier); + case 15: + return pan_mod_get_handler_v15(modifier); default: UNREACHABLE("Unsupported arch"); } From a986de1f53b487006248549bf053971746fd4dc4 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:17:32 +0100 Subject: [PATCH 34/49] pan/clc: Build for v15 --- src/panfrost/clc/pan_compile.c | 2 +- src/panfrost/libpan/libpan.h | 2 ++ src/panfrost/libpan/libpan_shaders.h | 2 ++ src/panfrost/libpan/meson.build | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/panfrost/clc/pan_compile.c b/src/panfrost/clc/pan_compile.c index 593bfe2de07..15bad8f5c94 100644 --- a/src/panfrost/clc/pan_compile.c +++ b/src/panfrost/clc/pan_compile.c @@ -276,7 +276,7 @@ main(int argc, const char **argv) unsigned target_arch = atoi(target_arch_str); - if (target_arch < 4 || target_arch > 14) { + if (target_arch < 4 || target_arch > 15) { fprintf(stderr, "Unsupported target arch %d\n", target_arch); return 1; } diff --git a/src/panfrost/libpan/libpan.h b/src/panfrost/libpan/libpan.h index cc79ea92b74..56326ee9967 100644 --- a/src/panfrost/libpan/libpan.h +++ b/src/panfrost/libpan/libpan.h @@ -30,6 +30,8 @@ #include "libpan_v13.h" #elif (PAN_ARCH == 14) #include "libpan_v14.h" +#elif (PAN_ARCH == 15) +#include "libpan_v15.h" #else #error "Unsupported architecture for libpan" #endif diff --git a/src/panfrost/libpan/libpan_shaders.h b/src/panfrost/libpan/libpan_shaders.h index d51761abf64..a0901869b34 100644 --- a/src/panfrost/libpan/libpan_shaders.h +++ b/src/panfrost/libpan/libpan_shaders.h @@ -28,6 +28,8 @@ #include "libpan_shaders_v13.h" #elif (PAN_ARCH == 14) #include "libpan_shaders_v14.h" +#elif (PAN_ARCH == 15) +#include "libpan_shaders_v15.h" #else #error "Unsupported architecture for libpan" #endif diff --git a/src/panfrost/libpan/meson.build b/src/panfrost/libpan/meson.build index 
dfe40fff9c1..83ff6bad122 100644 --- a/src/panfrost/libpan/meson.build +++ b/src/panfrost/libpan/meson.build @@ -11,7 +11,7 @@ libpan_shader_files = files( idep_libpan_per_arch = {} -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15'] libpan_spv = custom_target( input : libpan_shader_files, output : 'libpan_v' + ver + '.spv', From bd52eb4a3aa9bae446e376630372f1c54c881e34 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:28:14 +0100 Subject: [PATCH 35/49] panvk: Add v15 support --- src/panfrost/vulkan/meson.build | 8 ++++---- src/panfrost/vulkan/panvk_macros.h | 8 ++++++++ src/panfrost/vulkan/panvk_physical_device.c | 2 ++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build index ce06192d50a..8cbac6d9d75 100644 --- a/src/panfrost/vulkan/meson.build +++ b/src/panfrost/vulkan/meson.build @@ -14,7 +14,7 @@ panvk_entrypoints = custom_target( '--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7', '--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10', '--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13', - '--device-prefix', 'panvk_v14', + '--device-prefix', 'panvk_v14', '--device-prefix', 'panvk_v15', '--beta', with_vulkan_beta.to_string() ], depend_files : vk_entrypoints_gen_depend_files, @@ -66,7 +66,7 @@ valhall_archs = [9, 10] valhall_inc_dir = ['valhall'] valhall_files = [] -fifthgen_archs = [12, 13, 14] +fifthgen_archs = [12, 13, 14, 15] fifthgen_inc_dir = ['fifthgen'] fifthgen_files = [] @@ -84,7 +84,7 @@ jm_files = [ 'jm/panvk_vX_gpu_queue.c', ] -csf_archs = [10, 12, 13, 14] +csf_archs = [10, 12, 13, 14, 15] csf_inc_dir = ['csf'] csf_files = [ 'csf/panvk_vX_bind_queue.c', @@ -127,7 +127,7 @@ common_per_arch_files = [ sha1_h, ] -foreach arch : [6, 7, 10, 12, 13, 14] +foreach arch : [6, 7, 10, 12, 13, 14, 15] per_arch_files = common_per_arch_files 
inc_panvk_per_arch = [] diff --git a/src/panfrost/vulkan/panvk_macros.h b/src/panfrost/vulkan/panvk_macros.h index 09253ffdb93..0b5a2d347cb 100644 --- a/src/panfrost/vulkan/panvk_macros.h +++ b/src/panfrost/vulkan/panvk_macros.h @@ -64,6 +64,9 @@ panvk_catch_indirect_alloc_failure(VkResult error) case 14: \ panvk_arch_name(name, v14)(__VA_ARGS__); \ break; \ + case 15: \ + panvk_arch_name(name, v15)(__VA_ARGS__); \ + break; \ default: \ UNREACHABLE("Unsupported architecture"); \ } \ @@ -90,6 +93,9 @@ panvk_catch_indirect_alloc_failure(VkResult error) case 14: \ ret = panvk_arch_name(name, v14)(__VA_ARGS__); \ break; \ + case 15: \ + ret = panvk_arch_name(name, v15)(__VA_ARGS__); \ + break; \ default: \ UNREACHABLE("Unsupported architecture"); \ } \ @@ -110,6 +116,8 @@ panvk_catch_indirect_alloc_failure(VkResult error) #define panvk_per_arch(name) panvk_arch_name(name, v13) #elif PAN_ARCH == 14 #define panvk_per_arch(name) panvk_arch_name(name, v14) +#elif PAN_ARCH == 15 +#define panvk_per_arch(name) panvk_arch_name(name, v15) #else #error "Unsupported arch" #endif diff --git a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c index bb18df6b49a..d9ac53e6afa 100644 --- a/src/panfrost/vulkan/panvk_physical_device.c +++ b/src/panfrost/vulkan/panvk_physical_device.c @@ -65,6 +65,7 @@ PER_ARCH_FUNCS(10); PER_ARCH_FUNCS(12); PER_ARCH_FUNCS(13); PER_ARCH_FUNCS(14); +PER_ARCH_FUNCS(15); static VkResult create_kmod_dev(struct panvk_physical_device *device, @@ -413,6 +414,7 @@ panvk_physical_device_init(struct panvk_physical_device *device, case 6: case 7: case 14: + case 15: if (!os_get_option("PAN_I_WANT_A_BROKEN_VULKAN_DRIVER")) { result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, "WARNING: panvk is not well-tested on v%d, " From 4789fa6b7081cad8e9bc5d7ab8bd51cb5f6d9af2 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:29:04 +0100 Subject: [PATCH 36/49] pan: Add v15 support --- 
src/panfrost/compiler/pan_compiler.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/panfrost/compiler/pan_compiler.c b/src/panfrost/compiler/pan_compiler.c index d1a4dc08eed..3fd702227c8 100644 --- a/src/panfrost/compiler/pan_compiler.c +++ b/src/panfrost/compiler/pan_compiler.c @@ -53,8 +53,9 @@ pan_get_nir_shader_compiler_options(unsigned arch, bool merge_wg) case 12: case 13: case 14: - return merge_wg ? &bifrost_nir_options_v11_merge_wg : - &bifrost_nir_options_v11; + case 15: + return merge_wg ? &bifrost_nir_options_v11_merge_wg + : &bifrost_nir_options_v11; default: assert(!"Unsupported arch"); return NULL; From 82697cc24531d154b4ffb06953569a5cf4d89c4b Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:31:04 +0100 Subject: [PATCH 37/49] panfrost: Add v15 support to the Gallium driver --- src/gallium/drivers/panfrost/meson.build | 4 ++-- src/gallium/drivers/panfrost/pan_cmdstream.c | 2 +- src/gallium/drivers/panfrost/pan_screen.c | 3 +++ src/gallium/drivers/panfrost/pan_screen.h | 1 + 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index ba243f5a4ed..4f69564da47 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -41,7 +41,7 @@ compile_args_panfrost = [ '-Wno-pointer-arith' ] -panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14'] +panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15'] libpanfrost_versions = [] foreach ver : panfrost_versions @@ -54,7 +54,7 @@ foreach ver : panfrost_versions ] if ver in ['4', '5', '6', '7', '9'] files_panfrost_vx += ['pan_jm.c'] - elif ver in ['10', '12', '13', '14'] + elif ver in ['10', '12', '13', '14', '15'] files_panfrost_vx += ['pan_csf.c'] endif libpanfrost_versions += static_library( diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c 
b/src/gallium/drivers/panfrost/pan_cmdstream.c index 5294b5831da..7fad87f7e6d 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -49,7 +49,7 @@ * functions. */ #if PAN_ARCH <= 9 #define JOBX(__suffix) GENX(jm_##__suffix) -#elif PAN_ARCH <= 14 +#elif PAN_ARCH <= 15 #define JOBX(__suffix) GENX(csf_##__suffix) #else #error "Unsupported arch" diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index ede056ba82f..d1e82a8be61 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -1178,6 +1178,9 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config, case 14: panfrost_cmdstream_screen_init_v14(screen); break; + case 15: + panfrost_cmdstream_screen_init_v15(screen); + break; default: debug_printf("panfrost: Unhandled architecture major %d", dev->arch); panfrost_destroy_screen(&(screen->base)); diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index 9e6b95d008d..6e28ce4d8e3 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -156,6 +156,7 @@ void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v13(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v14(struct panfrost_screen *screen); +void panfrost_cmdstream_screen_init_v15(struct panfrost_screen *screen); #define perf_debug(ctx, ...) 
\ do { \ From b7afb629c3bea6e98708cb86bc2ecbad9d373203 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 16 Mar 2026 12:14:45 +0100 Subject: [PATCH 38/49] pan/genxml: Add support for multiple modifiers This allows specifying multiple modifiers for fields in the xml which will be applied in the written order when packing and inverse-applies in the reverse order when unpacking. --- src/panfrost/genxml/gen_pack.py | 111 +++++++++++++++++++------------- 1 file changed, 65 insertions(+), 46 deletions(-) diff --git a/src/panfrost/genxml/gen_pack.py b/src/panfrost/genxml/gen_pack.py index 595a04c0649..ac8e87e59ab 100644 --- a/src/panfrost/genxml/gen_pack.py +++ b/src/panfrost/genxml/gen_pack.py @@ -83,23 +83,34 @@ def parse_modifier(modifier): if modifier is None: return None - for mod in MODIFIERS: - if modifier[0:len(mod)] == mod: - if mod == "log2": - assert(len(mod) == len(modifier)) - return [mod] + ret = [] + split_modifiers = modifier.split() - if modifier[len(mod)] == '(' and modifier[-1] == ')': - ret = [mod, int(modifier[(len(mod) + 1):-1])] - if ret[0] == 'align': - align = ret[1] - # Make sure the alignment is a power of 2 - assert(align > 0 and not(align & (align - 1))); + for mod in split_modifiers: + valid = False + for valid_mod in MODIFIERS: + if mod[0:len(valid_mod)] == valid_mod: + if valid_mod == "log2": + assert(len(valid_mod) == len(modifier)) + # Add a number to simplify parsing + ret.extend([valid_mod, 0]) + valid = True + break - return ret + if mod[len(valid_mod)] == '(' and mod[-1] == ')': + mod_arg = [valid_mod, int(mod[(len(valid_mod) + 1):-1])] + if mod_arg[0] == 'align': + align = mod_arg[1] + # Make sure the alignment is a power of 2 + assert(align > 0 and not(align & (align - 1))); - print("Invalid modifier") - assert(False) + ret.extend(mod_arg) + valid = True + break + + assert valid, f"Invalid modifier: {modifier}" + + return ret class Aggregate(object): def __init__(self, parser, name, attrs): @@ -169,7 +180,7 @@ 
class Field(object): if self.type in self.parser.enums and self.default is not None: self.default = safe_name('{}_{}_{}'.format(global_prefix, self.type, self.default)).upper() - self.modifier = parse_modifier(attrs.get("modifier")) + self.modifier = parse_modifier(attrs.get("modifier")) def emit_template_struct(self, dim): if self.type == 'address': @@ -291,14 +302,22 @@ class Group(object): if field.modifier is None: continue - if field.modifier[0] == "shr": - shift = field.modifier[1] - mask = hex((1 << shift) - 1) - print(" assert(((__unpacked)->{} & {}) == 0); \\".format(field.name, mask)) - elif field.modifier[0] == "minus": - print(" assert((__unpacked)->{} >= {}); \\".format(field.name, field.modifier[1])) - elif field.modifier[0] == "log2": - print(" assert(IS_POT_NONZERO((__unpacked)->{})); \\".format(field.name)) + value = "(__unpacked)->{}".format(field.name) + for mod, mod_val in zip (field.modifier[::2], field.modifier[1::2]): + if mod == "shr": + mask = hex((1 << mod_val) - 1) + print(" assert(({} & {}) == 0); \\".format(value, mask)) + value = "({} >> {})".format(value, mod_val) + elif mod == "minus": + print(" assert({} >= {}); \\".format(value, mod_val)) + value = "({} - {})".format(value, mod_val) + elif mod == "align": + mask = hex(mod_val - 1) + print(' assert(!({} & {})); \\'.format(value, mask)) + value = "(ALIGN_POT({}, {}))".format(value, mod_val) + elif mod == "log2": + print(" assert(IS_POT_NONZERO({})); \\".format(value)) + value = "(util_logbase2({}))".format(value) for index in range(self.length // 4): # Handle MBZ words @@ -324,14 +343,15 @@ class Group(object): value = "(__unpacked)->{}".format(contributor.path) if field.modifier is not None: - if field.modifier[0] == "shr": - value = "{} >> {}".format(value, field.modifier[1]) - elif field.modifier[0] == "minus": - value = "{} - {}".format(value, field.modifier[1]) - elif field.modifier[0] == "align": - value = "ALIGN_POT({}, {})".format(value, field.modifier[1]) - elif 
field.modifier[0] == "log2": - value = "util_logbase2({})".format(value) + for mod, mod_val in zip(field.modifier[::2], field.modifier[1::2]): + if mod == "shr": + value = "({} >> {})".format(value, mod_val) + elif mod == "minus": + value = "({} - {})".format(value, mod_val) + elif mod == "align": + value = "(ALIGN_POT({}, {}))".format(value, mod_val) + elif mod == "log2": + value = "(util_logbase2({}))".format(value) if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format", "Component Swizzle"]: s = "util_bitpack_uint(%s, %d, %d)" % \ @@ -435,25 +455,24 @@ class Group(object): else: s = "/* unhandled field %s, type %s */\n" % (field.name, field.type) - suffix = "" - prefix = "" - if field.modifier: - if field.modifier[0] == "minus": - suffix = " + {}".format(field.modifier[1]) - elif field.modifier[0] == "shr": - suffix = " << {}".format(field.modifier[1]) - if field.modifier[0] == "log2": - prefix = "1U << " print(' {}({}); \\'.format(convert, ', '.join(args))) - if len(prefix) != 0 or len(suffix) != 0: - print(' (__unpacked)->{} = {}(__unpacked)->{}{}; \\'.format(fieldref.path, prefix, fieldref.path, suffix)) + value = "(__unpacked)->{}".format(fieldref.path) + if field.modifier is not None: + # Need to reverse ([::-1]) modifier order when unpacking + for mod, mod_val in list(zip(field.modifier[::2], field.modifier[1::2]))[::-1]: + if mod == "shr": + value = "({} << {})".format(value, mod_val) + elif mod == "minus": + value = "({} + {})".format(value, mod_val) + elif mod == "align": + mask = hex(mod_val - 1) + print(' assert(!({} & {})); \\'.format(value, mask)) + elif mod == "log2": + value = "(1U << {})".format(value) - - if field.modifier and field.modifier[0] == "align": - mask = hex(field.modifier[1] - 1) - print(' assert(!((__unpacked)->{} & {})); \\'.format(fieldref.path, mask)) + print(' (__unpacked)->{} = {}; \\'.format(fieldref.path, value)) def emit_print_function(self): for field in self.fields: From 
ad81596b6d87eea4a27842b4632f30a9ee5846b5 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 25 Feb 2026 14:44:18 +0100 Subject: [PATCH 39/49] pan/va: Implement v15 register count changes With v15, we get support for 128 registers in any multiple of 16 (vs previously having the choice between 32 or 64 register mode). To support this, shader register count is passed in a different way from v15, requiring some updates to how we encode the ShaderProgramDescriptor and the ShaderProgramPointer. Note that this currently does not change the compiler behavior of running in either 32 or 64 register mode, just how this is passed to the GPU. --- src/gallium/drivers/panfrost/pan_cmdstream.c | 9 +++++- src/gallium/drivers/panfrost/pan_fb_preload.c | 4 +++ src/gallium/drivers/panfrost/pan_precomp.c | 14 +++++++++ src/panfrost/genxml/decode_csf.c | 30 ++++++++++++++++--- src/panfrost/genxml/v15.xml | 8 ++++- .../vulkan/csf/panvk_vX_cmd_dispatch.c | 13 +++++++- .../vulkan/csf/panvk_vX_cmd_precomp.c | 10 +++++++ .../vulkan/panvk_vX_cmd_frame_shaders.c | 4 +++ src/panfrost/vulkan/panvk_vX_shader.c | 12 ++++++++ 9 files changed, 97 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 7fad87f7e6d..15680c000bc 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -4456,9 +4456,12 @@ prepare_shader(struct panfrost_compiled_shader *state, else if (vs) cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF; #endif - +#if PAN_ARCH >= 15 + cfg.register_count = state->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); +#endif cfg.binary = state->bin.gpu; cfg.preload.r48_r63 = (state->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); @@ -4476,8 +4479,12 @@ prepare_shader(struct panfrost_compiled_shader *state, #if PAN_ARCH < 12 cfg.vertex_warp_limit = 
MALI_WARP_LIMIT_HALF; #endif +#if PAN_ARCH >= 15 + cfg.register_count = state->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); +#endif cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; cfg.preload.r48_r63 = (state->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); diff --git a/src/gallium/drivers/panfrost/pan_fb_preload.c b/src/gallium/drivers/panfrost/pan_fb_preload.c index 172398b6ec8..be4c2c9965e 100644 --- a/src/gallium/drivers/panfrost/pan_fb_preload.c +++ b/src/gallium/drivers/panfrost/pan_fb_preload.c @@ -1105,7 +1105,11 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool, pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) { cfg.stage = MALI_SHADER_STAGE_FRAGMENT; cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; +#if PAN_ARCH >= 15 + cfg.register_count = preload_shader->info.work_reg_count; +#else cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; +#endif cfg.binary = preload_shader->address; cfg.preload.r48_r63 = preload_shader->info.preload >> 48; } diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index da2d3f51f7d..9ca77b32dd7 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -98,8 +98,12 @@ panfrost_precomp_shader_create( pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&res->info); +#if PAN_ARCH >= 15 + cfg.register_count = res->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(res->info.work_reg_count); +#endif cfg.binary = res->code_ptr; cfg.preload.r48_r63 = (res->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info); @@ -326,7 +330,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch, uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56); cs_move64_to(b, cs_sr_reg64(b, COMPUTE, 
FAU_0), fau_ptr); +#if PAN_ARCH >= 15 + struct mali_shader_program_pointer_packed spp; + pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) { + ctx.register_count = shader->info.work_reg_count; + ctx.pointer = shader->state_ptr; + } + uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0]; + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), ptr); +#else cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), shader->state_ptr); +#endif cs_move64_to(b, cs_sr_reg64(b, COMPUTE, TSD_0), tsd); /* Global attribute offset */ diff --git a/src/panfrost/genxml/decode_csf.c b/src/panfrost/genxml/decode_csf.c index 10f062cebda..7c43991f64d 100644 --- a/src/panfrost/genxml/decode_csf.c +++ b/src/panfrost/genxml/decode_csf.c @@ -651,8 +651,19 @@ pandecode_run_compute(struct pandecode_context *ctx, FILE *fp, if (fau) GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU"); - GENX(pandecode_shader) - (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id); + uint64_t addr = cs_get_u64(qctx, reg_spd); +#if PAN_ARCH >= 15 + const struct mali_shader_program_pointer_packed spp_packed = { + .opaque[0] = addr & 0xFFFFFFFF, + .opaque[1] = (addr >> 32) & 0xFFFFFFFF, + }; + pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp) + ; + DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp, + "Shader Program Pointer (%" PRIx64 "):\n", addr); + addr = spp.pointer; +#endif + GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id); DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd), "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd)); @@ -693,8 +704,19 @@ pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp, if (fau) GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU"); - GENX(pandecode_shader) - (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id); + uint64_t addr = cs_get_u64(qctx, reg_spd); +#if PAN_ARCH >= 15 + const struct mali_shader_program_pointer_packed spp_packed = { + .opaque[0] = addr & 0xFFFFFFFF, + .opaque[1] = (addr >> 32) & 
0xFFFFFFFF, + }; + pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp) + ; + DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp, + "Shader Program Pointer (%" PRIx64 "):\n", addr); + addr = spp.pointer; +#endif + GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id); DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd), "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd)); diff --git a/src/panfrost/genxml/v15.xml b/src/panfrost/genxml/v15.xml index 2b35043f964..983834f16e4 100644 --- a/src/panfrost/genxml/v15.xml +++ b/src/panfrost/genxml/v15.xml @@ -2040,14 +2040,20 @@ + - + + + + + + diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index 1fd8e437d49..8de3de939b0 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -209,9 +209,20 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_FAU), fau_ptr); } - if (compute_state_dirty(cmdbuf, CS)) + if (compute_state_dirty(cmdbuf, CS)) { +#if PAN_ARCH >= 15 + struct mali_shader_program_pointer_packed spp; + pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) { + ctx.register_count = cs->info.work_reg_count; + ctx.pointer = panvk_priv_mem_dev_addr(cs->spd); + } + uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0]; + cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), ptr); +#else cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), panvk_priv_mem_dev_addr(cs->spd)); +#endif + } cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_TSD), tsd); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index 56f6c546217..386f2b317a5 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -82,8 +82,18 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56); 
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_FAU), fau_ptr); +#if PAN_ARCH >= 15 + struct mali_shader_program_pointer_packed spp; + pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) { + ctx.register_count = shader->info.work_reg_count; + ctx.pointer = panvk_priv_mem_dev_addr(shader->spd); + } + uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0]; + cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), ptr); +#else cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), panvk_priv_mem_dev_addr(shader->spd)); +#endif cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_TSD), tsd); diff --git a/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c b/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c index 3bd5eda41f0..afa2692bde0 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c @@ -239,8 +239,12 @@ get_frame_shader(struct panvk_device *dev, panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) { cfg.stage = MALI_SHADER_STAGE_FRAGMENT; cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem); cfg.preload.r48_r63 = shader->info.preload >> 48; } diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 03da805a49c..927a72cd7c9 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -1180,8 +1180,12 @@ panvk_shader_upload(struct panvk_device *dev, cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF; #endif +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_shader_variant_get_dev_addr(shader); cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); 
@@ -1199,8 +1203,12 @@ panvk_shader_upload(struct panvk_device *dev, panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&shader->info); +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_shader_variant_get_dev_addr(shader); cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); @@ -1214,8 +1222,12 @@ panvk_shader_upload(struct panvk_device *dev, panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&shader->info); +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_shader_variant_get_dev_addr(shader) + shader->info.vs.no_psiz_offset; cfg.preload.r48_r63 = (shader->info.preload >> 48); From fae53403deb5d950862cf7c7eeb70189c455c90f Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Fri, 13 Mar 2026 14:36:27 +0100 Subject: [PATCH 40/49] pan/va: Implement v15 preload registers With the change in register count, preloads move from r48-r63 to r0-r15. Update the preload logic to reflect this. 
--- src/gallium/drivers/panfrost/pan_cmdstream.c | 6 +- src/gallium/drivers/panfrost/pan_fb_preload.c | 3 +- src/gallium/drivers/panfrost/pan_precomp.c | 3 +- src/panfrost/compiler/bifrost/bi_ra.c | 12 ++-- src/panfrost/compiler/bifrost/compiler.h | 62 +++++++++---------- src/panfrost/genxml/v15.xml | 20 +++--- .../vulkan/panvk_vX_cmd_frame_shaders.c | 3 +- src/panfrost/vulkan/panvk_vX_shader.c | 9 ++- 8 files changed, 64 insertions(+), 54 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 15680c000bc..efd9d22e338 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -4458,12 +4458,13 @@ prepare_shader(struct panfrost_compiled_shader *state, #endif #if PAN_ARCH >= 15 cfg.register_count = state->info.work_reg_count; + cfg.preload.r0_r15 = state->info.preload; #else cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.preload.r48_r63 = (state->info.preload >> 48); #endif cfg.binary = state->bin.gpu; - cfg.preload.r48_r63 = (state->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT) @@ -4481,12 +4482,13 @@ prepare_shader(struct panfrost_compiled_shader *state, #endif #if PAN_ARCH >= 15 cfg.register_count = state->info.work_reg_count; + cfg.preload.r0_r15 = state->info.preload; #else cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); + cfg.preload.r48_r63 = (state->info.preload >> 48); #endif cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; - cfg.preload.r48_r63 = (state->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); } diff --git a/src/gallium/drivers/panfrost/pan_fb_preload.c b/src/gallium/drivers/panfrost/pan_fb_preload.c index be4c2c9965e..6f1bba8ab66 100644 --- a/src/gallium/drivers/panfrost/pan_fb_preload.c +++ b/src/gallium/drivers/panfrost/pan_fb_preload.c @@ 
-1107,11 +1107,12 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool, cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; #if PAN_ARCH >= 15 cfg.register_count = preload_shader->info.work_reg_count; + cfg.preload.r0_r15 = preload_shader->info.preload; #else cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.preload.r48_r63 = preload_shader->info.preload >> 48; #endif cfg.binary = preload_shader->address; - cfg.preload.r48_r63 = preload_shader->info.preload >> 48; } unsigned bd_count = views.rt_count; diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index 9ca77b32dd7..bc4640d7e2a 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -100,12 +100,13 @@ panfrost_precomp_shader_create( cfg.stage = pan_shader_stage(&res->info); #if PAN_ARCH >= 15 cfg.register_count = res->info.work_reg_count; + cfg.preload.r0_r15 = res->info.preload; #else cfg.register_allocation = pan_register_allocation(res->info.work_reg_count); + cfg.preload.r48_r63 = (res->info.preload >> 48); #endif cfg.binary = res->code_ptr; - cfg.preload.r48_r63 = (res->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info); } diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c index f8579f8c983..7f058bf0d3f 100644 --- a/src/panfrost/compiler/bifrost/bi_ra.c +++ b/src/panfrost/compiler/bifrost/bi_ra.c @@ -294,7 +294,8 @@ bi_compute_liveness_ra(bi_context *ctx) #define EVEN_BITS_MASK (0x5555555555555555ull) static uint64_t -bi_make_affinity(uint64_t clobber, unsigned count, bool split_file) +bi_make_affinity(uint64_t clobber, unsigned count, bool split_file, + unsigned arch) { uint64_t clobbered = 0; @@ -308,12 +309,12 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file) clobbered |= mask << (64 - excess); if (split_file) - clobbered |= mask << (16 - 
excess); + clobbered |= mask << (((arch >= 15) ? 32 : 16) - excess); } /* Don't allocate the middle if we split out the middle */ if (split_file) - clobbered |= BITFIELD64_MASK(32) << 16; + clobbered |= BITFIELD64_MASK(32) << ((arch >= 15) ? 32 : 16); /* We can use a register iff it's not clobberred */ return ~clobbered; @@ -341,7 +342,7 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, unsigned count = bi_count_write_registers(ins, d); unsigned offset = ins->dest[d].offset; uint64_t affinity = - bi_make_affinity(preload_live, count, split_file) >> offset; + bi_make_affinity(preload_live, count, split_file, arch) >> offset; /* Valhall needs >= 64-bit staging writes to be pair-aligned */ if (aligned_sr && (count >= 2 || offset)) affinity &= EVEN_BITS_MASK; @@ -435,7 +436,8 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) uint64_t default_affinity = ctx->inputs->is_blend ? BITFIELD64_MASK(16) : full_regs ? BITFIELD64_MASK(64) - : (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); + : (ctx->arch >= 15) ? BITFIELD64_MASK(32) + : (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); /* To test spilling, mimic a small register file */ if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend && (bifrost_debug & BIFROST_DBG_NOSSARA)) diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index 39b4f8d09a0..20b553f43da 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1162,25 +1162,25 @@ bi_preload_reg(enum bi_preload val, unsigned arch) /* Compute */ case BI_PRELOAD_LOCAL_ID_0: /* Bits [15;0] */ - return 55; + return (arch >= 15) ? 4 : 55; case BI_PRELOAD_LOCAL_ID_1: /* Bits [31;16] */ - return 55; + return (arch >= 15) ? 4 : 55; case BI_PRELOAD_LOCAL_ID_2: /* Bits [15;0] */ - return 56; + return (arch >= 15) ? 3 : 56; case BI_PRELOAD_WORKGROUP_ID_0: - return 57; + return (arch >= 15) ? 
5 : 57; case BI_PRELOAD_WORKGROUP_ID_1: - return 58; + return (arch >= 15) ? 6 : 58; case BI_PRELOAD_WORKGROUP_ID_2: - return 59; + return (arch >= 15) ? 7 : 59; case BI_PRELOAD_GLOBAL_ID_0: - return 60; + return (arch >= 15) ? 0 : 60; case BI_PRELOAD_GLOBAL_ID_1: - return 61; + return (arch >= 15) ? 1 : 61; case BI_PRELOAD_GLOBAL_ID_2: - return 62; + return (arch >= 15) ? 2 : 62; /* Vertex */ case BI_PRELOAD_POS_RESULT_PTR_LO: assert(arch < 9); @@ -1190,58 +1190,58 @@ bi_preload_reg(enum bi_preload val, unsigned arch) return 59; case BI_PRELOAD_INTERNAL_ID: assert(arch >= 9); - return 59; + return (arch >= 15) ? 2 : 59; case BI_PRELOAD_VERTEX_ID: - return (arch >= 9) ? 60 : 61; + return (arch >= 15) ? 0 : (arch >= 9) ? 60 : 61; case BI_PRELOAD_INSTANCE_ID: - return (arch >= 9) ? 61 : 62; + return (arch >= 15) ? 1 : (arch >= 9) ? 61 : 62; case BI_PRELOAD_DRAW_ID: assert(arch >= 9); - return 62; + return (arch >= 15) ? 3 : 62; case BI_PRELOAD_VIEW_ID: assert(arch >= 9); - return 63; + return (arch >= 15) ? 4 : 63; /* Fragment */ case BI_PRELOAD_PRIMITIVE_ID: - return 57; + return (arch >= 15) ? 6 : 57; case BI_PRELOAD_PRIMITIVE_FLAGS: - return 58; + return (arch >= 15) ? 3 : 58; case BI_PRELOAD_POSITION_XY: - return 59; + return (arch >= 15) ? 2 : 59; case BI_PRELOAD_CUMULATIVE_COVERAGE: /* Bits [15;0] */ - return 60; + return (arch >= 15) ? 0 : 60; case BI_PRELOAD_RASTERIZER_COVERAGE: /* Bits [15;0] */ - return 61; + return (arch >= 15) ? 1 : 61; case BI_PRELOAD_SAMPLE_ID: /* Bits [23;16] */ - return 61; + return (arch >= 15) ? 0 : 61; case BI_PRELOAD_CENTROID_ID: /* Bits [31;24] */ - return 61; + return (arch >= 15) ? 0 : 61; case BI_PRELOAD_FRAME_ARG: /* Double reg */ - return 62; + return (arch >= 15) ? 4 : 62; /* Blend */ case BI_PRELOAD_BLEND_SRC0_C0: - return 0; + return (arch >= 15) ? 8 : 0; case BI_PRELOAD_BLEND_SRC0_C1: - return 1; + return (arch >= 15) ? 9 : 1; case BI_PRELOAD_BLEND_SRC0_C2: - return 2; + return (arch >= 15) ? 
10 : 2; case BI_PRELOAD_BLEND_SRC0_C3: - return 3; + return (arch >= 15) ? 11 : 3; case BI_PRELOAD_BLEND_SRC1_C0: - return 4; + return (arch >= 15) ? 12 : 4; case BI_PRELOAD_BLEND_SRC1_C1: - return 5; + return (arch >= 15) ? 13 : 5; case BI_PRELOAD_BLEND_SRC1_C2: - return 6; + return (arch >= 15) ? 14 : 6; case BI_PRELOAD_BLEND_SRC1_C3: - return 7; + return (arch >= 15) ? 15 : 7; case BI_PRELOAD_BLEND_LINK: - return 48; + return (arch >= 15) ? 7 : 48; } UNREACHABLE("Non-handled BI_PRELOAD"); } diff --git a/src/panfrost/genxml/v15.xml b/src/panfrost/genxml/v15.xml index 983834f16e4..f1a2bc7e2c9 100644 --- a/src/panfrost/genxml/v15.xml +++ b/src/panfrost/genxml/v15.xml @@ -2016,16 +2016,16 @@ - - - - - - - - - - + + + + + + + + + + diff --git a/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c b/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c index afa2692bde0..d687fa1d6cf 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c @@ -241,12 +241,13 @@ get_frame_shader(struct panvk_device *dev, cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; #if PAN_ARCH >= 15 cfg.register_count = shader->info.work_reg_count; + cfg.preload.r0_r15 = shader->info.preload; #else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); + cfg.preload.r48_r63 = shader->info.preload >> 48; #endif cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem); - cfg.preload.r48_r63 = shader->info.preload >> 48; } #endif diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 927a72cd7c9..639288e6e01 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -1182,12 +1182,13 @@ panvk_shader_upload(struct panvk_device *dev, #if PAN_ARCH >= 15 cfg.register_count = shader->info.work_reg_count; + cfg.preload.r0_r15 = shader->info.preload; #else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); + 
cfg.preload.r48_r63 = (shader->info.preload >> 48); #endif cfg.binary = panvk_shader_variant_get_dev_addr(shader); - cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT) @@ -1205,12 +1206,13 @@ panvk_shader_upload(struct panvk_device *dev, cfg.stage = pan_shader_stage(&shader->info); #if PAN_ARCH >= 15 cfg.register_count = shader->info.work_reg_count; + cfg.preload.r0_r15 = shader->info.preload; #else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); + cfg.preload.r48_r63 = (shader->info.preload >> 48); #endif cfg.binary = panvk_shader_variant_get_dev_addr(shader); - cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); } @@ -1224,13 +1226,14 @@ panvk_shader_upload(struct panvk_device *dev, cfg.stage = pan_shader_stage(&shader->info); #if PAN_ARCH >= 15 cfg.register_count = shader->info.work_reg_count; + cfg.preload.r0_r15 = shader->info.preload; #else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); + cfg.preload.r48_r63 = (shader->info.preload >> 48); #endif cfg.binary = panvk_shader_variant_get_dev_addr(shader) + shader->info.vs.no_psiz_offset; - cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); } #else From e62e3c201065c501daf74d70e06fe1e7893522cd Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Thu, 5 Mar 2026 17:28:04 +0100 Subject: [PATCH 41/49] pan/va/ISA: Add v15 opcodes --- src/panfrost/compiler/bifrost/valhall/ISA.xml | 1292 +++++++++++++++++ 1 file changed, 1292 insertions(+) diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 9c412ebf469..6fc6e0d12de 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -787,6 +787,13 @@ + + + + + + + Do nothing. 
Useful at the start of a block for waiting on slots required by the first actual instruction of the block, to reconcile dependencies @@ -798,6 +805,12 @@ + + + + + + Branches to a specified relative offset if its source is nonzero (default) or if its source is zero (if `.eq` is set). The offset is 27-bits and @@ -823,6 +836,12 @@ + + + + + + Evaluates the given condition, and if it passes, discards the current fragment and terminates the thread. Only valid in a **fragment** shader. @@ -836,6 +855,12 @@ + + + + + + Jump to an indirectly specified (absolute or relative) address. Used to jump to blend shaders at the end of a fragment shader. @@ -851,6 +876,13 @@ + + + + + + + General-purpose barrier. Must use slot #7. Must be paired with a `.wait` flow on the instruction. @@ -863,11 +895,21 @@ + + + + + + + + + + Evaluates the given condition and outputs either the true source or the @@ -885,21 +927,41 @@ + + + + + + + + + + + + + + + + + + + + Evaluates the given condition and outputs either the true source or the @@ -921,6 +983,13 @@ + + + + + + + @@ -936,6 +1005,13 @@ + + + + + + + Fetches a given flat varying from hardware buffer @@ -949,6 +1025,13 @@ + + + + + + + Fetches a given flat varying from hardware buffer @@ -964,11 +1047,27 @@ + + + + + + + + + + + + + + + + @@ -988,11 +1087,25 @@ + + + + + + + + + + + + + + @@ -1010,6 +1123,12 @@ + + + + + + Interpolates a given varying from a software buffer @@ -1026,6 +1145,13 @@ + + + + + + + Interpolates a given varying from a software buffer @@ -1043,6 +1169,13 @@ + + + + + + + Fetches a given varying from a software buffer @@ -1056,6 +1189,13 @@ + + + + + + + Fetches a given varying from a software buffer @@ -1071,6 +1211,12 @@ + + + + + + Load `vecsize` components from the attribute descriptor at entry `index` of resource table `table` at index (vertex ID, instance ID), converting @@ -1092,6 +1238,13 @@ + + + + + + + Load `vecsize` components from the attribute descriptor at the specified location at index 
(vertex ID, instance ID), converting @@ -1113,6 +1266,13 @@ + + + + + + + Load the 64-bit global clock, either a cycle counter or the system clock. @@ -1124,6 +1284,12 @@ + + + + + + Load `vecsize` components from the texture descriptor at entry `index` of resource table `table`, converting @@ -1145,6 +1311,13 @@ + + + + + + + Load `vecsize` components from the texture descriptor at the specified location at index, converting @@ -1165,6 +1338,12 @@ + + + + + + Load the effective address of an attribute specified with the given immediate index. Returns three staging register: the low/high @@ -1184,6 +1363,13 @@ + + + + + + + Load the effective address of an attribute specified with the given index. Returns three staging register: the low/high @@ -1203,6 +1389,12 @@ + + + + + + Load the effective address of a texel from the image specified with the given immediate index. Returns three staging registers: the low/high @@ -1227,6 +1419,13 @@ + + + + + + + Load the effective address of a texel from the image specified with the given index. Returns three staging register: the low/high @@ -1251,6 +1450,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1272,6 +1478,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1293,6 +1506,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1314,6 +1534,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1335,6 +1562,13 @@ + + + + + + + Loads a buffer descriptor. 
If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1356,6 +1590,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1377,6 +1618,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1398,6 +1646,13 @@ + + + + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1419,6 +1674,11 @@ + + + + + Load effective address of a buffer with an offset added. @@ -1433,6 +1693,12 @@ + + + + + + Load effective address of a buffer with an immediate offset added. @@ -1449,6 +1715,15 @@ + + + + + + + + + Loads from main memory @@ -1465,6 +1740,15 @@ + + + + + + + + + Loads from main memory @@ -1481,6 +1765,15 @@ + + + + + + + + + Loads from main memory @@ -1497,6 +1790,15 @@ + + + + + + + + + Loads from main memory @@ -1513,6 +1815,15 @@ + + + + + + + + + Loads from main memory @@ -1529,6 +1840,15 @@ + + + + + + + + + Loads from main memory @@ -1545,6 +1865,15 @@ + + + + + + + + + Loads from main memory @@ -1561,6 +1890,15 @@ + + + + + + + + + Loads from main memory @@ -1580,48 +1918,120 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1634,6 +2044,11 @@ + + + + + Load effective address of a simple buffer with an offset added. @@ -1648,6 +2063,13 @@ + + + + + + + Load from memory with data conversion. The address to load from is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1668,6 +2090,13 @@ + + + + + + + Store to memory with data conversion. 
The address to store to is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1690,6 +2119,12 @@ + + + + + + Loads a given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1710,6 +2145,13 @@ + + + + + + + Store to given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1729,6 +2171,12 @@ + + + + + + Blends a given render target. This loads the API-specified blend state for the render target from the first source. Blend descriptors are available @@ -1768,6 +2216,13 @@ + + + + + + + Does alpha-to-coverage testing, updating the sample coverage mask. ATEST does not do an implicit discard. It should be executed before the first @@ -1784,6 +2239,13 @@ + + + + + + + Programatically writes out depth, stencil, or both, depending on which modifiers are set. Used to implement gl_FragDepth and gl_FragStencil. @@ -1818,6 +2280,11 @@ + + + + + @@ -1833,6 +2300,11 @@ + + + + + @@ -1849,6 +2321,11 @@ + + + + + @@ -1863,6 +2340,11 @@ + + + + + @@ -1883,12 +2365,22 @@ + + + + + + + + + + Value to convert @@ -1939,6 +2431,11 @@ + + + + + Converts up with the specified round mode. Value to convert @@ -1954,6 +2451,11 @@ + + + + + @@ -1969,6 +2471,11 @@ + + + + + @@ -1992,6 +2499,11 @@ + + + + + @@ -2006,6 +2518,11 @@ + + + + + @@ -2029,6 +2546,11 @@ + + + + + @@ -2048,6 +2570,11 @@ + + + + + Canonical register-to-register move. @@ -2057,6 +2584,11 @@ + + + + + Used as a primitive for various bitwise operations. @@ -2068,6 +2600,11 @@ + + + + + Used as a primitive for various bitwise operations. @@ -2079,6 +2616,11 @@ + + + + + Used as a primitive for various bitwise operations. 
@@ -2090,6 +2632,11 @@ + + + + + 64-bit abs may be constructed in 4 instructions (5 clocks) by checking the sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with @@ -2103,6 +2650,11 @@ + + + + + @@ -2120,6 +2672,11 @@ + + + + + Only available as 32-bit. Smaller bitsizes require explicit conversions. 64-bit popcount may be constructed in 3 clocks by separate 32-bit @@ -2134,6 +2691,11 @@ + + + + + Only available as 32-bit. Other bitsizes may be derived with swizzles. @@ -2166,6 +2728,11 @@ + + + + + Returns the mask of lanes ever active within the warp (subgroup), such that the source is nonzero. The number of work-items in a subgroup is @@ -2187,12 +2754,22 @@ + + + + + + + + + + Flush special float values. The ftz modifier flushes subnormal values to @@ -2212,6 +2789,11 @@ + + + + + @@ -2225,6 +2807,11 @@ + + + + + @@ -2251,60 +2838,110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Performs a given special function. The floating-point reciprocal (`FRCP`) @@ -2323,24 +2960,44 @@ + + + + + + + + + + + + + + + + + + + + Performs a given special function. The trigonometric tables @@ -2356,12 +3013,22 @@ + + + + + + + + + + $A + B$ @@ -2377,12 +3044,22 @@ + + + + + + + + + + $\min \{ A, B \}$ @@ -2396,12 +3073,22 @@ + + + + + + + + + + $\max \{ A, B \}$ @@ -2433,12 +3120,23 @@ + + + + + + + + + + + Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate @@ -2457,6 +3155,11 @@ + + + + + Calculates the base-2 exponent of an argument specified as a 8:24 fixed-point. The original argument is passed as well for correct handling @@ -2472,6 +3175,11 @@ + + + + + Performs a floating-point addition specialized for logarithm computation. @@ -2485,6 +3193,12 @@ + + + + + + Used for `atan2()` implementation. 
Destination is two 16-bit values (int and float) for the first form, and a single 32-bit float when @@ -2507,12 +3221,22 @@ + + + + + + + + + + @@ -2526,12 +3250,22 @@ + + + + + + + + + + @@ -2545,12 +3279,24 @@ + + + + + + + + + + + + A B @@ -2562,6 +3308,11 @@ + + + + + Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)` A B @@ -2573,12 +3324,22 @@ + + + + + + + + + + @@ -2592,12 +3353,22 @@ + + + + + + + + + + @@ -2611,12 +3382,24 @@ + + + + + + + + + + + + $A - B$ with optional saturation A @@ -2637,6 +3420,12 @@ + + + + + + @@ -2655,12 +3444,24 @@ + + + + + + + + + + + + A @@ -2673,42 +3474,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $A \cdot B$ with optional saturation. Note the multipliers can only handle up to @@ -2775,6 +3612,11 @@ + + + + + Selects the value of A in the subgroup lane given by B. This implements subgroup broadcasts. It may be used as a primitive for screen space @@ -2792,11 +3634,21 @@ + + + + + + + + + + $A \cdot B + C$ @@ -2812,24 +3664,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + Left shifts its first source by a specified amount and bitwise ANDs it with the @@ -2847,24 +3723,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + Right shifts its first source by a specified amount and bitwise ANDs it with the @@ -2885,24 +3785,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + Left shifts its first source by a specified amount and bitwise ORs it with the @@ -2920,24 +3844,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + Right shifts its first source by a specified amount and bitwise ORs it with the @@ -2958,24 +3906,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + Left shifts its first source by a specified amount and bitwise XORs it with the @@ -2993,24 +3965,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + Right shifts its first source by a specified amount and bitwise XORs it with the @@ -3029,6 +4025,12 @@ + + + + + + Mux between A and B based on the provided 
mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -3046,6 +4048,12 @@ + + + + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -3063,6 +4071,12 @@ + + + + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -3081,6 +4095,12 @@ + + + + + + During a cube map transform, select the S coordinate given a selected face. Z coordinate as 32-bit floating point X coordinate as 32-bit floating point @@ -3092,6 +4112,12 @@ + + + + + + During a cube map transform, select the T coordinate given a selected face. Y coordinate as 32-bit floating point Z coordinate as 32-bit floating point @@ -3102,6 +4128,11 @@ + + + + + Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD. @@ -3120,6 +4151,11 @@ + + + + + Select the maximum absolute value of its arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point @@ -3130,6 +4166,11 @@ + + + + + Select the cube face index corresponding to the arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point @@ -3153,12 +4194,24 @@ + + + + + + + + + + + + A B @@ -3179,12 +4232,22 @@ + + + + + + + + + + @@ -3212,12 +4275,22 @@ + + + + + + + + + + @@ -3246,12 +4319,22 @@ + + + + + + + + + + @@ -3272,12 +4355,22 @@ + + + + + + + + + + @@ -3298,12 +4391,22 @@ + + + + + + + + + + @@ -3331,12 +4434,22 @@ + + + + + + + + + + @@ -3371,12 +4484,22 @@ + + + + + + + + + + @@ -3389,6 +4512,10 @@ + + + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `IADD.i32` with a @@ -3405,6 +4532,10 @@ + + + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. 
If no modifiers are required, this is preferred to @@ -3436,6 +4567,10 @@ + + + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `FADD.f32` with a @@ -3450,6 +4585,10 @@ + + + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -3466,6 +4605,13 @@ + + + + + + + @@ -3481,6 +4627,13 @@ + + + + + + + @@ -3496,6 +4649,13 @@ + + + + + + + @@ -3510,6 +4670,13 @@ + + + + + + + @@ -3524,6 +4691,13 @@ + + + + + + + @@ -3544,6 +4718,13 @@ + + + + + + + @@ -3563,6 +4744,12 @@ + + + + + + Unfiltered textured instruction. @@ -3589,6 +4776,11 @@ + + + + + Ordinary texturing instruction using a sampler. @@ -3617,6 +4809,11 @@ + + + + + Texture gather instruction. @@ -3646,6 +4843,12 @@ + + + + + + Texture sample with explicit gradient. @@ -3672,6 +4875,11 @@ + + + + + Pair of texture instructions. @@ -3697,6 +4905,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3721,6 +4937,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3746,6 +4970,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3771,6 +5003,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -3795,6 +5035,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3819,6 +5067,14 @@ + + + + + + + + Only works for FP32 varyings. 
Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3844,6 +5100,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3869,6 +5133,14 @@ + + + + + + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -3893,6 +5165,11 @@ + + + + + First calculates $A \cdot B + C$ and then biases the exponent by D. Used in special transcendental function sequences. It should not be used for @@ -3911,6 +5188,11 @@ + + + + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an @@ -3930,6 +5212,11 @@ + + + + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply is treated as $A$ even if an @@ -3949,6 +5236,11 @@ + + + + + First calculates $A \cdot B + C$ and then biases the exponent by D, interpreted as a 16-bit value. Used in special transcendental function From 4d341937b19c86e948c7fc621ec165f969b69824 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 11 Mar 2026 16:35:17 +0100 Subject: [PATCH 42/49] pan/va: Build v15 compiler tables These use the updated opcodes used by v15. 
--- .../compiler/bifrost/valhall/disasm.py | 26 ++++++- .../compiler/bifrost/valhall/disassemble.h | 1 + .../compiler/bifrost/valhall/valhall.c.py | 68 ++++++++++++++++++- .../compiler/bifrost/valhall/valhall.py | 7 +- 4 files changed, 96 insertions(+), 6 deletions(-) diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py index 20441318cc0..6ec53f3fafd 100644 --- a/src/panfrost/compiler/bifrost/valhall/disasm.py +++ b/src/panfrost/compiler/bifrost/valhall/disasm.py @@ -218,6 +218,14 @@ va_disasm_instr(FILE *fp, uint64_t instr) ${recurse_subcodes(OPCODES)} } +void +va_disasm_instr_v15(FILE *fp, uint64_t instr) +{ + unsigned opcode; + +${recurse_subcodes(OPCODES_V15)} +} + static bool is_branch(uint64_t instr) { <% (exact, mask) = OPCODES.get_exact_mask("BRANCHZ") %> @@ -229,6 +237,17 @@ static bool is_branch(uint64_t instr) return false; } +static bool is_branch_v15(uint64_t instr) +{ +<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZ") %> + if ((instr & ${hex(mask)}) == ${hex(exact)}) + return true; +<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZI") %> + if ((instr & ${hex(mask)}) == ${hex(exact)}) + return true; + return false; +} + void disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) { @@ -276,6 +295,9 @@ class OpBucket: self.children = {} def insert(self, subcodes, ins): + # Need an early return in case of removed instructions + if subcodes is None: + return if len(subcodes) == 0: self.instr = ins else: @@ -305,10 +327,12 @@ class OpBucket: # Build opcode hierarchy: OPCODES = OpBucket() +OPCODES_V15 = OpBucket() for ins in instructions: OPCODES.insert(ins.opcode, ins) + OPCODES_V15.insert(ins.opcode_v15, ins) try: - print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name)) + print(Template(template).render(OPCODES = OPCODES, OPCODES_V15 = OPCODES_V15, IMMEDIATES = immediates, ENUMS = enums, 
typesize = typesize, safe_name = safe_name)) except: print(exceptions.text_error_template().render()) diff --git a/src/panfrost/compiler/bifrost/valhall/disassemble.h b/src/panfrost/compiler/bifrost/valhall/disassemble.h index e9057fa860b..a7f73db52b8 100644 --- a/src/panfrost/compiler/bifrost/valhall/disassemble.h +++ b/src/panfrost/compiler/bifrost/valhall/disassemble.h @@ -15,6 +15,7 @@ #include void va_disasm_instr(FILE *fp, uint64_t instr); +void va_disasm_instr_v15(FILE *fp, uint64_t instr); void disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose); #endif diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index 81e9a2ba523..ae7a1b5a001 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -100,7 +100,64 @@ valhall_opcodes[BI_NUM_OPCODES] = { sr_control = op.staging[0].encoded_flags >> 6 %> [BI_OPCODE_${name.replace('.', '_').upper()}] = { - .exact = ${hex(exact(op))}ULL, + .exact = ${hex(exact(op.opcode))}ULL, + .srcs = { +% for src in ([sr for sr in op.staging if sr.read] + op.srcs): + { + .absneg = ${ibool(src.absneg)}, + .swizzle = ${ibool(src.swizzle)}, + .notted = ${ibool(src.notted)}, + .widen = ${ibool(src.widen)}, + .lanes = ${ibool(src.lanes)}, + .halfswizzle = ${ibool(src.halfswizzle)}, + .lane = ${ibool(src.lane)}, + .combine = ${ibool(src.combine)}, +% if src.size in [8, 16, 32, 64]: + .size = VA_SIZE_${src.size}, +% endif + }, +% endfor + }, + .type_size = ${typesize(op.name)}, + .has_dest = ${ibool(len(op.dests) > 0)}, + .is_signed = ${ibool(op.is_signed)}, + .unit = VA_UNIT_${op.unit}, + .nr_srcs = ${len(op.srcs)}, + .nr_staging_srcs = ${sum([sr.read for sr in op.staging])}, + .nr_staging_dests = ${sum([sr.write for sr in op.staging])}, + .clamp = ${hasmod(x, 'clamp')}, + .saturate = ${hasmod(x, 'saturate')}, + .rhadd = ${hasmod(x, 'rhadd')}, + .round_mode = ${hasmod(x, 'round_mode')}, + 
.condition = ${hasmod(x, 'condition')}, + .result_type = ${hasmod(x, 'result_type')}, + .vecsize = ${hasmod(x, 'vector_size')}, + .register_format = ${hasmod(x, 'register_format')}, + .slot = ${hasmod(x, 'slot')}, + .sr_count = ${hasmod(x, 'staging_register_count')}, + .sr_write_count = ${hasmod(x, 'staging_register_write_count')}, + .sr_control = ${sr_control}, + }, +% endif +% endfor +}; + +const struct va_opcode_info +valhall_v15_opcodes[BI_NUM_OPCODES] = { +% for op in instructions: +% if op.name not in skip: +<% + name = op.name + if name == 'BRANCHZ': + name = 'BRANCHZ.i16' + + sr_control = 0 + + if len(op.staging) > 0: + sr_control = op.staging[0].encoded_flags >> 6 +%> + [BI_OPCODE_${name.replace('.', '_').upper()}] = { + .exact = ${hex(exact(op.opcode_v15))}ULL, .srcs = { % for src in ([sr for sr in op.staging if sr.read] + op.srcs): { @@ -144,9 +201,14 @@ valhall_opcodes[BI_NUM_OPCODES] = { """ # Exact value to be ORed in to every opcode -def exact_op(op): +def exact_op(opcode): exact_op = 0 - for subcode in op.opcode: + + # Need an early return in case of removed instructions + if not opcode: + return exact_op + + for subcode in opcode: exact_op |= (subcode.value << subcode.start) return exact_op diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py index 7cae9521b87..c6cbf31ed86 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.py @@ -186,11 +186,12 @@ class Opcode: self.mask = mask class Instruction: - def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): + def __init__(self, name, opcode, opcode_v15, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): self.name = name self.srcs = srcs self.dests = dests self.opcode = opcode + self.opcode_v15 = opcode_v15 self.immediates = immediates self.modifiers = modifiers self.staging = staging 
@@ -273,6 +274,7 @@ def build_instr(el, overrides = {}): # Get overridables name = overrides.get('name') or el.attrib.get('name') opcode = overrides.get('opcode') or build_opcode(el, 'opcode') + opcode_v15 = overrides.get('opcode_v15') or build_opcode(el, 'opcode_v15') unit = overrides.get('unit') or el.attrib.get('unit') # Get explicit sources/dests @@ -312,7 +314,7 @@ def build_instr(el, overrides = {}): elif mod.tag =='va_mod': modifiers.append(build_modifier(mod)) - instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) + instr = Instruction(name, opcode, opcode_v15, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) instructions.append(instr) @@ -323,6 +325,7 @@ def build_group(el): build_instr(el, overrides = { 'name': ins.attrib['name'], 'opcode': build_opcode(ins, 'opcode'), + 'opcode_v15': build_opcode(ins, 'opcode_v15'), 'unit': ins.attrib.get('unit'), }) From 64504422abde79e63838e48fac039322dabb8f01 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 13 Apr 2026 17:00:43 +0200 Subject: [PATCH 43/49] pan/va: Default valhall compiler tests to arch v10 Some compiler tests were not specifying arch, which will not work once v15 support lands. Therefore, default these to v10. 
--- .../compiler/bifrost/valhall/test/test-lower-constants.cpp | 1 + src/panfrost/compiler/bifrost/valhall/test/test-mark-last.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-lower-constants.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-lower-constants.cpp index 7ec4f330a3d..f363ab3b1b2 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-lower-constants.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-lower-constants.cpp @@ -12,6 +12,7 @@ static inline void add_imm(bi_context *ctx) { + ctx->arch = 10; struct hash_table_u64 *stats = _mesa_hash_table_u64_create(ctx); bi_foreach_instr_global(ctx, I) { va_lower_constants(ctx, I, stats, UINT32_MAX); diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-mark-last.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-mark-last.cpp index f091255bdd5..5035feda5b6 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-mark-last.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-mark-last.cpp @@ -26,7 +26,9 @@ strip_discard(bi_context *ctx) do { \ void *mem_ctx = ralloc_context(NULL); \ bi_builder *A = bit_builder(mem_ctx); \ + A->shader->arch = 10; \ bi_builder *B = bit_builder(mem_ctx); \ + B->shader->arch = 10; \ { \ UNUSED bi_builder *b = A; \ test; \ From 1ba1f76146adbc6ceea7abc3a4147a3df383fc80 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 11 Mar 2026 16:35:59 +0100 Subject: [PATCH 44/49] pan/va: Pick compiler table based on arch Make v15 use the new tables added in a previous commit. 
--- .../compiler/bifrost/bi_lower_swizzle.c | 10 +++---- src/panfrost/compiler/bifrost/bi_ra.c | 4 +-- .../compiler/bifrost/bifrost_compile.c | 27 ++++++++++--------- src/panfrost/compiler/bifrost/cmdline.c | 3 ++- .../compiler/bifrost/valhall/disasm.py | 13 ++++++--- .../compiler/bifrost/valhall/disassemble.h | 3 ++- .../compiler/bifrost/valhall/va_compiler.h | 2 +- .../bifrost/valhall/va_lower_constants.c | 16 ++++++----- .../bifrost/valhall/va_lower_split_64bit.c | 2 +- .../compiler/bifrost/valhall/va_mark_last.c | 2 +- .../compiler/bifrost/valhall/va_optimize.c | 2 +- .../compiler/bifrost/valhall/va_pack.c | 7 ++--- .../compiler/bifrost/valhall/va_perf.c | 4 +-- .../compiler/bifrost/valhall/valhall.c.py | 10 +++++++ .../compiler/bifrost/valhall/valhall.h | 7 ++--- src/panfrost/compiler/pan_compiler.c | 3 ++- 16 files changed, 71 insertions(+), 44 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bi_lower_swizzle.c b/src/panfrost/compiler/bifrost/bi_lower_swizzle.c index d7e92803ba7..fc16cbe19d8 100644 --- a/src/panfrost/compiler/bifrost/bi_lower_swizzle.c +++ b/src/panfrost/compiler/bifrost/bi_lower_swizzle.c @@ -16,14 +16,14 @@ */ static uint32_t -va_op_swizzles(enum bi_opcode op, unsigned src) +va_op_swizzles(enum bi_opcode op, unsigned src, unsigned arch) { /* This is a bifrost-only instruction that is lowered on valhall */ - if (!valhall_opcodes[op].exact) + if (!get_valhall_opcode(op, arch).exact) return bi_op_swizzles[op][src]; uint32_t swizzles = 0; - struct va_src_info info = va_src_info(op, src); + struct va_src_info info = va_src_info(op, src, arch); if (info.swizzle) { assert(info.size == VA_SIZE_16 || info.size == VA_SIZE_32); @@ -99,8 +99,8 @@ bool bi_op_supports_swizzle(enum bi_opcode op, unsigned src, enum bi_swizzle swizzle, unsigned arch) { - uint32_t supported_swizzles = arch >= 9 ? - va_op_swizzles(op, src) : bi_op_swizzles[op][src]; + uint32_t supported_swizzles = + arch >= 9 ? 
va_op_swizzles(op, src, arch) : bi_op_swizzles[op][src]; return supported_swizzles & BITFIELD_BIT(swizzle); } diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c index 7f058bf0d3f..6bd38fd2ef3 100644 --- a/src/panfrost/compiler/bifrost/bi_ra.c +++ b/src/panfrost/compiler/bifrost/bi_ra.c @@ -382,8 +382,8 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, bi_foreach_ssa_src(ins, s) { if (bi_count_read_registers(ins, s) >= 2) l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; - else if (s < valhall_opcodes[ins->op].nr_srcs && - va_src_info(ins->op, s).size > VA_SIZE_32) + else if (s < get_valhall_opcode(ins->op, arch).nr_srcs && + va_src_info(ins->op, s, arch).size > VA_SIZE_32) l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; } } diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 8f459a1d8e6..c0b551425e2 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -4165,13 +4165,13 @@ va_count_stats(bi_context *ctx, unsigned nr_ins, unsigned size, } static unsigned -va_gather_stats_block(bi_block *block, struct va_stats *counts) +va_gather_stats_block(bi_block *block, unsigned arch, struct va_stats *counts) { unsigned nr_ins = 0; bi_foreach_instr_in_block(block, I) { nr_ins++; - va_count_instr_stats(I, counts); + va_count_instr_stats(I, arch, counts); } return nr_ins; } @@ -4180,7 +4180,8 @@ va_gather_stats_block(bi_block *block, struct va_stats *counts) * Gather stats for a minimum length path through the shader. 
*/ static unsigned -va_gather_min_path_stats(bi_block *block, struct va_stats *counts) +va_gather_min_path_stats(bi_block *block, unsigned arch, + struct va_stats *counts) { struct va_stats min_counts; struct va_stats save_counts = *counts; @@ -4192,7 +4193,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts) if (bi_block_dominates(next, block)) { continue; } - nr_ins = va_gather_min_path_stats(next, counts); + nr_ins = va_gather_min_path_stats(next, arch, counts); if (min_ins == 0 || nr_ins < min_ins) { min_ins = nr_ins; min_counts = *counts; @@ -4202,7 +4203,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts) if (min_ins != 0) { *counts = min_counts; } - nr_ins = min_ins + va_gather_stats_block(block, counts); + nr_ins = min_ins + va_gather_stats_block(block, arch, counts); return nr_ins; } @@ -4213,7 +4214,8 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts) * bail out. */ static unsigned -va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *visited) +va_gather_max_path_stats(bi_block *block, unsigned arch, + struct va_stats *counts, BITSET_WORD *visited) { struct va_stats max_counts; struct va_stats save_counts = *counts; @@ -4226,7 +4228,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD * if (BITSET_TEST(visited, next->index)) { continue; } - nr_ins = va_gather_max_path_stats(next, counts, visited); + nr_ins = va_gather_max_path_stats(next, arch, counts, visited); if (nr_ins > max_ins) { max_ins = nr_ins; max_counts = *counts; @@ -4236,7 +4238,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD * if (max_ins != 0) { *counts = max_counts; } - nr_ins = max_ins + va_gather_stats_block(block, counts); + nr_ins = max_ins + va_gather_stats_block(block, arch, counts); return nr_ins; } @@ -4260,15 +4262,16 @@ va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out, case GATHER_STATS_FULL: 
bi_foreach_instr_global(ctx, I) { nr_ins++; - va_count_instr_stats(I, &counts); + va_count_instr_stats(I, ctx->arch, &counts); } break; case GATHER_STATS_MIN: - nr_ins = va_gather_min_path_stats(first_block, &counts); + nr_ins = va_gather_min_path_stats(first_block, ctx->arch, &counts); break; case GATHER_STATS_MAX: visited = BITSET_RZALLOC(NULL, ctx->num_blocks); - nr_ins = va_gather_max_path_stats(first_block, &counts, visited); + nr_ins = + va_gather_max_path_stats(first_block, ctx->arch, &counts, visited); ralloc_free(visited); break; } @@ -4630,7 +4633,7 @@ bi_compile_variant_nir(nir_shader *nir, bifrost_debug & BIFROST_DBG_VERBOSE); } else { disassemble_valhall(stderr, binary->data + offset, - binary->size - offset, + binary->size - offset, ctx->arch, bifrost_debug & BIFROST_DBG_VERBOSE); } diff --git a/src/panfrost/compiler/bifrost/cmdline.c b/src/panfrost/compiler/bifrost/cmdline.c index b94b7efbefe..1b3c7ceccc6 100644 --- a/src/panfrost/compiler/bifrost/cmdline.c +++ b/src/panfrost/compiler/bifrost/cmdline.c @@ -48,7 +48,8 @@ disassemble(const char *filename) } if (pan_arch(gpu_id) >= 9) - disassemble_valhall(stdout, entrypoint, filesize, verbose); + disassemble_valhall(stdout, entrypoint, filesize, pan_arch(gpu_id), + verbose); else disassemble_bifrost(stdout, entrypoint, filesize, verbose); diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py index 6ec53f3fafd..d744d6bf45e 100644 --- a/src/panfrost/compiler/bifrost/valhall/disasm.py +++ b/src/panfrost/compiler/bifrost/valhall/disasm.py @@ -249,7 +249,7 @@ static bool is_branch_v15(uint64_t instr) } void -disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) +disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch, bool verbose) { assert((size & 7) == 0); @@ -275,11 +275,18 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) fprintf(fp, " "); } - va_disasm_instr(fp, instr); + bool 
instr_is_branch; + if (arch >= 15) { + va_disasm_instr_v15(fp, instr); + instr_is_branch = is_branch_v15(instr); + } else { + va_disasm_instr(fp, instr); + instr_is_branch = is_branch(instr); + } fprintf(fp, "\\n"); /* Separate blocks visually by inserting whitespace after branches */ - if (is_branch(instr)) + if (instr_is_branch) fprintf(fp, "\\n"); } diff --git a/src/panfrost/compiler/bifrost/valhall/disassemble.h b/src/panfrost/compiler/bifrost/valhall/disassemble.h index a7f73db52b8..05908b4d643 100644 --- a/src/panfrost/compiler/bifrost/valhall/disassemble.h +++ b/src/panfrost/compiler/bifrost/valhall/disassemble.h @@ -16,6 +16,7 @@ void va_disasm_instr(FILE *fp, uint64_t instr); void va_disasm_instr_v15(FILE *fp, uint64_t instr); -void disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose); +void disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch, + bool verbose); #endif diff --git a/src/panfrost/compiler/bifrost/valhall/va_compiler.h b/src/panfrost/compiler/bifrost/valhall/va_compiler.h index 1d8a38a1f37..622ab81b302 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_compiler.h +++ b/src/panfrost/compiler/bifrost/valhall/va_compiler.h @@ -77,7 +77,7 @@ struct va_stats { unsigned nr_fau_uniforms; }; -void va_count_instr_stats(bi_instr *I, struct va_stats *stats); +void va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats); #ifdef __cplusplus } /* extern C */ diff --git a/src/panfrost/compiler/bifrost/valhall/va_lower_constants.c b/src/panfrost/compiler/bifrost/valhall/va_lower_constants.c index 5646f682b1d..4ab718f6b35 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_lower_constants.c +++ b/src/panfrost/compiler/bifrost/valhall/va_lower_constants.c @@ -211,7 +211,7 @@ va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info, static uint32_t va_resolve_swizzles(bi_context *ctx, bi_instr *I, unsigned s) { - struct va_src_info info = va_src_info(I->op, s); + struct 
va_src_info info = va_src_info(I->op, s, ctx->arch); uint32_t value = I->src[s].value; enum bi_swizzle swz = I->src[s].swizzle; @@ -257,9 +257,10 @@ va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts, /* abs(#c) is pointless, but -#c occurs in transcendental sequences */ assert(!I->src[s].abs && "redundant .abs modifier"); - bool is_signed = valhall_opcodes[I->op].is_signed; - bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs); - struct va_src_info info = va_src_info(I->op, s); + bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed; + bool staging = + (s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs); + struct va_src_info info = va_src_info(I->op, s, ctx->arch); const uint32_t value = va_resolve_swizzles(ctx, I, s); const uint32_t count = (uintptr_t)_mesa_hash_table_u64_search(counts, value); @@ -294,12 +295,13 @@ va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts) if (I->src[s].type != BI_INDEX_CONSTANT) continue; - const bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs); + const bool staging = + (s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs); if (staging) continue; - bool is_signed = valhall_opcodes[I->op].is_signed; - struct va_src_info info = va_src_info(I->op, s); + bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed; + struct va_src_info info = va_src_info(I->op, s, ctx->arch); uint32_t value = va_resolve_swizzles(ctx, I, s); bi_index cons = va_lookup_constant(value, info, is_signed); diff --git a/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c b/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c index 6b81346845c..ac72b35261d 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c +++ b/src/panfrost/compiler/bifrost/valhall/va_lower_split_64bit.c @@ -78,7 +78,7 @@ va_lower_split_64bit(bi_context *ctx) if (bi_is_null(I->src[s]) || s >= 4) continue; - struct va_src_info info = va_src_info(I->op, s); + 
struct va_src_info info = va_src_info(I->op, s, ctx->arch); /* Only split if the instruction expects 64-bit inputs as two separate * sources. */ diff --git a/src/panfrost/compiler/bifrost/valhall/va_mark_last.c b/src/panfrost/compiler/bifrost/valhall/va_mark_last.c index 0bfe93ce228..454fba92ef8 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_mark_last.c +++ b/src/panfrost/compiler/bifrost/valhall/va_mark_last.c @@ -179,7 +179,7 @@ va_mark_last(bi_context *ctx) break; /* Only need to unmark split registers. */ - if (va_src_info(I->op, s).size == VA_SIZE_64 && + if (va_src_info(I->op, s, ctx->arch).size == VA_SIZE_64 && bi_count_read_registers(I, s) == 1) { bool both_discard = I->src[s].discard && I->src[s + 1].discard; diff --git a/src/panfrost/compiler/bifrost/valhall/va_optimize.c b/src/panfrost/compiler/bifrost/valhall/va_optimize.c index a0609601d4e..41fc2721bbb 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_optimize.c +++ b/src/panfrost/compiler/bifrost/valhall/va_optimize.c @@ -286,7 +286,7 @@ va_fuse_cmp(bi_context *ctx, bi_instr **lut, const BITSET_WORD *multiple, static bool va_propagate_replicate_wide(bi_context *ctx, bi_instr **lut, bi_instr *I) { - struct va_opcode_info info = valhall_opcodes[I->op]; + struct va_opcode_info info = get_valhall_opcode(I->op, ctx->arch); bool progress = false; bi_foreach_ssa_src(I, s) { diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index d7f42168c7a..129512ce170 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -455,7 +455,7 @@ va_pack_rhadd(const bi_instr *I) static uint64_t va_pack_alu(const bi_instr *I, unsigned arch) { - struct va_opcode_info info = valhall_opcodes[I->op]; + struct va_opcode_info info = get_valhall_opcode(I->op, arch); uint64_t hex = 0; switch (I->op) { @@ -750,7 +750,8 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor) VA_LOAD_LANE_96_BIT_IDENTITY, 
VA_LOAD_LANE_128_BIT_IDENTITY, }; - unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7; + /* TODO hack */ + unsigned memory_size = (get_valhall_opcode(I->op, 10).exact >> 27) & 0x7; uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36; // unsigned @@ -826,7 +827,7 @@ va_pack_register_format(const bi_instr *I) uint64_t va_pack_instr(const bi_instr *I, unsigned arch) { - struct va_opcode_info info = valhall_opcodes[I->op]; + struct va_opcode_info info = get_valhall_opcode(I->op, arch); uint64_t hex = info.exact | (((uint64_t)I->flow) << 59); hex |= ((uint64_t)va_select_fau_page(I)) << 57; diff --git a/src/panfrost/compiler/bifrost/valhall/va_perf.c b/src/panfrost/compiler/bifrost/valhall/va_perf.c index 5067a2fc58e..5272a5bd084 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_perf.c +++ b/src/panfrost/compiler/bifrost/valhall/va_perf.c @@ -9,7 +9,7 @@ #include "valhall.h" void -va_count_instr_stats(bi_instr *I, struct va_stats *stats) +va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats) { /* Adjusted for 64-bit arithmetic */ unsigned words = bi_count_write_registers(I, 0); @@ -35,7 +35,7 @@ va_count_instr_stats(bi_instr *I, struct va_stats *stats) } } } - switch (valhall_opcodes[I->op].unit) { + switch (get_valhall_opcode(I->op, arch).unit) { /* Arithmetic is 2x slower for 64-bit than 32-bit */ case VA_UNIT_FMA: stats->fma += words; diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index ae7a1b5a001..b8808bd30e4 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -198,6 +198,16 @@ valhall_v15_opcodes[BI_NUM_OPCODES] = { % endif % endfor }; + +const struct va_opcode_info +get_valhall_opcode(enum bi_opcode op, unsigned arch) +{ + assert(arch >= 9); + if (arch < 15) + return valhall_opcodes[op]; + else + return valhall_v15_opcodes[op]; +} """ # Exact value to be ORed in to every 
opcode diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.h b/src/panfrost/compiler/bifrost/valhall/valhall.h index ae716c36ffd..763628cd1fb 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.h +++ b/src/panfrost/compiler/bifrost/valhall/valhall.h @@ -89,7 +89,8 @@ struct va_opcode_info { unsigned sr_control : 2; }; -extern const struct va_opcode_info valhall_opcodes[BI_NUM_OPCODES]; +const struct va_opcode_info get_valhall_opcode(enum bi_opcode op, + unsigned arch); /* Bifrost specifies the source of bitwise operations as (A, B, shift), but * Valhall specifies (A, shift, B). We follow Bifrost conventions in the @@ -130,10 +131,10 @@ va_swap_12(enum bi_opcode op) } static inline struct va_src_info -va_src_info(enum bi_opcode op, unsigned src) +va_src_info(enum bi_opcode op, unsigned src, unsigned arch) { unsigned idx = (va_swap_12(op) && (src == 1 || src == 2)) ? (3 - src) : src; - return valhall_opcodes[op].srcs[idx]; + return get_valhall_opcode(op, arch).srcs[idx]; } static inline bool diff --git a/src/panfrost/compiler/pan_compiler.c b/src/panfrost/compiler/pan_compiler.c index 3fd702227c8..9c27a36e6ee 100644 --- a/src/panfrost/compiler/pan_compiler.c +++ b/src/panfrost/compiler/pan_compiler.c @@ -288,7 +288,8 @@ pan_disassemble(FILE *fp, const void *code, size_t size, uint64_t gpu_id, bool verbose) { if (pan_arch(gpu_id) >= 9) - disassemble_valhall(fp, (const uint64_t *)code, size, verbose); + disassemble_valhall(fp, (const uint64_t *)code, size, pan_arch(gpu_id), + verbose); else if (pan_arch(gpu_id) >= 6) disassemble_bifrost(fp, code, size, verbose); else From e1739f271ade4eada87e5ccb384f637b97ffa6f7 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Fri, 17 Apr 2026 15:30:48 +0200 Subject: [PATCH 45/49] pan/va: Make packing tests explicitly use arch v10 Once we implement packing support for v15, the encoding will drastically change. Therefore, make the packing tests explicitly use v10 to allow for adding v15 support later. 
--- .../bifrost/valhall/test/test-packing.cpp | 375 +++++++++--------- 1 file changed, 185 insertions(+), 190 deletions(-) diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp index 0ac71cc2f4f..3b92c96087f 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp @@ -22,8 +22,6 @@ } \ } while (0) -#define CASE(instr, expected) CASE_ARCH(instr, 10, expected) - class ValhallPacking : public testing::Test { protected: ValhallPacking() @@ -48,124 +46,128 @@ class ValhallPacking : public testing::Test { TEST_F(ValhallPacking, Moves) { - CASE(bi_mov_i32_to(b, bi_register(1), bi_register(2)), - 0x0091c10000000002ULL); - CASE(bi_mov_i32_to(b, bi_register(1), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false)), - 0x0091c1000000008aULL); + bi_instr *I = bi_mov_i32_to(b, bi_register(1), bi_register(2)); + CASE_ARCH(I, 10, 0x0091c10000000002ULL); + + I = bi_mov_i32_to(b, bi_register(1), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false)); + CASE_ARCH(I, 10, 0x0091c1000000008aULL); } TEST_F(ValhallPacking, Fadd) { - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)), - 0x00a4c00000000201ULL); - CASE( - bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))), - 0x00a4c02000000201ULL); - CASE( - bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))), - 0x00a4c01000000201ULL); + bi_instr *I = + bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)); + CASE_ARCH(I, 10, 0x00a4c00000000201ULL); - CASE(bi_fadd_v2f16_to(b, bi_register(0), - bi_swz_16(bi_register(1), false, false), - bi_swz_16(bi_register(0), true, true)), - 0x00a5c0000c000001ULL); + I = + bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))); + CASE_ARCH(I, 10, 0x00a4c02000000201ULL); - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)), - 
0x00a5c00028000001ULL); + I = + bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))); + CASE_ARCH(I, 10, 0x00a4c01000000201ULL); - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), - bi_swz_16(bi_register(0), true, false)), - 0x00a5c00024000001ULL); + I = bi_fadd_v2f16_to(b, bi_register(0), + bi_swz_16(bi_register(1), false, false), + bi_swz_16(bi_register(0), true, true)); + CASE_ARCH(I, 10, 0x00a5c0000c000001ULL); - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))), - bi_neg(zero)), - 0x00a5c0902800c040ULL); + I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)); + CASE_ARCH(I, 10, 0x00a5c00028000001ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero), - 0x00a4c0000000c001ULL); + I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), + bi_swz_16(bi_register(0), true, false)); + CASE_ARCH(I, 10, 0x00a5c00024000001ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero)), - 0x00a4c0100000c001ULL); + I = bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))), + bi_neg(zero)); + CASE_ARCH(I, 10, 0x00a5c0902800c040ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), - bi_half(bi_register(0), true)), - 0x00a4c00008000001ULL); + I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero); + CASE_ARCH(I, 10, 0x00a4c0000000c001ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), - bi_half(bi_register(0), false)), - 0x00a4c00004000001ULL); + I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero)); + CASE_ARCH(I, 10, 0x00a4c0100000c001ULL); + + I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), + bi_half(bi_register(0), true)); + CASE_ARCH(I, 10, 0x00a4c00008000001ULL); + + I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), + bi_half(bi_register(0), false)); + CASE_ARCH(I, 10, 0x00a4c00004000001ULL); } TEST_F(ValhallPacking, Clper) { - CASE(bi_clper_i32_to(b, bi_register(0), bi_register(0), bi_byte(n4567, 0), - 
BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP16), - 0x00a0c030128fc900); + bi_instr *I = bi_clper_i32_to(b, bi_register(0), bi_register(0), + bi_byte(n4567, 0), BI_INACTIVE_RESULT_F1, + BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP16); + CASE_ARCH(I, 10, 0x00a0c030128fc900); } TEST_F(ValhallPacking, Clamps) { bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_abs(bi_register(2)))); - CASE(I, 0x00a4c03000000201ULL); + CASE_ARCH(I, 10, 0x00a4c03000000201ULL); I->clamp = BI_CLAMP_CLAMP_M1_1; - CASE(I, 0x00a4c03200000201ULL); + CASE_ARCH(I, 10, 0x00a4c03200000201ULL); } TEST_F(ValhallPacking, Misc) { - CASE(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false), - bi_neg(zero)), - 0x00b2c10400c08841ULL); + bi_instr *I = bi_fma_f32_to( + b, bi_register(1), bi_discard(bi_register(1)), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false), bi_neg(zero)); + CASE_ARCH(I, 10, 0x00b2c10400c08841ULL); - CASE(bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))), - BI_ROUND_RTN), - 0x0090c240800d0042ULL); + I = bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))), + BI_ROUND_RTN); + CASE_ARCH(I, 10, 0x0090c240800d0042ULL); - CASE(bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0), - BI_ROUND_RTN), - 0x00904000a00f0000ULL); + I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0), + BI_ROUND_RTN); + CASE_ARCH(I, 10, 0x00904000a00f0000ULL); - CASE( - bi_fround_v2f16_to(b, bi_half(bi_register(0), false), - bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN), - 0x00904000900f0001ULL); + I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false), + bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN); + CASE_ARCH(I, 10, 0x00904000900f0001ULL); } TEST_F(ValhallPacking, FaddImm) { - CASE(bi_fadd_imm_f32_to(b, bi_register(2), bi_discard(bi_register(2)), - 0x4847C6C0), - 0x0114C24847C6C042ULL); + bi_instr *I = 
bi_fadd_imm_f32_to(b, bi_register(2), + bi_discard(bi_register(2)), 0x4847C6C0); + CASE_ARCH(I, 10, 0x0114C24847C6C042ULL); - CASE(bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), - 0x70AC6784), - 0x0115C270AC678442ULL); + I = bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), + 0x70AC6784); + CASE_ARCH(I, 10, 0x0115C270AC678442ULL); } TEST_F(ValhallPacking, Comparions) { - CASE(bi_icmp_or_v2s16_to(b, bi_register(2), - bi_discard(bi_swz_16(bi_register(3), true, false)), - bi_discard(bi_swz_16(bi_register(2), true, false)), - zero, BI_CMPF_GT, BI_RESULT_TYPE_M1), - 0x00f9c21184c04243); + bi_instr *I = bi_icmp_or_v2s16_to( + b, bi_register(2), bi_discard(bi_swz_16(bi_register(3), true, false)), + bi_discard(bi_swz_16(bi_register(2), true, false)), zero, BI_CMPF_GT, + BI_RESULT_TYPE_M1); + CASE_ARCH(I, 10, 0x00f9c21184c04243); - CASE(bi_fcmp_or_v2f16_to(b, bi_register(2), - bi_discard(bi_swz_16(bi_register(3), true, false)), - bi_discard(bi_swz_16(bi_register(2), false, false)), - zero, BI_CMPF_GT, BI_RESULT_TYPE_M1), - 0x00f5c20190c04243); + I = bi_fcmp_or_v2f16_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(3), true, false)), + bi_discard(bi_swz_16(bi_register(2), false, false)), + zero, BI_CMPF_GT, BI_RESULT_TYPE_M1); + CASE_ARCH(I, 10, 0x00f5c20190c04243); } TEST_F(ValhallPacking, Conversions) { - CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))), - 0x0090c22000070042); + bi_instr *I = + bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))); + CASE_ARCH(I, 10, 0x0090c22000070042); } TEST_F(ValhallPacking, BranchzI16) @@ -173,88 +175,86 @@ TEST_F(ValhallPacking, BranchzI16) bi_instr *I = bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ); I->branch_offset = 1; - CASE(I, 0x001fc03000000102); + CASE_ARCH(I, 10, 0x001fc03000000102); } TEST_F(ValhallPacking, BranchzI16Backwards) { bi_instr *I = bi_branchz_i16(b, zero, bi_null(), BI_CMPF_EQ); I->branch_offset = -8; - 
CASE(I, 0x001fc017fffff8c0); + CASE_ARCH(I, 10, 0x001fc017fffff8c0); } TEST_F(ValhallPacking, Blend) { - CASE( + bi_instr *I = bi_blend_to(b, bi_null(), bi_register(0), bi_register(60), bi_fau(BIR_FAU_BLEND_0, false), bi_fau(BIR_FAU_BLEND_0, true), - bi_null(), BI_REGISTER_FORMAT_F16, 2, 0), - 0x007f4004333c00f0); + bi_null(), BI_REGISTER_FORMAT_F16, 2, 0); + CASE_ARCH(I, 10, 0x007f4004333c00f0); } TEST_F(ValhallPacking, Mux) { - CASE(bi_mux_i32_to(b, bi_register(0), bi_discard(bi_register(0)), - bi_discard(bi_register(4)), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), - BI_MUX_BIT), - 0x00b8c00300804440ull); + bi_instr *I = bi_mux_i32_to( + b, bi_register(0), bi_discard(bi_register(0)), bi_discard(bi_register(4)), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), BI_MUX_BIT); + CASE_ARCH(I, 10, 0x00b8c00300804440ull); } TEST_F(ValhallPacking, AtestFP16) { - CASE(bi_atest_to(b, bi_register(60), bi_register(60), - bi_half(bi_register(1), true), - bi_fau(BIR_FAU_ATEST_PARAM, false)), - 0x007dbc0208ea013c); + bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60), + bi_half(bi_register(1), true), + bi_fau(BIR_FAU_ATEST_PARAM, false)); + CASE_ARCH(I, 10, 0x007dbc0208ea013c); } TEST_F(ValhallPacking, AtestFP32) { - CASE(bi_atest_to(b, bi_register(60), bi_register(60), one, - bi_fau(BIR_FAU_ATEST_PARAM, false)), - 0x007dbc0200ead03c); + bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60), one, + bi_fau(BIR_FAU_ATEST_PARAM, false)); + CASE_ARCH(I, 10, 0x007dbc0200ead03c); } TEST_F(ValhallPacking, Transcendentals) { - CASE(bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true), - 0x0099c10001000000); + bi_instr *I = + bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true); + CASE_ARCH(I, 10, 0x0099c10001000000); - CASE(bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false, - true), - 0x0099c00001020040); + I = bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false, + true); + CASE_ARCH(I, 10, 
0x0099c00001020040); - CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)), 0x009cc20000020001); + I = bi_frsq_f32_to(b, bi_register(2), bi_register(1)); + CASE_ARCH(I, 10, 0x009cc20000020001); - CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), - bi_discard(bi_register(2)), bi_neg(zero), - bi_discard(bi_register(0)), BI_SPECIAL_LEFT), - 0x0162c00440c04241); + I = bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), + bi_discard(bi_register(2)), bi_neg(zero), + bi_discard(bi_register(0)), BI_SPECIAL_LEFT); + CASE_ARCH(I, 10, 0x0162c00440c04241); } TEST_F(ValhallPacking, Csel) { - CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)), - bi_discard(bi_register(3)), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), - BI_CMPF_EQ), - 0x0150c10085844342); + bi_instr *I = bi_csel_u32_to( + b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_EQ); + CASE_ARCH(I, 10, 0x0150c10085844342); - CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)), - bi_discard(bi_register(3)), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), - BI_CMPF_LT), - 0x0150c10485844342); + I = bi_csel_u32_to( + b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT); + CASE_ARCH(I, 10, 0x0150c10485844342); - CASE(bi_csel_s32_to(b, bi_register(1), bi_discard(bi_register(2)), - bi_discard(bi_register(3)), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), - BI_CMPF_LT), - 0x0158c10485844342); + I = bi_csel_s32_to( + b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), + bi_fau((enum 
bir_fau)(BIR_FAU_UNIFORM | 2), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT); + CASE_ARCH(I, 10, 0x0158c10485844342); } TEST_F(ValhallPacking, LdAttrImm) @@ -264,64 +264,58 @@ TEST_F(ValhallPacking, LdAttrImm) bi_discard(bi_register(61)), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 1); I->table = 1; - CASE(I, 0x0066800433117d7c); + CASE_ARCH(I, 10, 0x0066800433117d7c); } TEST_F(ValhallPacking, LdVarBufImmF16) { - CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, - BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, - BI_VECSIZE_V4, 0), - 0x005d82143300003d); + bi_instr *I = bi_ld_var_buf_imm_f16_to( + b, bi_register(2), bi_register(61), BI_REGISTER_FORMAT_F16, + BI_SAMPLE_CENTER, BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, BI_VECSIZE_V4, + 0); + CASE_ARCH(I, 10, 0x005d82143300003d); - CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE, - BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, - BI_VECSIZE_V4, 0), - 0x005d80843300003d); + I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE, + BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, + BI_VECSIZE_V4, 0); + CASE_ARCH(I, 10, 0x005d80843300003d); - CASE_ARCH(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F16, - BI_SAMPLE_CENTROID, BI_SOURCE_FORMAT_F16, - BI_UPDATE_STORE, BI_VECSIZE_V4, 8), - 10, 0x005d80443308003d); - - CASE_ARCH(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F16, - BI_SAMPLE_CENTROID, BI_SOURCE_FORMAT_F16, - BI_UPDATE_STORE, BI_VECSIZE_V4, 8), - 11, 0x005d80443300083d); + I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID, + BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, + BI_VECSIZE_V4, 8); + CASE_ARCH(I, 10, 0x005d80443308003d); + CASE_ARCH(I, 11, 0x005d80443300083d); } TEST_F(ValhallPacking, LdVarBufFlatImmFormat) { - 
CASE_ARCH(bi_ld_var_buf_flat_imm_to(b, bi_register(0), - BI_REGISTER_FORMAT_F32, - BI_VECSIZE_V4, 0x12), - 14, 0x0040800832001200); + bi_instr *I = bi_ld_var_buf_flat_imm_to( + b, bi_register(0), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 0x12); + CASE_ARCH(I, 14, 0x0040800832001200); - CASE_ARCH(bi_ld_var_buf_flat_imm_to(b, bi_register(0), - BI_REGISTER_FORMAT_F16, - BI_VECSIZE_V4, 0x12), - 14, 0x0040800433001200); + I = bi_ld_var_buf_flat_imm_to(b, bi_register(0), BI_REGISTER_FORMAT_F16, + BI_VECSIZE_V4, 0x12); + CASE_ARCH(I, 14, 0x0040800433001200); } TEST_F(ValhallPacking, LdVarBufFlat) { - CASE_ARCH(bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4), - 14, 0x005f80083200003d); + bi_instr *I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); + CASE_ARCH(I, 14, 0x005f80083200003d); - CASE_ARCH(bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4), - 14, 0x005f80043300003d); + I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4); + CASE_ARCH(I, 14, 0x005f80043300003d); } TEST_F(ValhallPacking, LeaBufImm) { - CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))), - 0x005e84040000007b); + bi_instr *I = + bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))); + CASE_ARCH(I, 10, 0x005e84040000007b); } TEST_F(ValhallPacking, StoreMemoryAccess) @@ -329,61 +323,62 @@ TEST_F(ValhallPacking, StoreMemoryAccess) bi_instr *I = bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)), bi_discard(bi_register(5)), BI_SEG_NONE, 0); I->mem_access = VA_MEMORY_ACCESS_ESTREAM; - CASE(I, 0x0061400632000044); + CASE_ARCH(I, 10, 0x0061400632000044); } TEST_F(ValhallPacking, Convert16To32) { - CASE(bi_u16_to_u32_to(b, bi_register(2), - bi_discard(bi_half(bi_register(55), false))), - 0x0090c20000140077); + bi_instr *I = bi_u16_to_u32_to(b, bi_register(2), + 
bi_discard(bi_half(bi_register(55), false))); + CASE_ARCH(I, 10, 0x0090c20000140077); - CASE(bi_u16_to_u32_to(b, bi_register(2), - bi_discard(bi_half(bi_register(55), true))), - 0x0090c20010140077); + I = bi_u16_to_u32_to(b, bi_register(2), + bi_discard(bi_half(bi_register(55), true))); + CASE_ARCH(I, 10, 0x0090c20010140077); - CASE(bi_u16_to_f32_to(b, bi_register(2), - bi_discard(bi_half(bi_register(55), false))), - 0x0090c20000150077); + I = bi_u16_to_f32_to(b, bi_register(2), + bi_discard(bi_half(bi_register(55), false))); + CASE_ARCH(I, 10, 0x0090c20000150077); - CASE(bi_u16_to_f32_to(b, bi_register(2), - bi_discard(bi_half(bi_register(55), true))), - 0x0090c20010150077); + I = bi_u16_to_f32_to(b, bi_register(2), + bi_discard(bi_half(bi_register(55), true))); + CASE_ARCH(I, 10, 0x0090c20010150077); - CASE(bi_s16_to_s32_to(b, bi_register(2), - bi_discard(bi_half(bi_register(55), false))), - 0x0090c20000040077); + I = bi_s16_to_s32_to(b, bi_register(2), + bi_discard(bi_half(bi_register(55), false))); + CASE_ARCH(I, 10, 0x0090c20000040077); - CASE(bi_s16_to_s32_to(b, bi_register(2), - bi_discard(bi_half(bi_register(55), true))), - 0x0090c20010040077); + I = bi_s16_to_s32_to(b, bi_register(2), + bi_discard(bi_half(bi_register(55), true))); + CASE_ARCH(I, 10, 0x0090c20010040077); } TEST_F(ValhallPacking, Swizzle8) { - CASE(bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero, - zero, BI_CMPF_NE, BI_RESULT_TYPE_I1), - 0x00f2c14300c0c000); + bi_instr *I = + bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero, + zero, BI_CMPF_NE, BI_RESULT_TYPE_I1); + CASE_ARCH(I, 10, 0x00f2c14300c0c000); } TEST_F(ValhallPacking, FauPage1) { - CASE(bi_mov_i32_to(b, bi_register(1), - bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)), - 0x0291c10000000080ULL); + bi_instr *I = bi_mov_i32_to( + b, bi_register(1), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)); + CASE_ARCH(I, 10, 0x0291c10000000080ULL); } TEST_F(ValhallPacking, LdTileV3F16) { - 
CASE(bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)), - bi_register(60), bi_register(3), BI_REGISTER_FORMAT_F16, - BI_VECSIZE_V3), - 0x0078840423033c40); + bi_instr *I = bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)), + bi_register(60), bi_register(3), + BI_REGISTER_FORMAT_F16, BI_VECSIZE_V3); + CASE_ARCH(I, 10, 0x0078840423033c40); } TEST_F(ValhallPacking, Rhadd8) { - CASE(bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)), - bi_discard(bi_register(0)), BI_ROUND_RTP), - 0x00aac000400b4041); + bi_instr *I = bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)), + bi_discard(bi_register(0)), BI_ROUND_RTP); + CASE_ARCH(I, 10, 0x00aac000400b4041); } From 6a7aecaeecf1e79300777d9445348e7c10d0770f Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Thu, 12 Mar 2026 11:29:46 +0100 Subject: [PATCH 46/49] pan/va: Implement v15 encoding support Update va_pack to support the new encodings required by v15. --- .../compiler/bifrost/bifrost_compile.c | 2 +- src/panfrost/compiler/bifrost/valhall/ISA.xml | 12 + .../bifrost/valhall/test/test-packing.cpp | 85 +++ .../valhall/test/test-validate-fau.cpp | 8 +- .../compiler/bifrost/valhall/va_compiler.h | 13 +- .../compiler/bifrost/valhall/va_insert_flow.c | 2 +- .../compiler/bifrost/valhall/va_pack.c | 491 +++++++++++++++--- .../compiler/bifrost/valhall/va_validate.c | 25 +- 8 files changed, 539 insertions(+), 99 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index c0b551425e2..687a22979ba 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -4531,7 +4531,7 @@ bi_compile_variant_nir(nir_shader *nir, va_lower_constants(ctx, I, const_hist, min_count_for_fau); bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - va_repair_fau(&b, I); + va_repair_fau(&b, I, ctx->arch); } _mesa_hash_table_u64_destroy(const_hist); diff --git 
a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 6fc6e0d12de..53ddbc06856 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -597,6 +597,18 @@ slot7 + + + Dependency slot set on a message-passing instruction that writes to + registers. Before reading the destination, a future instruction must wait + on the specified slot. Slot #7 is for `BARRIER` instructions only. + + slot0 + slot1 + slot2 + slot7 + + Memory access hint for a `LOAD` or `STORE` instruction. none diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp index 3b92c96087f..44b8257b583 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp @@ -48,10 +48,12 @@ TEST_F(ValhallPacking, Moves) { bi_instr *I = bi_mov_i32_to(b, bi_register(1), bi_register(2)); CASE_ARCH(I, 10, 0x0091c10000000002ULL); + CASE_ARCH(I, 15, 0x0060010000200002ULL); I = bi_mov_i32_to(b, bi_register(1), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false)); CASE_ARCH(I, 10, 0x0091c1000000008aULL); + CASE_ARCH(I, 15, 0x006101000020000aULL); } TEST_F(ValhallPacking, Fadd) @@ -59,44 +61,55 @@ TEST_F(ValhallPacking, Fadd) bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)); CASE_ARCH(I, 10, 0x00a4c00000000201ULL); + CASE_ARCH(I, 15, 0x00f0000000000201ULL); I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))); CASE_ARCH(I, 10, 0x00a4c02000000201ULL); + CASE_ARCH(I, 15, 0x00f0002000000201ULL); I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))); CASE_ARCH(I, 10, 0x00a4c01000000201ULL); + CASE_ARCH(I, 15, 0x00f0001000000201ULL); I = bi_fadd_v2f16_to(b, bi_register(0), bi_swz_16(bi_register(1), false, false), bi_swz_16(bi_register(0), true, true)); CASE_ARCH(I, 10, 0x00a5c0000c000001ULL); + 
CASE_ARCH(I, 15, 0x00f400000c000001ULL); I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)); CASE_ARCH(I, 10, 0x00a5c00028000001ULL); + CASE_ARCH(I, 15, 0x00f4000028000001ULL); I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_swz_16(bi_register(0), true, false)); CASE_ARCH(I, 10, 0x00a5c00024000001ULL); + CASE_ARCH(I, 15, 0x00f4000024000001ULL); I = bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))), bi_neg(zero)); CASE_ARCH(I, 10, 0x00a5c0902800c040ULL); + CASE_ARCH(I, 15, 0x00f600902800c080ULL); I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero); CASE_ARCH(I, 10, 0x00a4c0000000c001ULL); + CASE_ARCH(I, 15, 0x00f200000000c001ULL); I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero)); CASE_ARCH(I, 10, 0x00a4c0100000c001ULL); + CASE_ARCH(I, 15, 0x00f200100000c001ULL); I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_half(bi_register(0), true)); CASE_ARCH(I, 10, 0x00a4c00008000001ULL); + CASE_ARCH(I, 15, 0x00f0000008000001ULL); I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_half(bi_register(0), false)); CASE_ARCH(I, 10, 0x00a4c00004000001ULL); + CASE_ARCH(I, 15, 0x00f0000004000001ULL); } TEST_F(ValhallPacking, Clper) @@ -105,6 +118,7 @@ TEST_F(ValhallPacking, Clper) bi_byte(n4567, 0), BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP16); CASE_ARCH(I, 10, 0x00a0c030128fc900); + CASE_ARCH(I, 15, 0x00e20030028fc900); } TEST_F(ValhallPacking, Clamps) @@ -112,9 +126,11 @@ TEST_F(ValhallPacking, Clamps) bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_abs(bi_register(2)))); CASE_ARCH(I, 10, 0x00a4c03000000201ULL); + CASE_ARCH(I, 15, 0x00f0003000000201ULL); I->clamp = BI_CLAMP_CLAMP_M1_1; CASE_ARCH(I, 10, 0x00a4c03200000201ULL); + CASE_ARCH(I, 15, 0x00f0003080000201ULL); } TEST_F(ValhallPacking, Misc) @@ -123,18 +139,22 @@ TEST_F(ValhallPacking, Misc) b, bi_register(1), bi_discard(bi_register(1)), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false), 
bi_neg(zero)); CASE_ARCH(I, 10, 0x00b2c10400c08841ULL); + CASE_ARCH(I, 15, 0x0166010400c00881ULL); I = bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))), BI_ROUND_RTN); CASE_ARCH(I, 10, 0x0090c240800d0042ULL); + CASE_ARCH(I, 15, 0x00600242004d0082ULL); I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0), BI_ROUND_RTN); CASE_ARCH(I, 10, 0x00904000a00f0000ULL); + /* Removed on v11 */ I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN); CASE_ARCH(I, 10, 0x00904000900f0001ULL); + /* Removed on v11 */ } TEST_F(ValhallPacking, FaddImm) @@ -142,10 +162,12 @@ TEST_F(ValhallPacking, FaddImm) bi_instr *I = bi_fadd_imm_f32_to(b, bi_register(2), bi_discard(bi_register(2)), 0x4847C6C0); CASE_ARCH(I, 10, 0x0114C24847C6C042ULL); + CASE_ARCH(I, 15, 0x0064024847c6c082ULL); I = bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), 0x70AC6784); CASE_ARCH(I, 10, 0x0115C270AC678442ULL); + CASE_ARCH(I, 15, 0x00620270ac678482ULL); } TEST_F(ValhallPacking, Comparions) @@ -155,12 +177,14 @@ TEST_F(ValhallPacking, Comparions) bi_discard(bi_swz_16(bi_register(2), true, false)), zero, BI_CMPF_GT, BI_RESULT_TYPE_M1); CASE_ARCH(I, 10, 0x00f9c21184c04243); + CASE_ARCH(I, 15, 0x01e40212c6c08283); I = bi_fcmp_or_v2f16_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(3), true, false)), bi_discard(bi_swz_16(bi_register(2), false, false)), zero, BI_CMPF_GT, BI_RESULT_TYPE_M1); CASE_ARCH(I, 10, 0x00f5c20190c04243); + CASE_ARCH(I, 15, 0x01e4020352c08283); } TEST_F(ValhallPacking, Conversions) @@ -168,6 +192,7 @@ TEST_F(ValhallPacking, Conversions) bi_instr *I = bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))); CASE_ARCH(I, 10, 0x0090c22000070042); + /* Removed on v11 */ } TEST_F(ValhallPacking, BranchzI16) @@ -176,6 +201,7 @@ TEST_F(ValhallPacking, BranchzI16) bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ); I->branch_offset = 1; 
CASE_ARCH(I, 10, 0x001fc03000000102); + CASE_ARCH(I, 15, 0x02b8003000000102); } TEST_F(ValhallPacking, BranchzI16Backwards) @@ -183,6 +209,7 @@ TEST_F(ValhallPacking, BranchzI16Backwards) bi_instr *I = bi_branchz_i16(b, zero, bi_null(), BI_CMPF_EQ); I->branch_offset = -8; CASE_ARCH(I, 10, 0x001fc017fffff8c0); + CASE_ARCH(I, 15, 0x02b90017fffff8c0); } TEST_F(ValhallPacking, Blend) @@ -192,6 +219,7 @@ TEST_F(ValhallPacking, Blend) bi_fau(BIR_FAU_BLEND_0, false), bi_fau(BIR_FAU_BLEND_0, true), bi_null(), BI_REGISTER_FORMAT_F16, 2, 0); CASE_ARCH(I, 10, 0x007f4004333c00f0); + CASE_ARCH(I, 15, 0x031b0082333c00f0); } TEST_F(ValhallPacking, Mux) @@ -200,6 +228,7 @@ TEST_F(ValhallPacking, Mux) b, bi_register(0), bi_discard(bi_register(0)), bi_discard(bi_register(4)), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), BI_MUX_BIT); CASE_ARCH(I, 10, 0x00b8c00300804440ull); + CASE_ARCH(I, 15, 0x017c000c80008480ull); } TEST_F(ValhallPacking, AtestFP16) @@ -208,6 +237,7 @@ TEST_F(ValhallPacking, AtestFP16) bi_half(bi_register(1), true), bi_fau(BIR_FAU_ATEST_PARAM, false)); CASE_ARCH(I, 10, 0x007dbc0208ea013c); + CASE_ARCH(I, 15, 0x03d43c0108ea013c); } TEST_F(ValhallPacking, AtestFP32) @@ -215,6 +245,7 @@ TEST_F(ValhallPacking, AtestFP32) bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60), one, bi_fau(BIR_FAU_ATEST_PARAM, false)); CASE_ARCH(I, 10, 0x007dbc0200ead03c); + CASE_ARCH(I, 15, 0x03d63c0100ead03c); } TEST_F(ValhallPacking, Transcendentals) @@ -222,18 +253,28 @@ TEST_F(ValhallPacking, Transcendentals) bi_instr *I = bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true); CASE_ARCH(I, 10, 0x0099c10001000000); + CASE_ARCH(I, 15, 0x0060010041200000); I = bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false, true); CASE_ARCH(I, 10, 0x0099c00001020040); + CASE_ARCH(I, 15, 0x0060000041220080); I = bi_frsq_f32_to(b, bi_register(2), bi_register(1)); CASE_ARCH(I, 10, 0x009cc20000020001); + CASE_ARCH(I, 15, 0x0060020001820001); I = 
bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(2)), bi_neg(zero), bi_discard(bi_register(0)), BI_SPECIAL_LEFT); CASE_ARCH(I, 10, 0x0162c00440c04241); + CASE_ARCH(I, 15, 0x0264000e80c08281); + + I = bi_fma_rscale_f32_to(b, bi_register(0), bi_register(1), bi_register(2), + bi_neg(zero), bi_discard(bi_register(0)), + BI_SPECIAL_N); + CASE_ARCH(I, 10, 0x0161c00440c00201); + CASE_ARCH(I, 15, 0x0264000d80c00201); } TEST_F(ValhallPacking, Csel) @@ -243,18 +284,21 @@ TEST_F(ValhallPacking, Csel) bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_EQ); CASE_ARCH(I, 10, 0x0150c10085844342); + CASE_ARCH(I, 15, 0x027c010005048382); I = bi_csel_u32_to( b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT); CASE_ARCH(I, 10, 0x0150c10485844342); + CASE_ARCH(I, 15, 0x027c010805048382); I = bi_csel_s32_to( b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT); CASE_ARCH(I, 10, 0x0158c10485844342); + CASE_ARCH(I, 15, 0x027c014805048382); } TEST_F(ValhallPacking, LdAttrImm) @@ -265,6 +309,7 @@ TEST_F(ValhallPacking, LdAttrImm) I->table = 1; CASE_ARCH(I, 10, 0x0066800433117d7c); + CASE_ARCH(I, 15, 0x038400023311bdbc); } TEST_F(ValhallPacking, LdVarBufImmF16) @@ -274,12 +319,14 @@ TEST_F(ValhallPacking, LdVarBufImmF16) BI_SAMPLE_CENTER, BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, BI_VECSIZE_V4, 0); CASE_ARCH(I, 10, 0x005d82143300003d); + CASE_ARCH(I, 15, 0x0310020a3f00003d); I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE, BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, BI_VECSIZE_V4, 0); CASE_ARCH(I, 10, 0x005d80843300003d); + CASE_ARCH(I, 15, 0x031000423f00003d); I = 
bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID, @@ -287,6 +334,7 @@ TEST_F(ValhallPacking, LdVarBufImmF16) BI_VECSIZE_V4, 8); CASE_ARCH(I, 10, 0x005d80443308003d); CASE_ARCH(I, 11, 0x005d80443300083d); + CASE_ARCH(I, 15, 0x031000223f00083d); } TEST_F(ValhallPacking, LdVarBufFlatImmFormat) @@ -294,10 +342,12 @@ TEST_F(ValhallPacking, LdVarBufFlatImmFormat) bi_instr *I = bi_ld_var_buf_flat_imm_to( b, bi_register(0), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 0x12); CASE_ARCH(I, 14, 0x0040800832001200); + CASE_ARCH(I, 15, 0x033900043a0012c0); I = bi_ld_var_buf_flat_imm_to(b, bi_register(0), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 0x12); CASE_ARCH(I, 14, 0x0040800433001200); + CASE_ARCH(I, 15, 0x033900023b0012c0); } TEST_F(ValhallPacking, LdVarBufFlat) @@ -305,10 +355,12 @@ TEST_F(ValhallPacking, LdVarBufFlat) bi_instr *I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); CASE_ARCH(I, 14, 0x005f80083200003d); + CASE_ARCH(I, 15, 0x031400043a00003d); I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4); CASE_ARCH(I, 14, 0x005f80043300003d); + CASE_ARCH(I, 15, 0x031400023b00003d); } TEST_F(ValhallPacking, LeaBufImm) @@ -316,6 +368,7 @@ TEST_F(ValhallPacking, LeaBufImm) bi_instr *I = bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))); CASE_ARCH(I, 10, 0x005e84040000007b); + CASE_ARCH(I, 15, 0x03080402000000bb); } TEST_F(ValhallPacking, StoreMemoryAccess) @@ -324,6 +377,7 @@ TEST_F(ValhallPacking, StoreMemoryAccess) bi_discard(bi_register(5)), BI_SEG_NONE, 0); I->mem_access = VA_MEMORY_ACCESS_ESTREAM; CASE_ARCH(I, 10, 0x0061400632000044); + CASE_ARCH(I, 15, 0x0320009302000084); } TEST_F(ValhallPacking, Convert16To32) @@ -331,26 +385,32 @@ TEST_F(ValhallPacking, Convert16To32) bi_instr *I = bi_u16_to_u32_to(b, bi_register(2), bi_discard(bi_half(bi_register(55), false))); CASE_ARCH(I, 10, 0x0090c20000140077); + 
CASE_ARCH(I, 15, 0x00600200005400b7); I = bi_u16_to_u32_to(b, bi_register(2), bi_discard(bi_half(bi_register(55), true))); CASE_ARCH(I, 10, 0x0090c20010140077); + CASE_ARCH(I, 15, 0x00600200105400b7); I = bi_u16_to_f32_to(b, bi_register(2), bi_discard(bi_half(bi_register(55), false))); CASE_ARCH(I, 10, 0x0090c20000150077); + /* Removed on v11 */ I = bi_u16_to_f32_to(b, bi_register(2), bi_discard(bi_half(bi_register(55), true))); CASE_ARCH(I, 10, 0x0090c20010150077); + /* Removed on v11 */ I = bi_s16_to_s32_to(b, bi_register(2), bi_discard(bi_half(bi_register(55), false))); CASE_ARCH(I, 10, 0x0090c20000040077); + CASE_ARCH(I, 15, 0x00600200004400b7); I = bi_s16_to_s32_to(b, bi_register(2), bi_discard(bi_half(bi_register(55), true))); CASE_ARCH(I, 10, 0x0090c20010040077); + CASE_ARCH(I, 15, 0x00600200104400b7); } TEST_F(ValhallPacking, Swizzle8) @@ -359,6 +419,7 @@ TEST_F(ValhallPacking, Swizzle8) bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero, zero, BI_CMPF_NE, BI_RESULT_TYPE_I1); CASE_ARCH(I, 10, 0x00f2c14300c0c000); + /* Removed on v11 */ } TEST_F(ValhallPacking, FauPage1) @@ -366,6 +427,7 @@ TEST_F(ValhallPacking, FauPage1) bi_instr *I = bi_mov_i32_to( b, bi_register(1), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)); CASE_ARCH(I, 10, 0x0291c10000000080ULL); + CASE_ARCH(I, 15, 0x0061010000200040ULL); } TEST_F(ValhallPacking, LdTileV3F16) @@ -374,6 +436,7 @@ TEST_F(ValhallPacking, LdTileV3F16) bi_register(60), bi_register(3), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V3); CASE_ARCH(I, 10, 0x0078840423033c40); + CASE_ARCH(I, 15, 0x03c0040223033c80); } TEST_F(ValhallPacking, Rhadd8) @@ -381,4 +444,26 @@ TEST_F(ValhallPacking, Rhadd8) bi_instr *I = bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(0)), BI_ROUND_RTP); CASE_ARCH(I, 10, 0x00aac000400b4041); + /* Removed on v11 */ +} + +TEST_F(ValhallPacking, Atomics) +{ + + bi_instr *I = + bi_atom1_return_i64_to(b, bi_register(0), bi_discard(bi_register(2)), + 
bi_register(3), BI_ATOM_OPC_AINC, 2); + CASE_ARCH(I, 10, 0x0069800428000042); + CASE_ARCH(I, 15, 0x0328000220000082); + + I = bi_atom_return_i32_to(b, bi_register(0), bi_discard(bi_register(1)), + bi_register(2), bi_register(3), BI_ATOM_OPC_AXCHG, + 1); + CASE_ARCH(I, 10, 0x0120c1021bc00002); + CASE_ARCH(I, 15, 0x032401c10f000002); + + I = bi_atom_return_i64_to(b, bi_register(0), bi_register(2), bi_register(6), + bi_register(7), BI_ATOM_OPC_ACMPXCHG, 2); + CASE_ARCH(I, 10, 0x0120c2182fc00006); + CASE_ARCH(I, 15, 0x032802cc2f000006); } diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-validate-fau.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-validate-fau.cpp index e4a0945f1f6..6c3f1f44905 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-validate-fau.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-validate-fau.cpp @@ -9,9 +9,9 @@ #include -#define CASE(instr, expected) \ +#define CASE_ARCH(instr, arch, expected) \ do { \ - if (va_validate_fau(instr) != expected) { \ + if (va_validate_fau(instr, arch) != expected) { \ fprintf(stderr, "Incorrect validation for:\n"); \ bi_print_instr(instr, stderr); \ fprintf(stderr, "\n"); \ @@ -19,8 +19,8 @@ } \ } while (0) -#define VALID(instr) CASE(instr, true) -#define INVALID(instr) CASE(instr, false) +#define VALID(instr) CASE_ARCH(instr, 10, true) +#define INVALID(instr) CASE_ARCH(instr, 10, false) class ValidateFau : public testing::Test { protected: diff --git a/src/panfrost/compiler/bifrost/valhall/va_compiler.h b/src/panfrost/compiler/bifrost/valhall/va_compiler.h index 622ab81b302..5a227c80412 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_compiler.h +++ b/src/panfrost/compiler/bifrost/valhall/va_compiler.h @@ -13,9 +13,9 @@ extern "C" { #endif -bool va_validate_fau(bi_instr *I); +bool va_validate_fau(bi_instr *I, unsigned arch); void va_validate(FILE *fp, bi_context *ctx); -void va_repair_fau(bi_builder *b, bi_instr *I); +void va_repair_fau(bi_builder *b, bi_instr *I, 
unsigned arch); void va_fuse_add_imm(bi_instr *I); void va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts, uint32_t min_fau_count); void va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts); @@ -28,14 +28,15 @@ void va_gather_hsr_info(bi_context *ctx, struct pan_shader_info *info); uint64_t va_pack_instr(const bi_instr *I, unsigned arch); static inline unsigned -va_fau_page(enum bir_fau value) +va_fau_page(enum bir_fau value, unsigned arch) { /* Uniform slots of FAU have a 7-bit index. The top 2-bits are the page; the * bottom 5-bits are specified in the source. */ if (value & BIR_FAU_UNIFORM) { + unsigned value_shift = arch >= 15 ? 6 : 5; unsigned slot = value & ~BIR_FAU_UNIFORM; - unsigned page = slot >> 5; + unsigned page = slot >> value_shift; assert(page <= 3); return page; @@ -57,11 +58,11 @@ va_fau_page(enum bir_fau value) } static inline unsigned -va_select_fau_page(const bi_instr *I) +va_select_fau_page(const bi_instr *I, unsigned arch) { bi_foreach_src(I, s) { if (I->src[s].type == BI_INDEX_FAU) - return va_fau_page((enum bir_fau)I->src[s].value); + return va_fau_page((enum bir_fau)I->src[s].value, arch); } return 0; diff --git a/src/panfrost/compiler/bifrost/valhall/va_insert_flow.c b/src/panfrost/compiler/bifrost/valhall/va_insert_flow.c index 9f3e7881ac3..c2812546067 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_insert_flow.c +++ b/src/panfrost/compiler/bifrost/valhall/va_insert_flow.c @@ -520,7 +520,7 @@ va_assign_slots(bi_context *ctx) bi_foreach_instr_global(ctx, I) { if (I->op == BI_OPCODE_BARRIER) { - I->slot = 7; + I->slot = (ctx->arch >= 15) ? 
VA_SLOT_V15_SLOT7 : VA_SLOT_SLOT7; } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) { I->slot = 0; } else if (bi_get_opcode_props(I)->message) { diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index 129512ce170..ea2e78e98e0 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -74,6 +74,15 @@ va_pack_reg(const bi_instr *I, bi_index idx) return idx.value; } +static unsigned +va_pack_reg_v15(const bi_instr *I, bi_index idx) +{ + pack_assert(I, idx.type == BI_INDEX_REGISTER); + pack_assert(I, idx.value < 128); + + return idx.value; +} + static unsigned va_pack_fau_special(const bi_instr *I, enum bir_fau fau) { @@ -124,6 +133,21 @@ va_pack_fau_64(const bi_instr *I, bi_index idx) return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1); } +static unsigned +va_pack_fau_64_v15(const bi_instr *I, bi_index idx) +{ + pack_assert(I, idx.type == BI_INDEX_FAU); + + unsigned val = (idx.value & BITFIELD_MASK(6)); + + if (idx.value & BIR_FAU_IMMEDIATE) + return (0x7 << 6) | (val << 1); + else if (idx.value & BIR_FAU_UNIFORM) + return (0x2 << 7) | (val << 1); + else + return (0xf << 5) | (va_pack_fau_special(I, idx.value) << 1); +} + static unsigned va_pack_src(const bi_instr *I, unsigned s) { @@ -142,6 +166,33 @@ va_pack_src(const bi_instr *I, unsigned s) invalid_instruction(I, "type of source %u", s); } +static uint64_t +va_pack_src_v15(const bi_instr *I, unsigned s, unsigned loc) +{ + bi_index idx = I->src[s]; + + uint64_t hex = 0; + uint64_t regval = 0; + + if (idx.type == BI_INDEX_REGISTER) { + regval = va_pack_reg_v15(I, idx); + if (idx.discard) + regval |= (1 << 7); + } else if (idx.type == BI_INDEX_FAU) { + pack_assert(I, idx.offset <= 1); + regval = va_pack_fau_64_v15(I, idx) | idx.offset; + } else + invalid_instruction(I, "type of source %u", s); + + uint64_t low8 = regval & 0xff; + uint64_t high1 = (regval >> 8) & 0x1; + + 
hex |= (low8 << (8 * loc)); + hex |= (high1 << (48 + loc)); + + return hex; +} + static unsigned va_pack_wrmask(const bi_instr *I) { @@ -211,6 +262,20 @@ va_pack_dest(const bi_instr *I) return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6); } +static unsigned +va_pack_dest_v15(const bi_instr *I) +{ + assert(I->nr_dests); + switch (I->op) { + case BI_OPCODE_SHADDX_S64: + case BI_OPCODE_SHADDX_U64: + /* 64 bit dest has a 0x0 wrmask */ + return va_pack_reg_v15(I, I->dest[0]); + default: + return va_pack_reg_v15(I, I->dest[0]) | (va_pack_wrmask(I) << 13); + } +} + static enum va_widen va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz) { @@ -452,6 +517,18 @@ va_pack_rhadd(const bi_instr *I) } } +static uint64_t +va_pack_clamp_special_round_v15(const bi_instr *I) +{ + pack_assert(I, I->special < 4); + if (I->special == BI_SPECIAL_N && I->round == BI_ROUND_RTZ) + return 0x4; + else if (I->special) + return 0x4 | I->special; + else + return I->clamp; +} + static uint64_t va_pack_alu(const bi_instr *I, unsigned arch) { @@ -465,25 +542,25 @@ va_pack_alu(const bi_instr *I, unsigned arch) case BI_OPCODE_FREXPM_F32: case BI_OPCODE_FREXPM_V2F16: if (I->sqrt) - hex |= 1ull << 24; + hex |= 1ull << ((arch >= 15) ? 30 : 24); if (I->log) - hex |= 1ull << 25; + hex |= 1ull << ((arch >= 15) ? 31 : 25); break; case BI_OPCODE_FLUSH_F32: case BI_OPCODE_FLUSH_V2F16: - hex |= I->nan_mode << 8; + hex |= I->nan_mode << ((arch >= 15) ? 30 : 8); if (I->ftz) - hex |= 1ull << 10; + hex |= 1ull << ((arch >= 15) ? 32 : 10); if (I->flush_inf) - hex |= 1ull << 11; + hex |= 1ull << ((arch >= 15) ? 33 : 11); break; /* Add mux type */ case BI_OPCODE_MUX_I32: case BI_OPCODE_MUX_V2I16: case BI_OPCODE_MUX_V4I8: - hex |= (uint64_t)I->mux << 32; + hex |= (uint64_t)I->mux << ((arch >= 15) ? 
34 : 32); break; /* Add .eq flag */ @@ -495,7 +572,7 @@ va_pack_alu(const bi_instr *I, unsigned arch) hex |= (1ull << 36); if (I->op == BI_OPCODE_BRANCHZI) - hex |= (0x1ull << 40); /* Absolute */ + hex |= (0x1ull << ((arch >= 15) ? 31 : 40)); /* Absolute */ else hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8; @@ -511,7 +588,46 @@ va_pack_alu(const bi_instr *I, unsigned arch) case BI_OPCODE_RSHIFT_XOR_I32: case BI_OPCODE_RSHIFT_XOR_V2I16: case BI_OPCODE_RSHIFT_XOR_V4I8: - hex |= (uint64_t)I->arithmetic << 34; + if (arch >= 15) { + /* Rewrite exact to ARSHIFT */ + if (I->arithmetic) { + switch (I->op) { + case BI_OPCODE_RSHIFT_AND_I32: + case BI_OPCODE_RSHIFT_AND_V2I16: + case BI_OPCODE_RSHIFT_AND_V4I8: { + uint64_t arshift_and_op = (0xcULL << 30); + /* Check that we can safely overwrite opcode */ + pack_assert(I, ((info.exact & (0xfULL << 30)) | + arshift_and_op) == arshift_and_op); + hex |= arshift_and_op; + break; + } + case BI_OPCODE_RSHIFT_OR_I32: + case BI_OPCODE_RSHIFT_OR_V2I16: + case BI_OPCODE_RSHIFT_OR_V4I8: { + uint64_t arshift_or_op = (0xdULL << 30); + /* Check that we can safely overwrite opcode */ + pack_assert(I, ((info.exact & (0xfULL << 30)) | arshift_or_op) == + arshift_or_op); + hex |= arshift_or_op; + break; + } + case BI_OPCODE_RSHIFT_XOR_I32: + case BI_OPCODE_RSHIFT_XOR_V2I16: + case BI_OPCODE_RSHIFT_XOR_V4I8: { + uint64_t arshift_xor_op = (0xbULL << 30); + /* Check that we can safely overwrite opcode */ + pack_assert(I, ((info.exact & (0xfULL << 30)) | + arshift_xor_op) == arshift_xor_op); + hex |= arshift_xor_op; + break; + } + default: + UNREACHABLE("RSHIFT->ARSHIFT"); + } + } + } else + hex |= (uint64_t)I->arithmetic << 34; break; case BI_OPCODE_LEA_BUF_IMM: @@ -562,8 +678,8 @@ va_pack_alu(const bi_instr *I, unsigned arch) } hex |= ((uint64_t)va_pack_source_format(I)) << 24; - hex |= ((uint64_t)I->update) << 36; - hex |= ((uint64_t)I->sample) << 38; + hex |= ((uint64_t)I->update) << ((arch >= 15) ? 
35 : 36); + hex |= ((uint64_t)I->sample) << ((arch >= 15) ? 37 : 38); break; case BI_OPCODE_LD_VAR_BUF_FLAT_IMM: @@ -601,20 +717,18 @@ va_pack_alu(const bi_instr *I, unsigned arch) break; } - /* FMA_RSCALE.f32 special modes treated as extra opcodes */ - if (I->op == BI_OPCODE_FMA_RSCALE_F32) { - pack_assert(I, I->special < 4); - hex |= ((uint64_t)I->special) << 48; - } - /* Add the normal destination or a placeholder. Staging destinations are * added elsewhere, as they require special handling for control fields. */ if (info.has_dest && info.nr_staging_dests == 0) { - hex |= (uint64_t)va_pack_dest(I) << 40; + if (arch >= 15) + hex |= (uint64_t)va_pack_dest_v15(I) << 40; + else + hex |= (uint64_t)va_pack_dest(I) << 40; } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) { pack_assert(I, I->nr_dests == 0); - hex |= 0xC0ull << 40; /* Placeholder */ + if (arch < 15) + hex |= 0xC0ull << 40; /* Placeholder */ } bool swap12 = va_swap_12(I->op); @@ -629,7 +743,10 @@ va_pack_alu(const bi_instr *I, unsigned arch) enum va_size size = src_info.size; bi_index src = I->src[logical_i + src_offset]; - hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i); + if (arch >= 15) + hex |= va_pack_src_v15(I, logical_i + src_offset, i); + else + hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i); if (src_info.notted) { if (src.neg) @@ -638,10 +755,15 @@ va_pack_alu(const bi_instr *I, unsigned arch) unsigned neg_offs = 32 + 2 + ((2 - i) * 2); unsigned abs_offs = 33 + 2 + ((2 - i) * 2); - if (src.neg) - hex |= 1ull << neg_offs; - if (src.abs) - hex |= 1ull << abs_offs; + if (arch >= 15 && I->op == BI_OPCODE_FMA_RSCALE_F32 && i == 2) { + if (src.neg) + hex |= 1ull << (neg_offs + 1); + } else { + if (src.neg) + hex |= 1ull << neg_offs; + if (src.abs) + hex |= 1ull << abs_offs; + } } else { if (src.neg) invalid_instruction(I, "negate"); @@ -661,8 +783,8 @@ va_pack_alu(const bi_instr *I, unsigned arch) unsigned offs = (i == 1) ? 
26 : 36; hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs; } else if (src_info.lane) { - unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ? - ((i == 0) ? 38 : 36) : ((i == 0) ? 28 : 26); + unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36) + : ((i == 0) ? 28 : 26); if (src_info.size == VA_SIZE_16) { hex |= (src.swizzle == BI_SWIZZLE_H1 ? 1 : 0) << offs; @@ -675,7 +797,25 @@ va_pack_alu(const bi_instr *I, unsigned arch) } else if (src_info.lanes) { pack_assert(I, src_info.size == VA_SIZE_8); pack_assert(I, i == 1); - hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26; + if (arch >= 15 && I->op == BI_OPCODE_CLPER_I32) { + switch (src.swizzle) { + case BI_SWIZZLE_B00: + hex |= 0x0ULL << 28; + break; + case BI_SWIZZLE_B11: + hex |= 0x1ULL << 28; + break; + case BI_SWIZZLE_B22: + hex |= 0x2ULL << 28; + break; + case BI_SWIZZLE_B33: + hex |= 0x3ULL << 28; + break; + default: + invalid_instruction(I, "lane shift"); + } + } else + hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26; } else if (src_info.combine) { /* Treat as swizzle, subgroup ops not yet supported */ pack_assert(I, src_info.size == VA_SIZE_32); @@ -691,17 +831,33 @@ va_pack_alu(const bi_instr *I, unsigned arch) } if (info.saturate) - hex |= (uint64_t)I->saturate << 30; - if (info.rhadd) + hex |= (uint64_t)I->saturate << ((arch >= 15) ? 
25 : 30); + if (info.rhadd) { + pack_assert(I, arch < 15); hex |= va_pack_rhadd(I); - if (info.clamp) - hex |= (uint64_t)I->clamp << 32; - if (info.round_mode) - hex |= (uint64_t)I->round << 30; + } + /* FMA_RSCALE.f32 special modes treated as extra opcodes */ + if (I->op == BI_OPCODE_FMA_RSCALE_F32) { + if (arch >= 15) { + hex |= va_pack_clamp_special_round_v15(I) << 32; + } else { + pack_assert(I, I->special < 4); + hex |= ((uint64_t)I->special) << 48; + if (info.clamp) + hex |= (uint64_t)I->clamp << 32; + if (info.round_mode && I->round == BI_ROUND_RTZ) + hex |= (uint64_t)0x1 << 50; + } + } else { + if (info.clamp) + hex |= (uint64_t)I->clamp << ((arch >= 15) ? 30 : 32); + if (info.round_mode) + hex |= (uint64_t)I->round << ((arch >= 15) ? 32 : 30); + } if (info.condition) - hex |= (uint64_t)I->cmpf << 32; + hex |= (uint64_t)I->cmpf << ((arch >= 15) ? 33 : 32); if (info.result_type) - hex |= (uint64_t)I->result_type << 30; + hex |= (uint64_t)I->result_type << ((arch >= 15) ? 24 : 30); return hex; } @@ -768,6 +924,26 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor) return hex; } + +static uint64_t +va_pack_load_v15(const bi_instr *I, bool buffer_descriptor) +{ + /* This implicitly means identity: VA_LOAD_LANE_8_BIT_B0 for i8 (bits[28;27]) + * and VA_LOAD_LANE_16_BIT_H0 for i16 (bit[27]) */ + uint64_t hex = 0; + + if (!buffer_descriptor) + hex |= va_pack_byte_offset(I); + + hex |= va_pack_src_v15(I, 0, 0); + hex |= (uint64_t)I->mem_access << 24; + + if (buffer_descriptor) + hex |= va_pack_src_v15(I, 1, 1); + + return hex; +} + static uint64_t va_pack_store(const bi_instr *I) { @@ -782,6 +958,20 @@ va_pack_store(const bi_instr *I) return hex; } +static uint64_t +va_pack_store_v15(const bi_instr *I) +{ + uint64_t hex = 0; + + va_validate_register_pair(I, 1); + hex |= va_pack_src_v15(I, 1, 0); + hex |= I->mem_access << 24; + + hex |= va_pack_byte_offset(I); + + return hex; +} + static enum va_lod_mode va_pack_lod_mode(const bi_instr *I) { @@ -824,13 
+1014,45 @@ va_pack_register_format(const bi_instr *I) } } +static uint64_t +va_pack_src_null_v15(unsigned loc) +{ + uint64_t hex = 0; + uint64_t regval = 0x1c0; + + uint64_t low8 = regval & 0xff; + uint64_t high1 = (regval >> 8) & 0x1; + + hex |= (low8 << (8 * loc)); + hex |= (high1 << (48 + loc)); + + return hex; +} + +static unsigned +va_repack_sr_control_v15(unsigned sr_control) +{ + unsigned repacked = 0; + bool read = sr_control & 0x1; + bool write = sr_control & 0x2; + + if (read) { + repacked |= 0x2; + if (write) + repacked |= 0x1; + } + + return repacked; +} + uint64_t va_pack_instr(const bi_instr *I, unsigned arch) { struct va_opcode_info info = get_valhall_opcode(I->op, arch); - uint64_t hex = info.exact | (((uint64_t)I->flow) << 59); - hex |= ((uint64_t)va_select_fau_page(I)) << 57; + uint64_t hex = + info.exact | (((uint64_t)I->flow) << ((arch >= 15) ? 58 : 59)); + hex |= ((uint64_t)va_select_fau_page(I, arch)) << ((arch >= 15) ? 62 : 57); if (info.slot) hex |= ((uint64_t)I->slot << 30); @@ -842,14 +1064,60 @@ va_pack_instr(const bi_instr *I, unsigned arch) unsigned count = read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0); - hex |= ((uint64_t)count << 33); - hex |= (uint64_t)va_pack_reg(I, sr) << 40; - hex |= ((uint64_t)info.sr_control << 46); + hex |= ((uint64_t)count << ((arch >= 15) ? 
32 : 33)); + if (arch >= 15) { + hex |= (uint64_t)va_pack_reg_v15(I, sr) << 40; + hex |= ((uint64_t)va_repack_sr_control_v15(info.sr_control) << 38); + } else { + hex |= (uint64_t)va_pack_reg(I, sr) << 40; + hex |= ((uint64_t)info.sr_control << 46); + } + } + + /* On v15, some instructions require special sr_control values */ + if (arch >= 15) { + switch (I->op) { + case BI_OPCODE_BARRIER: { + unsigned sr_control = va_repack_sr_control_v15(info.sr_control); + pack_assert(I, sr_control == 0x0 || sr_control == 0x2); + hex |= (uint64_t)0x2 << 38; + break; + } + case BI_OPCODE_ATOM1_RETURN_I32: + case BI_OPCODE_ATOM1_RETURN_I64: { + unsigned sr_control = va_repack_sr_control_v15(info.sr_control); + pack_assert(I, sr_control == 0x0); + break; + } + case BI_OPCODE_ATOM_I32: + case BI_OPCODE_ATOM_I64: { + unsigned sr_control = va_repack_sr_control_v15(info.sr_control); + pack_assert(I, sr_control == 0x2); + break; + } + case BI_OPCODE_ATOM_RETURN_I32: + case BI_OPCODE_ATOM_RETURN_I64: + case BI_OPCODE_AXCHG_I32: + case BI_OPCODE_AXCHG_I64: + case BI_OPCODE_ACMPXCHG_I32: + case BI_OPCODE_ACMPXCHG_I64: { + unsigned sr_control = va_repack_sr_control_v15(info.sr_control); + pack_assert(I, sr_control == 0x0 || sr_control == 0x3); + hex |= (uint64_t)0x3 << 38; + break; + } + default: + break; + } } if (info.sr_write_count) { - hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36; - hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16; + hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) + << ((arch >= 15) ? 
35 : 36); + if (arch >= 15) + hex |= ((uint64_t)va_pack_reg_v15(I, I->dest[0])) << 16; + else + hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16; } if (info.vecsize) @@ -867,7 +1135,10 @@ va_pack_instr(const bi_instr *I, unsigned arch) case BI_OPCODE_LOAD_I64: case BI_OPCODE_LOAD_I96: case BI_OPCODE_LOAD_I128: - hex |= va_pack_load(I, false); + if (arch >= 15) + hex |= va_pack_load_v15(I, false); + else + hex |= va_pack_load(I, false); break; case BI_OPCODE_LD_PKA_I8: @@ -878,7 +1149,10 @@ va_pack_instr(const bi_instr *I, unsigned arch) case BI_OPCODE_LD_PKA_I64: case BI_OPCODE_LD_PKA_I96: case BI_OPCODE_LD_PKA_I128: - hex |= va_pack_load(I, true); + if (arch >= 15) + hex |= va_pack_load_v15(I, true); + else + hex |= va_pack_load(I, true); break; case BI_OPCODE_STORE_I8: @@ -889,20 +1163,26 @@ va_pack_instr(const bi_instr *I, unsigned arch) case BI_OPCODE_STORE_I64: case BI_OPCODE_STORE_I96: case BI_OPCODE_STORE_I128: - hex |= va_pack_store(I); + if (arch >= 15) + hex |= va_pack_store_v15(I); + else + hex |= va_pack_store(I); break; case BI_OPCODE_ATOM1_RETURN_I64: /* Permit omitting the destination for plain ATOM1 */ - if (!bi_count_write_registers(I, 0)) { + if (arch < 15 && !bi_count_write_registers(I, 0)) { hex |= (0x40ull << 40); // fake read } /* 64-bit source */ va_validate_register_pair(I, 0); - hex |= (uint64_t)va_pack_src(I, 0) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 0, 0); + else + hex |= (uint64_t)va_pack_src(I, 0) << 0; hex |= va_pack_byte_offset_8(I); - hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22; + hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 
24 : 22); break; case BI_OPCODE_ACMPXCHG_I64: @@ -911,29 +1191,43 @@ va_pack_instr(const bi_instr *I, unsigned arch) case BI_OPCODE_ATOM_RETURN_I64: /* 64-bit source */ va_validate_register_pair(I, 1); - hex |= (uint64_t)va_pack_src(I, 1) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 1, 0); + else + hex |= (uint64_t)va_pack_src(I, 1) << 0; hex |= va_pack_byte_offset_8(I); - hex |= ((uint64_t)va_pack_atom_opc(I)) << 22; + hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22); - if (I->op == BI_OPCODE_ATOM_RETURN_I64) - hex |= (0xc0ull << 40); // flags + if (arch >= 15) { + if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) { + /* Change bits [51;50] to be ACMPXCHG */ + pack_assert(I, ((hex >> 50) & 0b11) == 0b01); + hex ^= (0b11ull << 50); + } + } else { + if (I->op == BI_OPCODE_ATOM_RETURN_I64) + hex |= (0xc0ull << 40); // flags - if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) - hex |= (1 << 26); /* .compare */ + if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) + hex |= (1 << 26); /* .compare */ + } break; case BI_OPCODE_ATOM1_RETURN_I32: /* Permit omitting the destination for plain ATOM1 */ - if (!bi_count_write_registers(I, 0)) { + if (arch < 15 && !bi_count_write_registers(I, 0)) { hex |= (0x40ull << 40); // fake read } /* 64-bit source */ va_validate_register_pair(I, 0); - hex |= (uint64_t)va_pack_src(I, 0) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 0, 0); + else + hex |= (uint64_t)va_pack_src(I, 0) << 0; hex |= va_pack_byte_offset_8(I); - hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22; + hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 
24 : 22); break; case BI_OPCODE_ACMPXCHG_I32: @@ -942,41 +1236,67 @@ va_pack_instr(const bi_instr *I, unsigned arch) case BI_OPCODE_ATOM_RETURN_I32: /* 64-bit source */ va_validate_register_pair(I, 1); - hex |= (uint64_t)va_pack_src(I, 1) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 1, 0); + else + hex |= (uint64_t)va_pack_src(I, 1) << 0; hex |= va_pack_byte_offset_8(I); - hex |= ((uint64_t)va_pack_atom_opc(I)) << 22; + hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22); - if (I->op == BI_OPCODE_ATOM_RETURN_I32) - hex |= (0xc0ull << 40); // flags + if (arch >= 15) { + if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) { + /* Change bits [51;50] to be ACMPXCHG */ + pack_assert(I, ((hex >> 50) & 0b11) == 0b01); + hex ^= (0b11ull << 50); + } + } else { + if (I->op == BI_OPCODE_ATOM_RETURN_I32) + hex |= (0xc0ull << 40); // flags - if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) - hex |= (1 << 26); /* .compare */ + if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) + hex |= (1 << 26); /* .compare */ + } break; case BI_OPCODE_LD_CVT: - hex |= (uint64_t)va_pack_src(I, 0); + if (arch >= 15) + hex |= va_pack_src_v15(I, 0, 0); + else + hex |= (uint64_t)va_pack_src(I, 0); hex |= va_pack_byte_offset(I); /* Conversion descriptor */ - hex |= (uint64_t)va_pack_src(I, 2) << 16; - hex |= (uint64_t)I->mem_access << 37; + if (arch >= 15) + hex |= va_pack_src_v15(I, 2, 2); + else + hex |= (uint64_t)va_pack_src(I, 2) << 16; + hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 
35 : 37); break; case BI_OPCODE_ST_CVT: /* Staging read */ va_validate_register_pair(I, 1); - hex |= (uint64_t)va_pack_src(I, 1) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 1, 0); + else + hex |= (uint64_t)va_pack_src(I, 1) << 0; hex |= va_pack_byte_offset(I); /* Conversion descriptor */ - hex |= (uint64_t)va_pack_src(I, 3) << 16; - hex |= (uint64_t)I->mem_access << 37; + if (arch >= 15) + hex |= va_pack_src_v15(I, 3, 2); + else + hex |= (uint64_t)va_pack_src(I, 3) << 16; + hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37); break; case BI_OPCODE_BLEND: { /* Source 0 - Blend descriptor (64-bit) */ - hex |= ((uint64_t)va_pack_src(I, 2)) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 2, 0); + else + hex |= ((uint64_t)va_pack_src(I, 2)) << 0; va_validate_register_pair(I, 2); /* Target */ @@ -987,7 +1307,10 @@ va_pack_instr(const bi_instr *I, unsigned arch) hex |= ((I->branch_offset >> 3) << 8); /* Source 2 - coverage mask */ - hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16; + if (arch >= 15) + hex |= va_pack_src_v15(I, 1, 2); + else + hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16; /* Vector size */ unsigned vecsize = 4; @@ -997,7 +1320,7 @@ va_pack_instr(const bi_instr *I, unsigned arch) } case BI_OPCODE_LD_GCLK_U64: - hex |= va_pack_gclk(I); + hex |= va_pack_gclk(I) << ((arch >= 15) ? 8 : 0); break; case BI_OPCODE_TEX_GRADIENT: @@ -1005,7 +1328,10 @@ va_pack_instr(const bi_instr *I, unsigned arch) case BI_OPCODE_TEX_FETCH: case BI_OPCODE_TEX_GATHER: { /* Image to read from */ - hex |= ((uint64_t)va_pack_src(I, 1)) << 0; + if (arch >= 15) + hex |= va_pack_src_v15(I, 1, 0); + else + hex |= ((uint64_t)va_pack_src(I, 1)) << 0; if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) && I->shadow) @@ -1022,7 +1348,7 @@ va_pack_instr(const bi_instr *I, unsigned arch) if (I->skip) hex |= (1ull << 39); if (!bi_is_regfmt_16(I->register_format)) - hex |= (1ull << 46); + hex |= (1ull << ((arch >= 15) ? 
38 : 46)); if (I->op == BI_OPCODE_TEX_GRADIENT) { if (I->force_delta_enable) @@ -1044,20 +1370,35 @@ va_pack_instr(const bi_instr *I, unsigned arch) hex |= ((uint64_t)I->fetch_component) << 14; } - hex |= (I->write_mask << 22); + hex |= (I->write_mask << ((arch >= 15) ? 24 : 22)); hex |= ((uint64_t)I->dimension) << 28; break; } default: - if (!info.exact && I->op != BI_OPCODE_NOP) + if (!info.exact && (arch >= 15 || I->op != BI_OPCODE_NOP)) invalid_instruction(I, "opcode"); hex |= va_pack_alu(I, arch); break; } + /* On v15, some instrutions require an encoded null src. */ + if (arch >= 15) { + switch (I->op) { + case BI_OPCODE_NOP: + case BI_OPCODE_LD_VAR_FLAT_IMM: + case BI_OPCODE_LD_VAR_BUF_FLAT_IMM: + case BI_OPCODE_LD_GCLK_U64: + case BI_OPCODE_BARRIER: + hex |= va_pack_src_null_v15(0); + break; + default: + break; + } + } + return hex; } diff --git a/src/panfrost/compiler/bifrost/valhall/va_validate.c b/src/panfrost/compiler/bifrost/valhall/va_validate.c index b597692eb00..da32405849d 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_validate.c +++ b/src/panfrost/compiler/bifrost/valhall/va_validate.c @@ -93,7 +93,8 @@ fau_state_uniform(struct fau_state *fau, bi_index idx, enum bi_opcode op) } static bool -fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op) +fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op, + unsigned arch) { for (unsigned i = 0; i < ARRAY_SIZE(fau->buffer); ++i) { bi_index buf = fau->buffer[i]; @@ -106,7 +107,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op) /* Instructions executed by the messaging unit should not encode WARP_ID or * anything from special page 3. 
*/ if (can_run_on_message_unit(op) && - (va_fau_page(idx.value) == 3 || idx.value == BIR_FAU_WARP_ID)) + (va_fau_page(idx.value, arch) == 3 || idx.value == BIR_FAU_WARP_ID)) return false; return fau->uniform_slot == -1 || can_use_two_fau_indices(op); @@ -114,7 +115,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op) static bool valid_src(struct fau_state *fau, unsigned fau_page, bi_index src, - enum bi_opcode op) + enum bi_opcode op, unsigned arch) { if (src.type != BI_INDEX_FAU) return true; @@ -128,42 +129,42 @@ valid_src(struct fau_state *fau, unsigned fau_page, bi_index src, return fau_state_buffer(fau, src); } - bool valid = (fau_page == va_fau_page(src.value)); + bool valid = (fau_page == va_fau_page(src.value, arch)); valid &= fau_state_buffer(fau, src); if (src.value & BIR_FAU_UNIFORM) valid &= fau_state_uniform(fau, src, op); else if (fau_is_special(src.value)) - valid &= fau_state_special(fau, src, op); + valid &= fau_state_special(fau, src, op, arch); return valid; } bool -va_validate_fau(bi_instr *I) +va_validate_fau(bi_instr *I, unsigned arch) { bool valid = true; struct fau_state fau = {.uniform_slot = -1}; - unsigned fau_page = va_select_fau_page(I); + unsigned fau_page = va_select_fau_page(I, arch); bi_foreach_src(I, s) { - valid &= valid_src(&fau, fau_page, I->src[s], I->op); + valid &= valid_src(&fau, fau_page, I->src[s], I->op, arch); } return valid; } void -va_repair_fau(bi_builder *b, bi_instr *I) +va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch) { struct fau_state fau = {.uniform_slot = -1}; - unsigned fau_page = va_select_fau_page(I); + unsigned fau_page = va_select_fau_page(I, arch); bi_foreach_src(I, s) { struct fau_state push = fau; bi_index src = I->src[s]; - if (!valid_src(&fau, fau_page, src, I->op)) { + if (!valid_src(&fau, fau_page, src, I->op, arch)) { bi_replace_src(I, s, bi_mov_i32(b, bi_strip_index(src))); /* Rollback update. 
Since the replacement move doesn't affect FAU @@ -180,7 +181,7 @@ va_validate(FILE *fp, bi_context *ctx) bool errors = false; bi_foreach_instr_global(ctx, I) { - if (!va_validate_fau(I)) { + if (!va_validate_fau(I, ctx->arch)) { if (!errors) { fprintf(fp, "Validation failed, this is a bug. Shader:\n\n"); bi_print_shader(ctx, fp); From 842a9a7e7c0d64dda2510dbc71ed76f850918e80 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Thu, 16 Apr 2026 12:42:13 +0200 Subject: [PATCH 47/49] pan/va: Implement v15 disassembly support This is currently implemented by adding specific _v15 data structures where required. --- src/panfrost/compiler/bifrost/valhall/ISA.xml | 155 ++++++++++++++++-- .../compiler/bifrost/valhall/disasm.py | 124 ++++++++++++-- .../compiler/bifrost/valhall/valhall.c.py | 4 +- .../compiler/bifrost/valhall/valhall.py | 94 ++++++++++- 4 files changed, 340 insertions(+), 37 deletions(-) diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 53ddbc06856..772deb1e485 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -459,6 +459,20 @@ clamp_0_1 + + + Encoded clamp/special/round for v15. + + none + clamp_0_inf + clamp_m1_1 + clamp_0_1 + n_round_zero + n + left + n_add + + Condition code. Type must be inferred from the instruction. IEEE 754 total @@ -790,6 +804,83 @@ quiet_nan + + + Make rshift_and instructions signed. + + none + + + + + + + + + + + + signed + + + + + + + + Make rshift_or instructions signed. + + + + + + none + + + + + + + + + signed + + + + + + + Make rshift_xor instructions signed. + + + + + + + + + + none + + + signed + + + + + + + + + Make atomic instructions compare. 
+ + + none + + compare + + + Address to load from after adding offset Mode descriptor @@ -1507,7 +1604,9 @@ + + Byte offset Mode descriptor @@ -1536,6 +1635,7 @@ + Byte offset Mode descriptor @@ -1564,6 +1664,7 @@ + Byte offset Mode descriptor @@ -1592,6 +1693,7 @@ + Byte offset Mode descriptor @@ -1620,6 +1722,7 @@ + Byte offset Mode descriptor @@ -1648,6 +1751,7 @@ + Byte offset Mode descriptor @@ -1676,6 +1780,7 @@ + Byte offset Mode descriptor @@ -1741,7 +1846,9 @@ + + Address to load from after adding offset @@ -1766,7 +1873,9 @@ + + Address to load from after adding offset @@ -1792,6 +1901,7 @@ + Address to load from after adding offset @@ -1817,6 +1927,7 @@ + Address to load from after adding offset @@ -1842,6 +1953,7 @@ + Address to load from after adding offset @@ -1867,6 +1979,7 @@ + Address to load from after adding offset @@ -1892,6 +2005,7 @@ + Address to load from after adding offset @@ -1917,6 +2031,7 @@ + Address to load from after adding offset @@ -2089,6 +2204,7 @@ + @@ -2118,6 +2234,7 @@ + @@ -2263,7 +2380,9 @@ modifiers are set. Used to implement gl_FragDepth and gl_FragStencil. + + Updated coverage mask Depth value Stencil value @@ -2790,8 +2909,11 @@ to zero or signaling NaNs to quiet NaNs depending on the mode. + + + @@ -2840,7 +2962,9 @@ and square root computation respectively. + + @@ -3217,6 +3341,7 @@ `.second` is set (indicating the FATAN_TABLE.f32 instruction). + A B @@ -3440,7 +3565,9 @@ + + A B @@ -3476,6 +3603,7 @@ + A B @@ -3785,6 +3913,7 @@ it performs an unsigned right shift. + A shift @@ -3906,6 +4035,7 @@ it performs an unsigned right shift. + A shift @@ -4027,6 +4157,7 @@ it performs an unsigned right shift. + A shift @@ -4051,6 +4182,7 @@ `(A & mask) | (B & ~mask)`. + A B Mask @@ -4074,6 +4206,7 @@ `(A & mask) | (B & ~mask)`. + A B Mask @@ -4097,6 +4230,7 @@ `(A & mask) | (B & ~mask)`. 
+ A B Mask @@ -4716,6 +4850,7 @@ + @@ -4743,6 +4878,7 @@ + @@ -5180,7 +5316,6 @@ - First calculates $A \cdot B + C$ and then biases the exponent by D. Used in @@ -5189,7 +5324,8 @@ `FMA.f32` operations. Equivalent to `FMA.f32` back-to-back with `LDEXP.f32` - + + A B C @@ -5200,11 +5336,6 @@ - - - - - First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an @@ -5224,11 +5355,6 @@ - - - - - First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply is treated as $A$ even if an @@ -5248,11 +5374,6 @@ - - - - - First calculates $A \cdot B + C$ and then biases the exponent by D, interpreted as a 16-bit value. Used in special transcendental function diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py index d744d6bf45e..05e0facb9a1 100644 --- a/src/panfrost/compiler/bifrost/valhall/disasm.py +++ b/src/panfrost/compiler/bifrost/valhall/disasm.py @@ -28,6 +28,10 @@ template = """ #define VA_SRC_UNIFORM_TYPE 0x2 #define VA_SRC_IMM_TYPE 0x3 +#define VA_SRC_V15_MODE1 BIT(8) +#define VA_SRC_V15_MODE2 BIT(7) +#define VA_SRC_V15_MODE4 BIT(5) + % for name, en in ENUMS.items(): UNUSED static const char *valhall_${name}[] = { % for v in en.values: @@ -91,22 +95,84 @@ va_print_float_src(FILE *fp, unsigned type, unsigned value, unsigned size, unsig fprintf(fp, ".abs"); } +static inline void +va_print_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page) +{ + unsigned src = (high1 << 8) | low8; + + /* Not reg */ + if (src & VA_SRC_V15_MODE1) { + /* Not uniform */ + if (src & VA_SRC_V15_MODE2) { + /* FAU special */ + if (src & VA_SRC_V15_MODE4) { + unsigned value = src & MASK(5); + if (fau_page == 0) + fputs(valhall_fau_special_page_0[value >> 1] + 1, fp); + else if (fau_page == 1) + fputs(valhall_fau_special_page_1[value >> 1] + 1, fp); + else if (fau_page == 3) + 
fputs(valhall_fau_special_page_3[value >> 1] + 1, fp); + else + fprintf(fp, "reserved_page2"); + + fprintf(fp, ".w%u", value & 1); + } + /* Imm */ + else { + unsigned value = src & MASK(5); + assert(value < 32 && "overflow in LUT"); + fprintf(fp, "0x%X", va_immediates[value]); + } + } + /* Uniform */ + else { + unsigned value = src & MASK(7); + fprintf(fp, "u%u", value >> 1 | (fau_page << 6)); + if (size <= 32) + fprintf(fp, ".w%u", value & 1); + } + } + /* Reg */ + else { + unsigned value = src & MASK(7); + bool discard = (src & BIT(7)); + char *dmark = discard ? "^" : ""; + if (size > 32) + fprintf(fp, "[r%u%s:r%u%s]", value, dmark, value + 1, dmark); + else + fprintf(fp, "r%u%s", value, dmark); + } +} + +static inline void +va_print_float_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page, bool neg, bool abs) +{ + va_print_src_v15(fp, high1, low8, size, fau_page); + + if (neg) + fprintf(fp, ".neg"); + + if (abs) + fprintf(fp, ".abs"); +} + static inline void va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) { if (size > 32) fprintf(fp, "[r%u:r%u]", value, value + 1); - else + else { fprintf(fp, "r%u", value); - - if (mask != 0x3) - fprintf(fp, ".h%u", (mask == 1) ? 0 : 1); + if (mask != 0x3) + fprintf(fp, ".h%u", (mask == 1) ? 
0 : 1); + } } -<%def name="print_instr(op)"> +<%def name="print_instr(op, v15)"> <% no_comma = True %> fputs("${op.name}", fp); -% for mod in op.modifiers: +% for mod in (op.modifiers_v15 if v15 else op.modifiers): % if mod.name not in ["staging_register_count", "staging_register_write_count"]: % if mod.is_enum: fputs(valhall_${safe_name(mod.enum)}[(instr >> ${mod.start}) & ${hex((1 << mod.size) - 1)}], fp); @@ -115,10 +181,18 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) % endif % endif % endfor +% if v15: + fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow_v15']}) & ${hex(op.mask['flow_v15'])}]); +% else: fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow']}) & ${hex(op.mask['flow'])}]); +% endif % for i, dest in enumerate(op.dests): <% no_comma = False %> +% if v15: + va_print_dest(fp, (instr >> ${dest.offset['mode_v15']}) & ${hex(dest.mask['mode_v15'])}, (instr >> ${dest.offset['value_v15']}) & ${hex(dest.mask['value_v15'])}, ${dest.size}); +% else: va_print_dest(fp, (instr >> ${dest.offset['mode']}) & ${hex(dest.mask['mode'])}, (instr >> ${dest.offset['value']}) & ${hex(dest.mask['value'])}, ${dest.size}); +% endif % endfor % for index, sr in enumerate(op.staging): % if not no_comma: @@ -130,13 +204,12 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) if sr.count != 0: sr_count = sr.count; else: - for mod in op.modifiers: + for mod in (op.modifiers_v15 if v15 else op.modifiers): if mod.name == "staging_register_write_count" and sr.write: sr_count = f"(((instr >> {mod.start}) & {hex((1 << mod.size) - 1)}) + 1)"; elif mod.name == "staging_register_count": sr_count = f"((instr >> {mod.start}) & {hex((1 << mod.size) - 1)})"; %> -// assert(((instr >> ${sr.start}) & 0xC0) == ${sr.encoded_flags}); fprintf(fp, "@"); for (unsigned i = 0; i < ${sr_count}; ++i) { fprintf(fp, "%sr%u", (i == 0) ? 
"" : ":", @@ -148,6 +221,28 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) fputs(", ", fp); % endif <% no_comma = False %> +% if v15: +% if src.absneg: + va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${hex(src.mask['high1_v15'])}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])}, + ${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])}, +% if op.name[:4] == "FMA." and i == 0: + false, + instr & BIT(${src.offset['abs']})); +% elif op.name[:10] == "FMA_RSCALE" and i == 2: + instr & BIT(${src.offset['neg'] + 1}), + false); +% else: + instr & BIT(${src.offset['neg']}), + instr & BIT(${src.offset['abs']})); +% endif +% elif src.is_float: + va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])}, + ${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])}, false, false); +% else: + va_print_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])}, + ${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])}); +% endif +% else: % if src.absneg: va_print_float_src(fp, (instr >> ${src.offset['mode']}) & ${hex(src.mask['mode'])}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])}, ${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])}, @@ -160,6 +255,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) va_print_src(fp, (instr >> ${src.offset['mode']}) & ${src.mask['mode']}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])}, ${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])}); % endif +% endif % if src.swizzle: % if src.size == 32: fputs(valhall_widen[(instr >> ${src.offset['swizzle']}) & ${hex(src.mask['swizzle'])}], fp); @@ -183,7 
+279,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) if (instr & BIT(${src.offset['not']})) fputs(".not", fp); % endif % endfor -% for imm in op.immediates: +% for imm in (op.immediates_v15 if v15 else op.immediates): <% prefix = "#" if imm.name == "constant" else imm.name + ":" fmt = "%d" if imm.signed else "0x%X" @@ -192,16 +288,16 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size) % endfor -<%def name="recurse_subcodes(op_bucket)"> +<%def name="recurse_subcodes(op_bucket, v15)"> %if op_bucket.instr: -${print_instr(op_bucket.instr)} +${print_instr(op_bucket.instr, v15)} %else: opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)}; switch (opcode) { %for op in op_bucket.children: case ${hex(op)}: { -${recurse_subcodes(op_bucket.children[op])} +${recurse_subcodes(op_bucket.children[op], v15)} break; } %endfor @@ -215,7 +311,7 @@ va_disasm_instr(FILE *fp, uint64_t instr) { unsigned opcode; -${recurse_subcodes(OPCODES)} +${recurse_subcodes(OPCODES, False)} } void @@ -223,7 +319,7 @@ va_disasm_instr_v15(FILE *fp, uint64_t instr) { unsigned opcode; -${recurse_subcodes(OPCODES_V15)} +${recurse_subcodes(OPCODES_V15, True)} } static bool is_branch(uint64_t instr) diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index b8808bd30e4..31e00f34e31 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -97,7 +97,7 @@ valhall_opcodes[BI_NUM_OPCODES] = { sr_control = 0 if len(op.staging) > 0: - sr_control = op.staging[0].encoded_flags >> 6 + sr_control = op.staging[0].encoded_flags %> [BI_OPCODE_${name.replace('.', '_').upper()}] = { .exact = ${hex(exact(op.opcode))}ULL, @@ -154,7 +154,7 @@ valhall_v15_opcodes[BI_NUM_OPCODES] = { sr_control = 0 if len(op.staging) > 0: - sr_control = op.staging[0].encoded_flags >> 6 + sr_control = op.staging[0].encoded_flags %> 
[BI_OPCODE_${name.replace('.', '_').upper()}] = { .exact = ${hex(exact(op.opcode_v15))}ULL, diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py index c6cbf31ed86..b6a10fb3077 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.py @@ -14,6 +14,7 @@ import sys instructions = [] MODIFIERS = {} +MODIFIERS_V15 = {} enums = {} immediates = [] @@ -102,6 +103,11 @@ class Source: self.offset['value'] = self.start self.mask['value'] = bitmask(6) + self.offset['high1_v15'] = (index + 48) + self.mask['high1_v15'] = bitmask(1) + self.offset['low8_v15'] = self.start + self.mask['low8_v15'] = bitmask(8) + if absneg: self.offset['neg'] = 32 + 2 + ((2 - index) * 2) self.offset['abs'] = 33 + 2 + ((2 - index) * 2) @@ -137,6 +143,11 @@ class Dest: self.offset['value'] = self.start self.mask['value'] = bitmask(6) + self.offset['mode_v15'] = self.start + 13 + self.mask['mode_v15'] = bitmask(2) + self.offset['value_v15'] = self.start + self.mask['value_v15'] = bitmask(8) + class Staging: def __init__(self, read = False, write = False, count = 0, flags = 'true', name = ""): self.name = name @@ -152,6 +163,14 @@ class Staging: self.offset['value'] = self.start self.mask['value'] = bitmask(6) + self.offset['flags'] = self.start + 6 + self.mask['flags'] = bitmask(2) + + self.offset['value_v15'] = self.start + self.mask['value_v15'] = bitmask(8) + self.offset['flags_v15'] = 38 + self.mask['flags_v15'] = bitmask(2) + # For compatibility self.absneg = False @@ -166,11 +185,14 @@ class Staging: if not self.flags: self.encoded_flags = 0 + self.encoded_flags_v15 = 0 elif flags == 'rw': - self.encoded_flags = 0xc0 + self.encoded_flags = 0b11 + self.encoded_flags_v15 = 0b11 else: assert(flags == 'true') - self.encoded_flags = (0x80 if write else 0) | (0x40 if read else 0) + self.encoded_flags = (0b10 if write else 0) | (0b01 if read else 0) + self.encoded_flags_v15 = (0b10 if 
read else 0) | (0b01 if read and write else 0) class Immediate: def __init__(self, name, start, size, signed): @@ -186,14 +208,16 @@ class Opcode: self.mask = mask class Instruction: - def __init__(self, name, opcode, opcode_v15, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): + def __init__(self, name, opcode, opcode_v15, srcs = [], dests = [], immediates = [], immediates_v15 = [], modifiers = [], modifiers_v15 = [], staging = None, unit = None): self.name = name self.srcs = srcs self.dests = dests self.opcode = opcode self.opcode_v15 = opcode_v15 self.immediates = immediates + self.immediates_v15 = immediates_v15 self.modifiers = modifiers + self.modifiers_v15 = modifiers_v15 self.staging = staging self.unit = unit self.is_signed = len(name.split(".")) > 1 and ('s' in name.split(".")[1]) @@ -206,6 +230,11 @@ class Instruction: self.offset['fau_page'] = 57 self.mask['fau_page'] = bitmask(2) + self.offset['flow_v15'] = 58 + self.mask['flow_v15'] = bitmask(4) + self.offset['fau_page_v15'] = 62 + self.mask['fau_page_v15'] = bitmask(2) + # Message-passing instruction <===> not ALU instruction self.message = unit not in ["FMA", "CVT", "SFU"] @@ -306,15 +335,25 @@ def build_instr(el, overrides = {}): # Get immediates imms = [build_imm(imm) for imm in el.findall('imm')] + imms_v15 = [build_imm(imm) for imm in el.findall('imm_v15_override')] + for imm in imms: + if imm.name not in {imm.name for imm in imms_v15}: + imms_v15.append(imm) modifiers = [] + modifiers_v15 = [] for mod in el: if (mod.tag in MODIFIERS) and not (mod.attrib.get('pseudo', False)): modifiers.append(MODIFIERS[mod.tag]) + modifiers_v15.append(MODIFIERS_V15[mod.tag]) elif mod.tag =='va_mod': modifiers.append(build_modifier(mod)) + elif mod.tag =='va_mod_v15': + modifiers_v15.append(build_modifier(mod)) - instr = Instruction(name, opcode, opcode_v15, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) + + instr = 
Instruction(name, opcode, opcode_v15, srcs = sources, dests = dests, immediates = imms, immediates_v15 = imms_v15, + modifiers = modifiers, modifiers_v15 = modifiers_v15, staging = staging, unit = unit) instructions.append(instr) @@ -380,6 +419,7 @@ def typesize(name): # Parse the ISA def valhall_parse_isa(xmlfile): global MODIFIERS + global MODIFIERS_V15 global enums global immediates global root @@ -440,6 +480,52 @@ def valhall_parse_isa(xmlfile): "sample": Modifier("sample_mode", 38, 2), } + MODIFIERS_V15 = { + # Texture instructions share a common encoding + "wide_indices": Flag("wide_indices", 8), + "array_enable": Flag("array_enable", 10), + "texel_offset": Flag("texel_offset", 11), + "shadow": Flag("shadow", 12), + "integer_coordinates": Flag("integer_coordinates", 13), + "fetch_component": Modifier("fetch_component", 14, 2), + "lod_mode": Modifier("lod_mode", 13, 3), + "lod_bias_disable": Modifier("lod_mode", 13, 1), + "lod_clamp_disable": Modifier("lod_mode", 14, 1), + "write_mask": Modifier("write_mask", 24, 4), + "dimension": Modifier("dimension", 28, 2), + "skip": Flag("skip", 39), + "register_width": Modifier("register_width", 38, 1, force_enum = "register_width"), + "secondary_register_width": Modifier("secondary_register_width", 54, 1, force_enum = "register_width"), + "vartex_register_width": Modifier("varying_texture_register_width", 24, 2), + + "atom_opc": Modifier("atomic_operation", 24, 4), + "atom_opc_1": Modifier("atomic_operation_with_1", 24, 3), + "inactive_result": Modifier("inactive_result", 22, 4), + "memory_access": Modifier("memory_access", 24, 2), + "regfmt": Modifier("register_format", 24, 3), + "source_format": Modifier("source_format", 24, 2), + "vecsize": Modifier("vector_size", 28, 2), + + "slot": Modifier("slot_v15", 30, 2), + "roundmode": Modifier("round_mode", 32, 2), + "result_type": Modifier("result_type", 24, 2), + "saturate": Flag("saturate", 25), + "not_result": Flag("not_result", 34), + + "lane_op": 
Modifier("lane_operation", 32, 4), + "cmp": Modifier("condition", 33, 3), + "clamp": Modifier("clamp", 30, 2), + "sr_count": Modifier("staging_register_count", 32, 3, implied = True), + "sample_and_update": Modifier("sample_and_update_mode", 32, 3), + "sr_write_count": Modifier("staging_register_write_count", 35, 3, implied = True), + + "conservative": Flag("conservative", 35), + "subgroup": Modifier("subgroup_size", 36, 4), + "update": Modifier("update_mode", 35, 2), + "sample": Modifier("sample_mode", 37, 2), + } + + for child in root: if child.tag == 'group': build_group(child) From 70444a0a2a917765bf34316974d06da4441270d0 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Thu, 23 Apr 2026 14:15:35 +0200 Subject: [PATCH 48/49] pan/va: Add v15 asm/disasm tests To support this, we also need to add a way to pass arch version to the asm/disasm tests. --- src/panfrost/compiler/bifrost/valhall/asm.py | 178 +++++++++++++--- .../compiler/bifrost/valhall/meson.build | 28 ++- .../compiler/bifrost/valhall/test-assembly.py | 26 ++- .../valhall/test/assembler-cases-v15.txt | 195 ++++++++++++++++++ .../bifrost/valhall/test/assembler-cases.txt | 1 + .../bifrost/valhall/test/test-disassembler.c | 19 +- 6 files changed, 400 insertions(+), 47 deletions(-) create mode 100644 src/panfrost/compiler/bifrost/valhall/test/assembler-cases-v15.txt diff --git a/src/panfrost/compiler/bifrost/valhall/asm.py b/src/panfrost/compiler/bifrost/valhall/asm.py index ca5766e93aa..2790e3029cb 100644 --- a/src/panfrost/compiler/bifrost/valhall/asm.py +++ b/src/panfrost/compiler/bifrost/valhall/asm.py @@ -29,16 +29,20 @@ class FAUState: die_if(self.page is not None and self.page != page, 'Mismatched pages') self.page = page - def push(self, source): - if not (source & (1 << 7)): - # Skip registers + def push(self, source, arch): + # Skip registers + if arch >= 15 and not (source & (1 << 8)): + return + elif arch < 15 and not (source & (1 << 7)): return self.buffer.add(source) 
die_if(len(self.buffer) > 2, "Overflowed FAU buffer") - if (source >> 5) == 0b110: - # Small constants need to check if the buffer overflows but no else + # Small constants need to check if the buffer overflows but no else + if arch >= 15 and (source >> 5) == 0b1110: + return + elif arch < 15 and (source >> 5) == 0b110: return slot = (source >> 1) @@ -120,6 +124,50 @@ def encode_source(op, fau): die('Invalid operand') +def encode_source_v15(op, fau): + # Reg tuple + if op[0] == '[' and op[-1:] == ']': + # Remove brackets and split on ":" + unpacked = op[1:-1].split(":") + die_if(len(unpacked) != 2, 'Invalid tuple') + die_if(unpacked[0][0] != 'r', 'Invalid tuple') + die_if(unpacked[1][0] != 'r', 'Invalid tuple') + if (unpacked[0][-1:] == '^'): + val0 = parse_int(unpacked[0][1:-1], 0, 127) + val1 = parse_int(unpacked[1][1:-1], 0, 127) + die_if(val1 != val0 + 1, 'Invalid tuple value') + return val0 | 0x80 + else: + val0 = parse_int(unpacked[0][1:], 0, 127) + val1 = parse_int(unpacked[1][1:], 0, 127) + die_if(val1 != val0 + 1, 'Invalid tuple value') + return val0 + elif op[0] == 'r': + if (op[-1:] == '^'): + return parse_int(op[1:-1], 0, 127) | 0x80 + return parse_int(op[1:], 0, 127) + elif op[0] == 'u': + val = parse_int(op[1:], 0, 254) + fau.set_page(val >> 6) + return ((val & 0x3F) << 1) | 0x100 + elif op[0] == 'i': + return int(op[3:]) | 0x1C0 + elif op.startswith('0x'): + try: + val = int(op, base=0) + except ValueError: + die('Expected value') + + die_if(val not in immediates, 'Unexpected immediate value') + return immediates.index(val) | 0x1C0 + else: + for i in [0, 1, 3]: + if op in enums[f'fau_special_page_{i}'].bare_values: + idx = 32 + (enums[f'fau_special_page_{i}'].bare_values.index(op) << 1) + fau.set_page(i) + return idx | 0x1E0 + + die('Invalid operand') def encode_dest(op): # Reg tuple @@ -156,7 +204,47 @@ def encode_dest(op): return value | (wrmask << 6) -def parse_asm(line): +def encode_dest_v15(op, dst64): + # Reg tuple + if op[0] == '[' and op[-1:] 
== ']': + # Remove brackets and split on ":" + unpacked = op[1:-1].split(":") + die_if(len(unpacked) != 2, 'Invalid tuple') + die_if(unpacked[0][0] != 'r', 'Invalid tuple') + die_if(unpacked[1][0] != 'r', 'Invalid tuple') + + parts = unpacked[0].split(".") + reg = parts[0] + value = parse_int(reg[1:], 0, 127) + + parts1 = unpacked[1].split(".") + reg1 = parts1[0] + val1 = parse_int(reg1[1:], 0, 127) + die_if(val1 != value + 1, 'Invalid tuple value') + else: + die_if(op[0] != 'r', f"Expected register destination {op}") + parts = op.split(".") + reg = parts[0] + value = parse_int(reg[1:], 0, 127) + + # Default to writing in full + if (dst64): + wrmask = 0x0 + die_if(len(parts) > 1, "Must write full") + else: + wrmask = 0x3 + + if len(parts) > 1: + WMASKS = ["h0", "h1"] + die_if(len(parts) > 2, "Too many modifiers") + mask = parts[1]; + die_if(mask not in WMASKS, "Expected a write mask") + wrmask = 1 << WMASKS.index(mask) + + return value | (wrmask << 13) + + +def parse_asm(line, arch): global LINE LINE = line # For better errors encoded = 0 @@ -187,7 +275,7 @@ def parse_asm(line): tail = line[(len(head) + 1):] operands = [x.strip() for x in tail.split(",") if len(x.strip()) > 0] - expected_op_count = len(ins.srcs) + len(ins.dests) + len(ins.immediates) + len(ins.staging) + expected_op_count = len(ins.srcs) + len(ins.dests) + len((ins.immediates_v15 if arch >= 15 else ins.immediates)) + len(ins.staging) if len(operands) != expected_op_count: die(f"Wrong number of operands in {line}, expected {expected_op_count}, got {len(operands)} {operands}") @@ -200,9 +288,9 @@ def parse_asm(line): parts = [] die_if(any([x[0] != 'r' for x in parts]), f'Expected registers, got {op}') - regs = [parse_int(x[1:], 0, 63) for x in parts] + regs = [parse_int(x[1:], 0, (127 if arch >= 15 else 63)) for x in parts] - extended_write = "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write + extended_write = "staging_register_write_count" in [x.name for x in 
(ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write max_sr_count = 8 if extended_write else 7 sr_count = len(regs) @@ -215,22 +303,31 @@ def parse_asm(line): 'Consecutive staging registers must be aligned to a register pair') if sr.count == 0: - if "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write: + if "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write: modifier_map["staging_register_write_count"] = sr_count - 1 else: - assert "staging_register_count" in [x.name for x in ins.modifiers] + assert "staging_register_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] modifier_map["staging_register_count"] = sr_count else: die_if(sr_count != sr.count, f"Expected {sr.count} staging registers, got {sr_count}") - encoded |= ((sr.encoded_flags | base) << sr.start) + encoded |= base << sr.start + if arch >= 15: + encoded |= sr.encoded_flags_v15 << sr.offset['flags_v15'] + else: + encoded |= sr.encoded_flags << sr.offset['flags'] + + # On v15, some instructions require special sr_control values + if arch >= 15 and ins.name == "BARRIER": + encoded |= 0b10 << 38 + operands = operands[len(ins.staging):] for op, dest in zip(operands, ins.dests): - encoded |= encode_dest(op) << 40 + encoded |= (encode_dest_v15(op, dest.size >= 64) if arch >= 15 else encode_dest(op)) << 40 operands = operands[len(ins.dests):] - if len(ins.dests) == 0 and len(ins.staging) == 0: + if arch < 15 and len(ins.dests) == 0 and len(ins.staging) == 0: # Set a placeholder writemask to prevent encoding faults encoded |= (0xC0 << 40) @@ -238,12 +335,18 @@ def parse_asm(line): for i, (op, src) in enumerate(zip(operands, ins.srcs)): parts = op.split('.') - encoded_src = encode_source(parts[0], fau) - - # Require a word selection for special FAU values - may_have_word_select = ((encoded_src >> 5) == 0b111) - # or for regular FAU values - may_have_word_select |= ((encoded_src 
>> 6) == 0b10) + if (arch >= 15): + encoded_src = encode_source_v15(parts[0], fau) + # Require a word selection for special FAU values + may_have_word_select = ((encoded_src >> 5) == 0b1111) + # or for regular FAU values + may_have_word_select |= ((encoded_src >> 7) == 0b10) + else: + encoded_src = encode_source(parts[0], fau) + # Require a word selection for special FAU values + may_have_word_select = ((encoded_src >> 5) == 0b111) + # or for regular FAU values + may_have_word_select |= ((encoded_src >> 6) == 0b10) # Has a swizzle been applied yet? swizzled = False @@ -251,7 +354,11 @@ def parse_asm(line): for mod in parts[1:]: # Encode the modifier if mod in src.offset and src.mask[mod] == 0x1: - encoded |= (1 << src.offset[mod]) + # On v15, FMA_RSCALE has a different offset src2.neg + if arch >= 15 and ins.name[:10] == "FMA_RSCALE" and mod == "neg" and i == 2: + encoded |= (1 << (src.offset[mod] + 1)) + else: + encoded |= (1 << src.offset[mod]) elif src.halfswizzle and mod in enums[f'half_swizzles_{src.size}_bit'].bare_values: die_if(swizzled, "Multiple swizzles specified") swizzled = True @@ -318,12 +425,15 @@ def parse_asm(line): val = enums['swizzles_16_bit'].bare_values.index(mod) encoded |= (val << src.offset['widen']) - encoded |= encoded_src << src.start - fau.push(encoded_src) + if arch >= 15: + encoded |= ((encoded_src & 0x100) << (src.offset['high1_v15'] - 8)) | ((encoded_src & 0xFF) << src.start) + else: + encoded |= encoded_src << src.start + fau.push(encoded_src, arch) operands = operands[len(ins.srcs):] - for i, (op, imm) in enumerate(zip(operands, ins.immediates)): + for i, (op, imm) in enumerate(zip(operands, (ins.immediates_v15 if arch >= 15 else ins.immediates))): if op[0] == '#': die_if(imm.name != 'constant', "Wrong syntax for immediate") parts = [imm.name, op[1:]] @@ -347,15 +457,15 @@ def parse_asm(line): encoded |= (val << imm.start) - operands = operands[len(ins.immediates):] + operands = operands[len((ins.immediates_v15 if arch >= 15 else 
ins.immediates)):] # Encode the operation itself - for subcode in ins.opcode: + for subcode in (ins.opcode_v15 if arch >= 15 else ins.opcode): encoded |= (subcode.value << subcode.start) # Encode FAU page if fau.page: - encoded |= (fau.page << ins.offset['fau_page']) + encoded |= (fau.page << (ins.offset['fau_page_v15'] if arch >= 15 else ins.offset['fau_page'])) # Encode modifiers has_flow = False @@ -366,9 +476,10 @@ def parse_asm(line): if mod in enums['flow'].bare_values: die_if(has_flow, "Multiple flow control modifiers specified") has_flow = True - encoded |= (enums['flow'].bare_values.index(mod) << ins.offset['flow']) + encoded |= (enums['flow'].bare_values.index(mod) << (ins.offset['flow_v15'] if arch >= 15 else + ins.offset['flow'])) else: - candidates = [c for c in ins.modifiers if mod in c.bare_values] + candidates = [c for c in (ins.modifiers_v15 if arch >= 15 else ins.modifiers) if mod in c.bare_values] die_if(len(candidates) == 0, f"Invalid modifier {mod} used") assert(len(candidates) == 1) # No ambiguous modifiers @@ -380,13 +491,20 @@ def parse_asm(line): die_if(opts.name in modifier_map, f"{opts.name} specified twice") modifier_map[opts.name] = value - for mod in ins.modifiers: + + for mod in (ins.modifiers_v15 if arch >= 15 else ins.modifiers): value = modifier_map.get(mod.name, mod.default) die_if(value is None, f"Missing required modifier {mod.name}") assert(value < (1 << mod.size)) encoded |= (value << mod.start) + # On v15, some instructions require an encoded null src. 
+ requires_nullsrc = ['BARRIER', 'NOP', 'LD_GCLK_U64', 'LD_VAR_FLAT_IMM', 'LD_VAR_BUF_FLAT_IMM']; + if arch >= 15 and ins.name in requires_nullsrc: + enc_src = 0x1C0 + encoded |= ((enc_src >> 8) & 0x1) << 48 | (enc_src & 0xFF) + return encoded if __name__ == "__main__": diff --git a/src/panfrost/compiler/bifrost/valhall/meson.build b/src/panfrost/compiler/bifrost/valhall/meson.build index 9cf75fdf78e..6e58b745aba 100644 --- a/src/panfrost/compiler/bifrost/valhall/meson.build +++ b/src/panfrost/compiler/bifrost/valhall/meson.build @@ -44,9 +44,7 @@ libpanfrost_valhall_disasm = static_library( ) if with_tests - test( - 'valhall_disasm', - executable( + valhall_disasm_test_e = executable( 'valhall_disasm_test', files('test/test-disassembler.c'), c_args : [c_msvc_compat_args, no_override_init_args], @@ -54,15 +52,33 @@ if with_tests include_directories : [inc_include, inc_src], dependencies: [idep_valhall_enums_h], link_with : [libpanfrost_valhall_disasm], - ), + ) + + test( + 'valhall_disasm', + valhall_disasm_test_e, suite : ['panfrost'], - args : files('test/assembler-cases.txt'), + args : [files('test/assembler-cases.txt'), 'v10'], + ) + + test( + 'valhall_disasm', + valhall_disasm_test_e, + suite : ['panfrost'], + args : [files('test/assembler-cases-v15.txt'), 'v15'], ) test( 'valhall_asm', prog_python, - args : files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'), + args : [files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'), 'v10'], + suite : ['panfrost'], + ) + + test( + 'valhall_asm', + prog_python, + args : [files('test-assembly.py', 'test/assembler-cases-v15.txt', 'test/negative-cases.txt'), 'v15'], suite : ['panfrost'], ) endif diff --git a/src/panfrost/compiler/bifrost/valhall/test-assembly.py b/src/panfrost/compiler/bifrost/valhall/test-assembly.py index 4f2851ee549..64856058030 100644 --- a/src/panfrost/compiler/bifrost/valhall/test-assembly.py +++ 
b/src/panfrost/compiler/bifrost/valhall/test-assembly.py @@ -17,19 +17,19 @@ def hex_8(u64): return ' '.join(as_strings) # These should not throw exceptions -def positive_test(machine, assembly): +def positive_test(machine, assembly, arch): try: expected = parse_hex_8(machine) - val = parse_asm(assembly) + val = parse_asm(assembly, arch) if val != expected: return f"{hex_8(val)} Incorrect assembly" except ParseError as exc: return f"Unexpected exception: {exc}" # These should throw exceptions -def negative_test(assembly): +def negative_test(assembly, arch): try: - parse_asm(assembly) + parse_asm(assembly, arch) return "Expected exception" except Exception: return None @@ -43,24 +43,34 @@ def record_case(case, error): else: FAIL.append((case, error)) -if len(sys.argv) < 3: - print("Expected positive and negative case lists") +if len(sys.argv) < 4: + print("Expected positive and negative case lists, followed by arch") sys.exit(1) +if sys.argv[3][0] == 'v': + try: + arch = int(sys.argv[3][1:], base = 0) + except ValueError: + print(f"Expected arch number {sys.argv[3][1:]}") + sys.exit(1) +else: + print(f"Expected arch version {sys.argv[3]}") + + with open(sys.argv[1], "r") as f: cases = f.read().split('\n') cases = [x for x in cases if len(x) > 0 and x[0] != '#'] for case in cases: (machine, assembly) = case.split(' ') - record_case(case, positive_test(machine, assembly)) + record_case(case, positive_test(machine, assembly, arch)) with open(sys.argv[2], "r") as f: cases = f.read().split('\n') cases = [x for x in cases if len(x) > 0] for case in cases: - record_case(case, negative_test(case)) + record_case(case, negative_test(case, arch)) print("Passed {}/{} tests.".format(len(PASS), len(PASS) + len(FAIL))) diff --git a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases-v15.txt b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases-v15.txt new file mode 100644 index 00000000000..43a8a5641e9 --- /dev/null +++ 
b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases-v15.txt @@ -0,0 +1,195 @@ +02 00 20 00 00 01 60 00 MOV.i32 r1, r2 +0a 00 20 00 00 01 61 00 MOV.i32 r1, u5.w0 +e3 00 20 00 00 01 61 40 MOV.i32 r1, thread_local_pointer.w1 +e6 00 20 00 00 01 61 40 MOV.i32 r1, workgroup_local_pointer.w0 +e2 00 20 00 00 01 61 c0 MOV.i32 r1, lane_id.w0 +e6 00 20 00 00 01 61 c0 MOV.i32 r1, core_id.w0 +01 02 00 00 00 00 f0 00 FADD.f32 r0, r1, r2 +01 02 00 00 20 00 f0 00 FADD.f32 r0, r1, r2.abs +01 02 00 00 10 00 f0 00 FADD.f32 r0, r1, r2.neg +01 02 00 00 30 00 f0 00 FADD.f32 r0, r1, r2.neg.abs +01 02 00 80 30 00 f0 00 FADD.f32.clamp_m1_1 r0, r1, r2.neg.abs +81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3 +01 d0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x3F800000 +01 d0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x3F800000.neg +01 c0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x0 +01 c0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x0.neg +01 c9 00 00 00 00 e2 00 IADD.u32 r0, r1, 0x7060504 +01 00 00 08 00 00 f0 00 FADD.f32 r0, r1, r0.h1 +01 00 00 04 00 00 f0 00 FADD.f32 r0, r1, r0.h0 +01 00 00 0c 00 00 f4 00 FADD.v2f16 r0, r1.h00, r0.h11 +01 00 00 28 00 00 f4 00 FADD.v2f16 r0, r1, r0 +01 00 00 24 00 00 f4 00 FADD.v2f16 r0, r1, r0.h10 +01 02 00 08 00 00 e0 00 IADD.u32 r0, r1, r2.h0 +01 02 00 0c 00 00 e0 00 IADD.u32 r0, r1, r2.h1 +01 02 00 0c 70 00 e0 00 IADD.u32 r0, r1.b3, r2.h1 +01 c9 00 18 00 00 e2 00 IADD.u32 r0, r1, 0x7060504.b2 +01 02 00 08 20 00 e4 00 IADD.v2u16 r0, r1, r2 +02 3c 47 20 00 00 91 02 SHADDX.u64 [r0:r1], u1, [r60:r61].w0, shift:0x2 +80 00 00 00 19 00 20 07 LOAD.i32.slot0.wait0 @r0, [r0^:r1^], offset:0 +00 bc 87 20 00 00 91 02 SHADDX.u64 [r0:r1], u0, [r60^:r61^].w0, shift:0x4 +80 00 00 00 9c 04 20 3f STORE.i128.slot0.end @r4:r5:r6:r7, [r0^:r1^], offset:0 +c0 00 e0 01 00 00 a1 3e NOP.end +80 c4 c0 1e 02 01 e6 01 ICMP_OR.u32.gt.m1 r1, r0^, 0x1000000.b3, 0x0 +82 00 00 00 99 00 20 2b STORE.i32.slot0.reconverge @r0, [r2^:r3^], offset:0 +00 c9 8f 12 30 00 e2 00 CLPER.i32.f1 r0, r0, 
0x7060504.b00 +00 00 4b 00 00 02 60 00 F16_TO_F32 r2, r0.h0 +80 00 4b 10 00 03 60 00 F16_TO_F32 r3, r0^.h1 +c0 00 e0 01 00 00 a1 22 NOP.wait0126 +80 c0 00 28 90 00 f6 24 FADD.v2f16.wait r0, r0^.abs, 0x0.neg +c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0 +3c d0 ea 00 01 3c d6 37 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0 +80 db 05 04 00 01 e6 00 MKVEC.v2i16 r1, r0^.h0, 0x3C000000.h1 +f0 00 3c 33 82 00 1b 3f BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0 +bb 0d 00 40 02 04 08 07 LEA_BUF_IMM.slot1.wait0 @r4:r5, r59^, table:0xD, index:0x0 +00 dd c0 08 14 02 66 01 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg +81 08 c0 00 04 01 66 01 FMA.f32 r1, r1^, u4.w0, 0x0.neg +80 08 c0 00 04 00 66 09 FMA.f32.wait1 r0, r0^, u4.w0, 0x0.neg +84 00 00 02 93 00 20 3f STORE.i96.estream.slot0.end @r0:r1:r2, [r4^:r5^], offset:0 +84 00 00 01 9c 08 20 3f STORE.i128.istream.slot0.end @r8:r9:r10:r11, [r4^:r5^], offset:0 +c0 00 00 c0 80 00 3d 27 BARRIER.slot7.wait +00 00 00 00 01 02 21 03 LOAD.i8.slot0 @r2, u0, offset:0 +00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0 +00 00 00 00 11 02 21 03 LOAD.i24.slot0 @r2, u0, offset:0 +00 00 00 00 19 02 21 03 LOAD.i32.slot0 @r2, u0, offset:0 +00 00 00 00 02 02 21 03 LOAD.i48.slot0 @r2:r3, u0, offset:0 +00 00 00 00 0a 02 21 03 LOAD.i64.slot0 @r2:r3, u0, offset:0 +00 00 00 00 13 02 21 03 LOAD.i96.slot0 @r2:r3:r4, u0, offset:0 +00 00 00 00 1c 04 21 03 LOAD.i128.slot0 @r4:r5:r6:r7, u0, offset:0 +00 00 00 08 01 02 21 03 LOAD.i8.b1.slot0 @r2, u0, offset:0 +00 00 00 10 01 02 21 03 LOAD.i8.b2.slot0 @r2, u0, offset:0 +00 00 00 18 01 02 21 03 LOAD.i8.b3.slot0 @r2, u0, offset:0 +00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0 +00 14 00 08 09 02 21 03 LOAD.i16.h1.slot0 @r2, u0, offset:20 +82 00 4d 00 42 02 60 00 FROUND.f32.rtn r2, r2^.neg +82 00 4b 00 40 02 60 00 F16_TO_F32 r2, r2^.neg.h0 +82 00 4c 00 43 02 60 00 F32_TO_S32.rtz r2, r2^.neg +82 c0 c6 47 48 02 64 00 FADD_IMM.f32 r2, r2^, #0x4847C6C0 +82 84 67 ac 70 
02 62 00 FADD_IMM.v2f16 r2, r2^, #0x70AC6784 +82 14 00 13 00 02 6a 00 IADD_IMM.v2i16 r2, r2^, #0x130014 +82 ab 4b 00 00 02 6c 00 IADD_IMM.i32 r2, r2^, #0x4BAB +83 82 c0 c6 12 02 e4 01 ICMP_OR.v2s16.gt.m1 r2, r3^.h10, r2^.h10, 0x0 +83 82 c0 52 03 02 e4 01 FCMP_OR.v2f16.gt.m1 r2, r3^.h10, r2^.h00, 0x0 +81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3 +00 03 00 00 20 00 b8 2a BRANCHZ.reconverge r0.h0, offset:3 +00 03 00 00 40 00 b8 2a BRANCHZ.reconverge r0.h1, offset:3 +00 03 00 00 00 00 b8 2a BRANCHZ.reconverge r0, offset:3 +c0 00 00 00 00 00 6d 00 IADD_IMM.i32 r0, 0x0, #0x0 +c0 01 00 00 00 04 6d 28 IADD_IMM.i32.reconverge r4, 0x0, #0x1 +00 00 47 20 00 02 91 02 SHADDX.u64 [r2:r3], u0, [r0:r1].w0, shift:0x2 +80 c9 00 10 00 00 e2 00 IADD.u32 r0, r0^, 0x7060504.b0 +00 02 c0 02 06 01 e6 01 ICMP_OR.u32.ne.m1 r1, r0, u1.w0, 0x0 +04 00 20 00 00 05 60 00 MOV.i32 r5, r4 +04 00 20 00 00 06 60 00 MOV.i32 r6, r4 +04 00 20 00 00 07 60 04 MOV.i32.wait0 r7, r4 +82 00 00 00 9c 04 20 03 STORE.i128.slot0 @r4:r5:r6:r7, [r2^:r3^], offset:0 +81 f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r1^, offset:-8 +bd c0 00 08 10 3c c6 00 IADD.v2u16 r60.h1, r61^.h10, 0x0 +84 00 86 32 8c 00 12 3f ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0 +84 00 86 34 8c 00 12 3f ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0 +84 00 86 36 8c 00 12 3f ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0 +bc c0 12 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x1 +bc c0 02 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x0 +02 01 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w0, u0.w1 +00 01 00 40 0a 00 8b 03 LD_PKA.i64.slot1 @r0:r1, u0.w0, u0.w1 +04 01 00 80 0a 26 8b 03 LD_PKA.i64.slot2 @r38:r39, u2.w0, u0.w1 +03 01 00 80 0a 24 8b 03 LD_PKA.i64.slot2 @r36:r37, u1.w1, u0.w1 +03 04 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w1, u2.w0 +81 02 00 00 13 02 8a 03 LD_PKA.i96.slot0 @r2:r3:r4, 
r1^, u1.w0 +80 03 00 00 13 06 8a 07 LD_PKA.i96.slot0.wait0 @r6:r7:r8, r0^, u1.w1 +80 00 80 01 c0 00 60 20 FRCP.f32.wait0126 r0, r0^.neg.abs +80 84 00 80 00 00 7c 01 MUX.i32.neg r0, r0^, r4^, u0.w0 +80 84 00 80 04 00 7c 01 MUX.i32 r0, r0^, r4^, u0.w0 +80 84 00 80 08 00 7c 01 MUX.i32.fp_zero r0, r0^, r4^, u0.w0 +80 84 00 80 0c 00 7c 01 MUX.i32.bit r0, r0^, r4^, u0.w0 +00 00 20 41 00 01 60 34 FREXPM.f32.sqrt.discard r1, r0 +01 00 82 01 00 02 60 00 FRSQ.f32 r2, r1 +80 00 22 41 00 00 60 00 FREXPE.f32.sqrt r0, r0^ +81 82 c0 80 0a 00 64 02 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^ +81 82 c0 80 0e 00 64 22 FMA_RSCALE.f32.left.wait0126 r0, r1^, r2^, 0x0.neg, r0^ +82 83 04 05 00 01 7c 02 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1 +82 83 04 05 08 01 7c 02 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1 +82 83 04 05 48 01 7c 02 CSEL.s32.lt r1, r2^, r3^, u2.w0, u2.w1 +3d 00 00 12 5a 02 18 07 LD_VAR_SPECIAL.v2.f32.sample.clobber.slot0.wait0 @r2:r3, r61, index:0x0 +3d 00 00 3f 0a 02 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.center.retrieve.wait0 @r2:r3, r61, index:0x0 +3d 00 00 3f 42 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.sample.store.wait0 @r0:r1, r61, index:0x0 +3d 08 00 3f 22 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.centroid.store.wait0 @r0:r1, r61, index:0x8 +bc bd 11 33 02 00 84 03 LD_ATTR_IMM.v4.f16.slot0 @r0:r1, r60^, r61^, index:0x1, table:0x1 +80 3c 03 23 02 04 c0 03 LD_TILE.v3.f16.slot0 @r4:r5, r0^, r60, r3 +00 c9 00 20 10 01 c6 00 IADD.v2u16 r1.h1, r0.h10, 0x7060504.b11 +80 c0 00 08 10 01 a6 00 IADD.v2u16 r1.h0, r0^.h10, 0x0 +02 02 00 04 20 02 a4 00 IADD.v2u16 r2.h0, r2, r2.h10 +82 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r2^.h0, 0x0.h0 +b7 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r55^.h0, 0x0.h0 +b7 c0 05 10 00 02 e6 00 MKVEC.v2i16 r2, r55^.h1, 0x0.h0 +c0 b7 05 00 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h0 +c0 b7 05 04 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h1 +b7 00 54 00 00 02 60 00 U16_TO_U32 r2, r55^.h0 +b7 00 54 10 00 02 60 00 U16_TO_U32 r2, r55^.h1 +b7 00 44 
00 00 02 60 00 S16_TO_S32 r2, r55^.h0 +b7 00 44 10 00 02 60 00 S16_TO_S32 r2, r55^.h1 +c0 b7 01 08 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h0 +c0 b7 01 0c 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h1 +00 c0 c0 c0 c0 07 7e 01 MKVEC.v2i8 r7, r0.b3, 0x0.b0, 0x0 +00 c0 c0 c0 80 06 7e 01 MKVEC.v2i8 r6, r0.b2, 0x0.b0, 0x0 +00 c0 c0 c0 00 04 7e 01 MKVEC.v2i8 r4, r0.b0, 0x0.b0, 0x0 +80 c0 c0 c0 40 05 7e 01 MKVEC.v2i8 r5, r0^.b1, 0x0.b0, 0x0 + +3d 00 00 ba 44 00 10 37 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0 +3d 10 00 7a 0c 04 10 03 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10 +c0 00 00 00 00 08 6d 00 IADD_IMM.i32 r8, 0x0, #0x0 +c0 00 00 00 00 09 6d 00 IADD_IMM.i32 r9, 0x0, #0x0 +3d 00 54 00 00 0a 60 00 U16_TO_U32 r10, r61.h0 +3d 09 00 00 30 00 b8 2a BRANCHZ.eq.reconverge r61.h0, offset:9 +0a 00 20 00 00 0b 60 28 MOV.i32.reconverge r11, r10 +c0 00 e0 01 00 00 a1 26 NOP.wait +01 0b 00 33 02 0e c5 03 LD_TILE.v4.f16.slot0 @r14:r15, u0.w1, r11, u0.w0 +0b 00 24 00 00 0c 60 00 CLZ.u32 r12, r11 +02 8c c0 10 06 0c 6d 01 RSHIFT_XOR.i32.not_result r12, u1.w0, r12^.b00, 0x0 +8b c0 8c 50 00 0b 6a 05 LSHIFT_AND.i32.wait0 r11, r11^, 0x0.b00, r12^ +8f 89 00 28 00 09 f4 00 FADD.v2f16 r9, r15^, r9^ +8e 88 00 28 00 08 f4 00 FADD.v2f16 r8, r14^, r8^ +0b f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r11, offset:-8 +8a 00 2c 00 00 3e 60 00 POPCOUNT.i32 r62, r10^ +be 00 59 00 00 3e 60 00 U32_TO_F32 r62, r62^ +be 00 81 01 00 3e 60 00 FRCP.f16 r62, r62^.h00 +89 3e c0 22 44 09 64 19 FMA.v2f16.wait12 r9, r9^, r62.h00, 0x0.neg +87 83 00 00 00 03 f0 00 FADD.f32 r3, r7^, r3^ +83 09 00 08 00 03 f0 20 FADD.f32.wait0126 r3, r3^, r9.h1 +3c 03 ea 00 01 3c d4 37 ATEST.discard @r60, r60, r3, atest_datum.w0 +86 82 00 00 00 02 f0 00 FADD.f32 r2, r6^, r2^ +84 80 00 00 00 00 f0 00 FADD.f32 r0, r4^, r0^ +88 be c0 22 44 3f 64 01 FMA.v2f16 r63, r8^, r62^.h00, 0x0.neg +85 81 00 00 00 01 f0 00 FADD.f32 r1, r5^, r1^ +81 3f 00 08 00 01 f0 00 FADD.f32 r1, 
r1^, r63.h1 +80 bf 00 04 00 00 f0 00 FADD.f32 r0, r0^, r63^.h0 +82 89 00 04 00 02 f0 24 FADD.f32.wait r2, r2^, r9^.h0 +f0 00 3c 32 84 00 1b 3f BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0 +c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0 +c0 f1 0f 80 10 00 b3 06 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1 +00 00 00 1f 5a 3c 69 03 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0 +40 00 20 00 00 01 61 00 MOV.i32 r1, u32.w0 +41 00 20 00 00 01 61 00 MOV.i32 r1, u32.w1 +4a 00 20 00 00 01 61 00 MOV.i32 r1, u37.w0 +30 00 37 0f c1 0c 24 07 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0 +32 00 00 02 81 0c 2c 07 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0 +32 00 00 00 01 0c 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0 +32 00 00 00 01 00 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0 +02 00 00 11 da 00 d5 27 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0 +02 20 00 11 da 00 d5 07 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0 +02 20 00 11 c2 00 d5 23 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0 +80 c0 c0 02 06 00 e6 09 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0 +82 83 80 80 02 00 e8 01 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^ +82 c0 c0 03 06 00 f6 09 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0 +84 86 c0 03 02 02 f4 01 ICMP_MULTI.u32.gt.u1 r2, r4^, r6^, 0x0 +85 87 82 02 02 02 f0 01 ICMP_MULTI.u32.gt.m1 r2, r5^, r7^, r2^ +83 c0 80 02 06 00 f2 01 ICMP_MULTI.u32.ne.m1 r0, r3^, 0x0, r0^ +80 82 c0 03 02 00 f4 01 ICMP_MULTI.u32.gt.u1 r0, r0^, r2^, 0x0 +81 83 80 82 02 04 f0 01 ICMP_MULTI.s32.gt.m1 r4, r1^, r3^, r0^ +80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0 +81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^ +80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0 +81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, 
r1^.h11, r0^ +c4 c0 80 52 70 00 6b 01 LSHIFT_AND.v4i8 r0, 0x1000000.b3333, 0x0.b00, r0^ +80 81 82 80 24 00 78 01 MUX.v4i8 r0, r0^, r1^, r2^ +c0 c0 00 00 02 02 8f 03 LEA_PKA.slot0 @r2:r3, 0x0, 0x0 diff --git a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt index 26e389697f4..f4092dd2af5 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt +++ b/src/panfrost/compiler/bifrost/valhall/test/assembler-cases.txt @@ -126,6 +126,7 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1 00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0 01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1 40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, r0^ +41 42 c0 40 06 c0 60 01 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^ 41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, r1^, r2^, 0x0.neg, r0^ 42 43 84 85 00 c1 50 01 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1 42 43 84 85 04 c1 50 01 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1 diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-disassembler.c b/src/panfrost/compiler/bifrost/valhall/test/test-disassembler.c index fdf9874f046..92f9517327f 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-disassembler.c +++ b/src/panfrost/compiler/bifrost/valhall/test/test-disassembler.c @@ -33,8 +33,18 @@ parse_hex(const char *in) int main(int argc, const char **argv) { - if (argc < 2) { - fprintf(stderr, "Expected case list\n"); + if (argc < 3) { + fprintf(stderr, "Expected case list and arch version\n"); + return 1; + } + + if (argv[2][0] != 'v') { + fprintf(stderr, "Invalid arch version: %s\n", argv[2]); + return 1; + } + unsigned arch = atoi(&argv[2][1]); + if (arch < 9 || arch > 15) { + fprintf(stderr, "Non-supported arch version: %d\n", arch); return 1; } @@ -65,7 +75,10 @@ main(int argc, const char **argv) uint64_t bin = parse_hex(line); FILE *outputp = open_memstream(&output, &sz); - va_disasm_instr(outputp, bin); + if (arch < 
15) + va_disasm_instr(outputp, bin); + else + va_disasm_instr_v15(outputp, bin); fprintf(outputp, "\n"); fclose(outputp); From b4f5227efe0b2a6c86bfcdda7ba50c1e1f2e6449 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Tue, 17 Feb 2026 13:33:40 +0100 Subject: [PATCH 49/49] panfrost: Advertise Mali-TMAx support --- docs/drivers/panfrost.rst | 2 ++ src/panfrost/model/pan_model.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/docs/drivers/panfrost.rst b/docs/drivers/panfrost.rst index d9e3a618128..e5a3901fb0e 100644 --- a/docs/drivers/panfrost.rst +++ b/docs/drivers/panfrost.rst @@ -36,6 +36,8 @@ The following hardware is currently supported: +--------------------+---------------+-----------+--------+--------+ | G1-Pro | 5th Gen (v14) | 3.1 | 3.1 | 1.4 | +--------------------+---------------+-----------+--------+--------+ +| TMAx | 5th Gen (v15) | 3.1 | 3.1 | 1.4 | ++--------------------+---------------+-----------+--------+--------+ Other Midgard and Bifrost chips (e.g. G71) are not yet supported. diff --git a/src/panfrost/model/pan_model.c b/src/panfrost/model/pan_model.c index 4b28c4067fb..a70d2317cbe 100644 --- a/src/panfrost/model/pan_model.c +++ b/src/panfrost/model/pan_model.c @@ -99,6 +99,10 @@ const struct pan_model pan_model_list[] = { MODEL_RATES(4, 8, 64)), FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 4, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536), MODEL_RATES(4, 8, 128)), + FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 0, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536), + MODEL_RATES(4, 8, 64)), + FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 4, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536), + MODEL_RATES(4, 8, 128)), }; /* clang-format on */