From ad81596b6d87eea4a27842b4632f30a9ee5846b5 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 25 Feb 2026 14:44:18 +0100 Subject: [PATCH] pan/va: Implement v15 register count changes With v15, we get support for up to 128 registers, allocated in any multiple of 16 (previously the only choice was between 32 and 64 register mode). To support this, the shader register count is passed in a different way from v15 onwards, requiring some updates to how we encode the ShaderProgramDescriptor and the ShaderProgramPointer. Note that this does not currently change the compiler's behavior of running in either 32 or 64 register mode, only how the register count is passed to the GPU. --- src/gallium/drivers/panfrost/pan_cmdstream.c | 9 +++++- src/gallium/drivers/panfrost/pan_fb_preload.c | 4 +++ src/gallium/drivers/panfrost/pan_precomp.c | 14 +++++++++ src/panfrost/genxml/decode_csf.c | 30 ++++++++++++++++--- src/panfrost/genxml/v15.xml | 8 ++++- .../vulkan/csf/panvk_vX_cmd_dispatch.c | 13 +++++++- .../vulkan/csf/panvk_vX_cmd_precomp.c | 10 +++++++ .../vulkan/panvk_vX_cmd_frame_shaders.c | 4 +++ src/panfrost/vulkan/panvk_vX_shader.c | 12 ++++++++ 9 files changed, 97 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 7fad87f7e6d..15680c000bc 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -4456,9 +4456,12 @@ prepare_shader(struct panfrost_compiled_shader *state, else if (vs) cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF; #endif - +#if PAN_ARCH >= 15 + cfg.register_count = state->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); +#endif cfg.binary = state->bin.gpu; cfg.preload.r48_r63 = (state->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); @@ -4476,8 +4479,12 @@ prepare_shader(struct panfrost_compiled_shader *state, #if PAN_ARCH < 12 cfg.vertex_warp_limit = 
MALI_WARP_LIMIT_HALF; #endif +#if PAN_ARCH >= 15 + cfg.register_count = state->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); +#endif cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; cfg.preload.r48_r63 = (state->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); diff --git a/src/gallium/drivers/panfrost/pan_fb_preload.c b/src/gallium/drivers/panfrost/pan_fb_preload.c index 172398b6ec8..be4c2c9965e 100644 --- a/src/gallium/drivers/panfrost/pan_fb_preload.c +++ b/src/gallium/drivers/panfrost/pan_fb_preload.c @@ -1105,7 +1105,11 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool, pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) { cfg.stage = MALI_SHADER_STAGE_FRAGMENT; cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; +#if PAN_ARCH >= 15 + cfg.register_count = preload_shader->info.work_reg_count; +#else cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; +#endif cfg.binary = preload_shader->address; cfg.preload.r48_r63 = preload_shader->info.preload >> 48; } diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index da2d3f51f7d..9ca77b32dd7 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -98,8 +98,12 @@ panfrost_precomp_shader_create( pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&res->info); +#if PAN_ARCH >= 15 + cfg.register_count = res->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(res->info.work_reg_count); +#endif cfg.binary = res->code_ptr; cfg.preload.r48_r63 = (res->info.preload >> 48); cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info); @@ -326,7 +330,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch, uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56); cs_move64_to(b, cs_sr_reg64(b, COMPUTE, 
FAU_0), fau_ptr); +#if PAN_ARCH >= 15 + struct mali_shader_program_pointer_packed spp; + pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) { + ctx.register_count = shader->info.work_reg_count; + ctx.pointer = shader->state_ptr; + } + uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0]; + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), ptr); +#else cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), shader->state_ptr); +#endif cs_move64_to(b, cs_sr_reg64(b, COMPUTE, TSD_0), tsd); /* Global attribute offset */ diff --git a/src/panfrost/genxml/decode_csf.c b/src/panfrost/genxml/decode_csf.c index 10f062cebda..7c43991f64d 100644 --- a/src/panfrost/genxml/decode_csf.c +++ b/src/panfrost/genxml/decode_csf.c @@ -651,8 +651,19 @@ pandecode_run_compute(struct pandecode_context *ctx, FILE *fp, if (fau) GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU"); - GENX(pandecode_shader) - (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id); + uint64_t addr = cs_get_u64(qctx, reg_spd); +#if PAN_ARCH >= 15 + const struct mali_shader_program_pointer_packed spp_packed = { + .opaque[0] = addr & 0xFFFFFFFF, + .opaque[1] = (addr >> 32) & 0xFFFFFFFF, + }; + pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp) + ; + DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp, + "Shader Program Pointer (%" PRIx64 "):\n", addr); + addr = spp.pointer; +#endif + GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id); DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd), "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd)); @@ -693,8 +704,19 @@ pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp, if (fau) GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU"); - GENX(pandecode_shader) - (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id); + uint64_t addr = cs_get_u64(qctx, reg_spd); +#if PAN_ARCH >= 15 + const struct mali_shader_program_pointer_packed spp_packed = { + .opaque[0] = addr & 0xFFFFFFFF, + .opaque[1] = (addr >> 32) & 
0xFFFFFFFF, + }; + pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp) + ; + DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp, + "Shader Program Pointer (%" PRIx64 "):\n", addr); + addr = spp.pointer; +#endif + GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id); DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd), "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd)); diff --git a/src/panfrost/genxml/v15.xml b/src/panfrost/genxml/v15.xml index 2b35043f964..983834f16e4 100644 --- a/src/panfrost/genxml/v15.xml +++ b/src/panfrost/genxml/v15.xml @@ -2040,14 +2040,20 @@ + - + + + + + + diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index 1fd8e437d49..8de3de939b0 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -209,9 +209,20 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_FAU), fau_ptr); } - if (compute_state_dirty(cmdbuf, CS)) + if (compute_state_dirty(cmdbuf, CS)) { +#if PAN_ARCH >= 15 + struct mali_shader_program_pointer_packed spp; + pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) { + ctx.register_count = cs->info.work_reg_count; + ctx.pointer = panvk_priv_mem_dev_addr(cs->spd); + } + uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0]; + cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), ptr); +#else cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), panvk_priv_mem_dev_addr(cs->spd)); +#endif + } cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_TSD), tsd); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index 56f6c546217..386f2b317a5 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -82,8 +82,18 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56); 
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_FAU), fau_ptr); +#if PAN_ARCH >= 15 + struct mali_shader_program_pointer_packed spp; + pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) { + ctx.register_count = shader->info.work_reg_count; + ctx.pointer = panvk_priv_mem_dev_addr(shader->spd); + } + uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0]; + cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), ptr); +#else cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), panvk_priv_mem_dev_addr(shader->spd)); +#endif cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_TSD), tsd); diff --git a/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c b/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c index 3bd5eda41f0..afa2692bde0 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_frame_shaders.c @@ -239,8 +239,12 @@ get_frame_shader(struct panvk_device *dev, panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) { cfg.stage = MALI_SHADER_STAGE_FRAGMENT; cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem); cfg.preload.r48_r63 = shader->info.preload >> 48; } diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 03da805a49c..927a72cd7c9 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -1180,8 +1180,12 @@ panvk_shader_upload(struct panvk_device *dev, cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF; #endif +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_shader_variant_get_dev_addr(shader); cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); 
@@ -1199,8 +1203,12 @@ panvk_shader_upload(struct panvk_device *dev, panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&shader->info); +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_shader_variant_get_dev_addr(shader); cfg.preload.r48_r63 = (shader->info.preload >> 48); cfg.flush_to_zero_mode = shader_ftz_mode(shader); @@ -1214,8 +1222,12 @@ panvk_shader_upload(struct panvk_device *dev, panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&shader->info); +#if PAN_ARCH >= 15 + cfg.register_count = shader->info.work_reg_count; +#else cfg.register_allocation = pan_register_allocation(shader->info.work_reg_count); +#endif cfg.binary = panvk_shader_variant_get_dev_addr(shader) + shader->info.vs.no_psiz_offset; cfg.preload.r48_r63 = (shader->info.preload >> 48);