pan/va: Implement v15 register count changes

With v15, we get support for 128 registers, allocated in any multiple of
16 (previously the only choice was between 32- and 64-register mode).

To support this, the shader register count is passed differently from
v15 onwards, which requires updating how we encode the
ShaderProgramDescriptor and the ShaderProgramPointer.

Note that this does not yet change the compiler's behavior of running in
either 32- or 64-register mode, only how the register count is passed to
the GPU.
This commit is contained in:
Lars-Ivar Hesselberg Simonsen 2026-02-25 14:44:18 +01:00
parent b7afb629c3
commit ad81596b6d
9 changed files with 97 additions and 7 deletions

View file

@ -4456,9 +4456,12 @@ prepare_shader(struct panfrost_compiled_shader *state,
else if (vs)
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = state->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(state->info.work_reg_count);
#endif
cfg.binary = state->bin.gpu;
cfg.preload.r48_r63 = (state->info.preload >> 48);
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
@ -4476,8 +4479,12 @@ prepare_shader(struct panfrost_compiled_shader *state,
#if PAN_ARCH < 12
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = state->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(state->info.work_reg_count);
#endif
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (state->info.preload >> 48);
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);

View file

@ -1105,7 +1105,11 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
#if PAN_ARCH >= 15
cfg.register_count = preload_shader->info.work_reg_count;
#else
cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
#endif
cfg.binary = preload_shader->address;
cfg.preload.r48_r63 = preload_shader->info.preload >> 48;
}

View file

@ -98,8 +98,12 @@ panfrost_precomp_shader_create(
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&res->info);
#if PAN_ARCH >= 15
cfg.register_count = res->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(res->info.work_reg_count);
#endif
cfg.binary = res->code_ptr;
cfg.preload.r48_r63 = (res->info.preload >> 48);
cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info);
@ -326,7 +330,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_0), fau_ptr);
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = shader->info.work_reg_count;
ctx.pointer = shader->state_ptr;
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), ptr);
#else
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), shader->state_ptr);
#endif
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, TSD_0), tsd);
/* Global attribute offset */

View file

@ -651,8 +651,19 @@ pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
if (fau)
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
GENX(pandecode_shader)
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
uint64_t addr = cs_get_u64(qctx, reg_spd);
#if PAN_ARCH >= 15
const struct mali_shader_program_pointer_packed spp_packed = {
.opaque[0] = addr & 0xFFFFFFFF,
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
};
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
;
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
"Shader Program Pointer (%" PRIx64 "):\n", addr);
addr = spp.pointer;
#endif
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
@ -693,8 +704,19 @@ pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
if (fau)
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
GENX(pandecode_shader)
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
uint64_t addr = cs_get_u64(qctx, reg_spd);
#if PAN_ARCH >= 15
const struct mali_shader_program_pointer_packed spp_packed = {
.opaque[0] = addr & 0xFFFFFFFF,
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
};
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
;
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
"Shader Program Pointer (%" PRIx64 "):\n", addr);
addr = spp.pointer;
#endif
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

View file

@ -2040,14 +2040,20 @@
<field name="Suppress NaN" size="1" start="0:16" type="bool"/>
<field name="Flush to zero mode" size="2" start="0:17" type="Flush to zero mode"/>
<field name="Suppress Inf" size="1" start="0:19" type="bool"/>
<field name="Register Count" size="5" start="0:20" type="uint" modifier="align(16) shr(3) minus(1)"/>
<field name="Requires helper threads" size="1" start="0:28" type="bool"/> <!-- Fragment only -->
<field name="Shader contains JUMP_EX" size="1" start="0:29" type="bool"/>
<field name="Register allocation" size="2" start="0:30" type="Shader Register Allocation"/>
<field name="Preload" size="16" start="1:0" type="Preload"/>
<field name="Max Warps" size="16" start="1:16" type="uint"/>
<field name="Binary" size="64" start="2:0" type="address"/>
</struct>
<!-- Only used by RUN_COMPUTE -->
<!-- Two 32-bit words that callers combine into a single 64-bit value:
     the shader register count sits in bits 0-4 and the shader program
     address occupies the 51 bits starting at bit 5.
     NOTE(review): the shr(5) modifier presumably means the address is
     stored shifted right by 5 bits (so it must be 32-byte aligned), and
     "align(16) shr(3) minus(1)" presumably rounds the count up to a
     multiple of 16 before encoding it as (count / 8) - 1; confirm against
     the genxml modifier handling. -->
<struct name="Shader Program Pointer" size="2" align="8">
<field name="Register Count" size="5" start="0:0" type="uint" modifier="align(16) shr(3) minus(1)"/>
<field name="Pointer" size="51" start="0:5" type="address" modifier="shr(5)"/>
</struct>
<struct name="Scissor">
<field name="Scissor Minimum X" size="16" start="0:0" type="uint"/>
<field name="Scissor Minimum Y" size="16" start="0:16" type="uint"/>

View file

@ -209,9 +209,20 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_FAU), fau_ptr);
}
if (compute_state_dirty(cmdbuf, CS))
if (compute_state_dirty(cmdbuf, CS)) {
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = cs->info.work_reg_count;
ctx.pointer = panvk_priv_mem_dev_addr(cs->spd);
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), ptr);
#else
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD),
panvk_priv_mem_dev_addr(cs->spd));
#endif
}
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_TSD), tsd);

View file

@ -82,8 +82,18 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_FAU), fau_ptr);
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = shader->info.work_reg_count;
ctx.pointer = panvk_priv_mem_dev_addr(shader->spd);
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), ptr);
#else
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD),
panvk_priv_mem_dev_addr(shader->spd));
#endif
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_TSD), tsd);

View file

@ -239,8 +239,12 @@ get_frame_shader(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
#endif
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
cfg.preload.r48_r63 = shader->info.preload >> 48;
}

View file

@ -1180,8 +1180,12 @@ panvk_shader_upload(struct panvk_device *dev,
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
@ -1199,8 +1203,12 @@ panvk_shader_upload(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
@ -1214,8 +1222,12 @@ panvk_shader_upload(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
shader->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (shader->info.preload >> 48);