pan/genxml: Define RUN_COMPUTE staging registers in an enum

This makes it clearer what each staging register is used for.

It will also reduce the pain of migration on newer gen.

RUN_COMPUTE_INDIRECT also uses the same SRs, so we also map it to RUN_COMPUTE
there.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Benjamin Lee <benjamin.lee@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33736>
This commit is contained in:
Mary Guillemard 2025-02-24 11:05:19 +01:00 committed by Marge Bot
parent 11beea6242
commit c8882d83fd
5 changed files with 104 additions and 59 deletions

View file

@ -881,10 +881,10 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch,
csf_emit_shader_regs(batch, PIPE_SHADER_COMPUTE,
batch->rsd[PIPE_SHADER_COMPUTE]);
cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
cs_move64_to(b, cs_reg64(b, MALI_COMPUTE_SR_TSD_0), batch->tls.gpu);
/* Global attribute offset */
cs_move32_to(b, cs_reg32(b, 32), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_GLOBAL_ATTRIBUTE_OFFSET), 0);
/* Compute workgroup size */
struct mali_compute_size_workgroup_packed wg_size;
@ -903,11 +903,11 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch,
(info->variable_shared_mem == 0);
}
cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_WG_SIZE), wg_size.opaque[0]);
/* Offset */
for (unsigned i = 0; i < 3; ++i)
cs_move32_to(b, cs_reg32(b, 34 + i), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_X), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Y), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Z), 0);
unsigned threads_per_wg = info->block[0] * info->block[1] * info->block[2];
unsigned max_thread_cnt = panfrost_compute_max_thread_count(
@ -920,7 +920,7 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch,
b, address,
pan_resource(info->indirect)->image.data.base + info->indirect_offset);
struct cs_index grid_xyz = cs_reg_tuple(b, 37, 3);
struct cs_index grid_xyz = cs_reg_tuple(b, MALI_COMPUTE_SR_JOB_SIZE_X, 3);
cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0);
/* Wait for the load */
@ -942,8 +942,9 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch,
false, cs_shader_res_sel(0, 0, 0, 0));
} else {
/* Set size in workgroups per dimension immediately */
for (unsigned i = 0; i < 3; ++i)
cs_move32_to(b, cs_reg32(b, 37 + i), info->grid[i]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_X), info->grid[0]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Y), info->grid[1]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Z), info->grid[2]);
/* Pick the task_axis and task_increment to maximize thread utilization. */
unsigned task_axis = MALI_TASK_AXIS_X;
@ -984,10 +985,11 @@ GENX(csf_launch_xfb)(struct panfrost_batch *batch,
{
struct cs_builder *b = batch->csf.cs.builder;
cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
cs_move64_to(b, cs_reg64(b, MALI_COMPUTE_SR_TSD_0), batch->tls.gpu);
/* TODO: Indexing. Also, attribute_offset is a legacy feature.. */
cs_move32_to(b, cs_reg32(b, 32), batch->ctx->offset_start);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_GLOBAL_ATTRIBUTE_OFFSET),
batch->ctx->offset_start);
/* Compute workgroup size */
struct mali_compute_size_workgroup_packed wg_size;
@ -1001,15 +1003,16 @@ GENX(csf_launch_xfb)(struct panfrost_batch *batch,
*/
cfg.allow_merging_workgroups = true;
}
cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_WG_SIZE), wg_size.opaque[0]);
/* Offset */
for (unsigned i = 0; i < 3; ++i)
cs_move32_to(b, cs_reg32(b, 34 + i), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_X), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Y), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Z), 0);
cs_move32_to(b, cs_reg32(b, 37), count);
cs_move32_to(b, cs_reg32(b, 38), info->instance_count);
cs_move32_to(b, cs_reg32(b, 39), 1);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_X), count);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Y),
info->instance_count);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Z), 1);
csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
batch->rsd[PIPE_SHADER_VERTEX]);

View file

@ -320,18 +320,18 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
struct cs_builder *b = batch->csf.cs.builder;
/* No resource table */
cs_move64_to(b, cs_reg64(b, 0), 0);
cs_move64_to(b, cs_reg64(b, MALI_COMPUTE_SR_SRT_0), 0);
uint64_t fau_count =
DIV_ROUND_UP(BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 8);
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_reg64(b, 8), fau_ptr);
cs_move64_to(b, cs_reg64(b, MALI_COMPUTE_SR_FAU_0), fau_ptr);
cs_move64_to(b, cs_reg64(b, 16), shader->state_ptr);
cs_move64_to(b, cs_reg64(b, 24), tsd);
cs_move64_to(b, cs_reg64(b, MALI_COMPUTE_SR_SPD_0), shader->state_ptr);
cs_move64_to(b, cs_reg64(b, MALI_COMPUTE_SR_TSD_0), tsd);
/* Global attribute offset */
cs_move32_to(b, cs_reg32(b, 32), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_GLOBAL_ATTRIBUTE_OFFSET), 0);
/* Compute workgroup size */
struct mali_compute_size_workgroup_packed wg_size;
@ -341,17 +341,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
cfg.workgroup_size_z = shader->local_size.z;
cfg.allow_merging_workgroups = false;
}
cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_WG_SIZE), wg_size.opaque[0]);
/* Job offset */
cs_move32_to(b, cs_reg32(b, 34), 0);
cs_move32_to(b, cs_reg32(b, 35), 0);
cs_move32_to(b, cs_reg32(b, 36), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_X), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Y), 0);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Z), 0);
/* Job size */
cs_move32_to(b, cs_reg32(b, 37), grid.count[0]);
cs_move32_to(b, cs_reg32(b, 38), grid.count[1]);
cs_move32_to(b, cs_reg32(b, 39), grid.count[2]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_X), grid.count[0]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Y), grid.count[1]);
cs_move32_to(b, cs_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Z), grid.count[2]);
unsigned threads_per_wg =
shader->local_size.x * shader->local_size.y * shader->local_size.z;

View file

@ -840,6 +840,33 @@
<field name="Opcode" size="8" start="56" type="CS Opcode" default="SYNC_WAIT64"/>
</struct>
<enum name="COMPUTE SR">
<!-- Staging registers for RUN_COMPUTE; RUN_COMPUTE_INDIRECT uses the same
     layout. The SRT/FAU/SPD/TSD entries are 64-bit and each occupies a pair
     of 32-bit registers, hence the stride of 2 between consecutive values. -->
<!-- SRT_n: shader resource table pointer (64-bit). -->
<value name="SRT_0" value="0"/>
<value name="SRT_1" value="2"/>
<value name="SRT_2" value="4"/>
<value name="SRT_3" value="6"/>
<!-- FAU_n: FAU buffer pointer; callers pack the FAU word count into the
     top byte of the address (see the fau_count << 56 in the C users). -->
<value name="FAU_0" value="8"/>
<value name="FAU_1" value="10"/>
<value name="FAU_2" value="12"/>
<value name="FAU_3" value="14"/>
<!-- SPD_n: shader program descriptor pointer (64-bit). -->
<value name="SPD_0" value="16"/>
<value name="SPD_1" value="18"/>
<value name="SPD_2" value="20"/>
<value name="SPD_3" value="22"/>
<!-- TSD_n: thread storage descriptor pointer (64-bit); callers load it
     with the TLS area address. -->
<value name="TSD_0" value="24"/>
<value name="TSD_1" value="26"/>
<value name="TSD_2" value="28"/>
<value name="TSD_3" value="30"/>
<!-- 32-bit registers from here on. -->
<value name="GLOBAL_ATTRIBUTE_OFFSET" value="32"/>
<!-- WG_SIZE holds a packed COMPUTE_SIZE_WORKGROUP word. -->
<value name="WG_SIZE" value="33"/>
<!-- JOB_OFFSET_{X,Y,Z}: dispatch base offset per dimension. -->
<value name="JOB_OFFSET_X" value="34"/>
<value name="JOB_OFFSET_Y" value="35"/>
<value name="JOB_OFFSET_Z" value="36"/>
<!-- JOB_SIZE_{X,Y,Z}: workgroup count per dimension; X/Y/Z are
     consecutive so indirect dispatch can load all three with one
     3-register tuple load. -->
<value name="JOB_SIZE_X" value="37"/>
<value name="JOB_SIZE_Y" value="38"/>
<value name="JOB_SIZE_Z" value="39"/>
</enum>
<enum name="IDVS SR">
<value name="SRT_0" value="0"/>
<value name="SRT_1" value="2"/>

View file

@ -236,22 +236,24 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_update_compute_ctx(b) {
if (compute_state_dirty(cmdbuf, CS) ||
compute_state_dirty(cmdbuf, DESC_STATE))
cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_SRT_0),
cs_desc_state->res_table);
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
uint64_t fau_ptr = cmdbuf->state.compute.push_uniforms |
((uint64_t)shader->fau.total_count << 56);
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_FAU_0), fau_ptr);
}
if (compute_state_dirty(cmdbuf, CS))
cs_move64_to(b, cs_sr_reg64(b, 16),
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_SPD_0),
panvk_priv_mem_dev_addr(shader->spd));
cs_move64_to(b, cs_sr_reg64(b, 24), tsd);
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_TSD_0), tsd);
/* Global attribute offset */
cs_move32_to(b, cs_sr_reg32(b, 32), 0);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_GLOBAL_ATTRIBUTE_OFFSET),
0);
struct mali_compute_size_workgroup_packed wg_size;
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
@ -260,47 +262,54 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cfg.workgroup_size_z = shader->local_size.z;
cfg.allow_merging_workgroups = false;
}
cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, 34),
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_WG_SIZE),
wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_X),
info->wg_base.x * shader->local_size.x);
cs_move32_to(b, cs_sr_reg32(b, 35),
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Y),
info->wg_base.y * shader->local_size.y);
cs_move32_to(b, cs_sr_reg32(b, 36),
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Z),
info->wg_base.z * shader->local_size.z);
if (indirect) {
/* Load parameters from indirect buffer and update workgroup count
* registers and sysvals */
cs_move64_to(b, cs_scratch_reg64(b, 0),
info->indirect.buffer_dev_addr);
cs_load_to(b, cs_sr_reg_tuple(b, 37, 3), cs_scratch_reg64(b, 0),
BITFIELD_MASK(3), 0);
cs_load_to(b, cs_sr_reg_tuple(b, MALI_COMPUTE_SR_JOB_SIZE_X, 3),
cs_scratch_reg64(b, 0), BITFIELD_MASK(3), 0);
cs_move64_to(b, cs_scratch_reg64(b, 0),
cmdbuf->state.compute.push_uniforms);
cs_wait_slot(b, SB_ID(LS), false);
if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
cs_store32(b, cs_sr_reg32(b, 37), cs_scratch_reg64(b, 0),
cs_store32(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_X),
cs_scratch_reg64(b, 0),
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.x)));
}
if (shader_uses_sysval(shader, compute, num_work_groups.y)) {
cs_store32(b, cs_sr_reg32(b, 38), cs_scratch_reg64(b, 0),
cs_store32(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Y),
cs_scratch_reg64(b, 0),
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.y)));
}
if (shader_uses_sysval(shader, compute, num_work_groups.z)) {
cs_store32(b, cs_sr_reg32(b, 39), cs_scratch_reg64(b, 0),
cs_store32(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Z),
cs_scratch_reg64(b, 0),
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.z)));
}
cs_wait_slot(b, SB_ID(LS), false);
} else {
cs_move32_to(b, cs_sr_reg32(b, 37), info->direct.wg_count.x);
cs_move32_to(b, cs_sr_reg32(b, 38), info->direct.wg_count.y);
cs_move32_to(b, cs_sr_reg32(b, 39), info->direct.wg_count.z);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_X),
info->direct.wg_count.x);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Y),
info->direct.wg_count.y);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Z),
info->direct.wg_count.z);
}
}

View file

@ -103,19 +103,21 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
cs_update_compute_ctx(b) {
/* No resource table */
cs_move64_to(b, cs_sr_reg64(b, 0), 0);
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_SRT_0), 0);
uint64_t fau_count =
DIV_ROUND_UP(BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 8);
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_FAU_0), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd));
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_SPD_0),
panvk_priv_mem_dev_addr(shader->spd));
cs_move64_to(b, cs_sr_reg64(b, 24), tsd);
cs_move64_to(b, cs_sr_reg64(b, MALI_COMPUTE_SR_TSD_0), tsd);
/* Global attribute offset */
cs_move32_to(b, cs_sr_reg32(b, 32), 0);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_GLOBAL_ATTRIBUTE_OFFSET),
0);
struct mali_compute_size_workgroup_packed wg_size;
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
@ -124,17 +126,21 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
cfg.workgroup_size_z = shader->local_size.z;
cfg.allow_merging_workgroups = false;
}
cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_WG_SIZE),
wg_size.opaque[0]);
/* Job offset */
cs_move32_to(b, cs_sr_reg32(b, 34), 0);
cs_move32_to(b, cs_sr_reg32(b, 35), 0);
cs_move32_to(b, cs_sr_reg32(b, 36), 0);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_X), 0);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Y), 0);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_OFFSET_Z), 0);
/* Job size */
cs_move32_to(b, cs_sr_reg32(b, 37), grid.count[0]);
cs_move32_to(b, cs_sr_reg32(b, 38), grid.count[1]);
cs_move32_to(b, cs_sr_reg32(b, 39), grid.count[2]);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_X),
grid.count[0]);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Y),
grid.count[1]);
cs_move32_to(b, cs_sr_reg32(b, MALI_COMPUTE_SR_JOB_SIZE_Z),
grid.count[2]);
}
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);