panvk: Hook up RUN_FRAGMENT2

Set the FBD size/alignment correctly and emit the fragment staging
registers before issuing fragment commands.

Also, move some temporary registers to non-conflicting ones.

Incremental rendering is left as TODO for later.

Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
This commit is contained in:
Marc Alcala Prieto 2026-04-17 14:45:56 +02:00
parent 52d6c19293
commit d425c52a8a
4 changed files with 212 additions and 21 deletions

View file

@ -74,7 +74,11 @@ static inline uint32_t
get_fbd_size(bool has_zs_ext, uint32_t rt_count)
{
assert(rt_count >= 1 && rt_count <= MAX_RTS);
#if PAN_ARCH >= 14
uint32_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
#else
uint32_t fbd_size = pan_size(FRAMEBUFFER);
#endif
if (has_zs_ext)
fbd_size += pan_size(ZS_CRC_EXTENSION);
fbd_size += pan_size(RENDER_TARGET) * rt_count;
@ -209,13 +213,27 @@ enum panvk_cs_regs {
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
#endif
#if PAN_ARCH >= 14
/* RUN_FRAGMENT2 staging regs.
* SW ABI:
* - r54:55 contain the pointer to the current FBD layer state.
* - r58:59 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment2.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55,
PANVK_CS_REG_FBD_LAYER_PTR = 54,
PANVK_CS_REG_TILER_DESC_PTR = 58,
#else
/* RUN_FRAGMENT staging regs.
* SW ABI:
* - r38:39 contain the pointer to the first tiler descriptor. This is
* - r58:59 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
PANVK_CS_REG_TILER_DESC_PTR = 58,
#endif
/* RUN_COMPUTE staging regs. */
PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
@ -870,4 +888,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages,
void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf,
struct panvk_cs_deps deps);
#if PAN_ARCH >= 14
/* Load the per-layer fragment state into the FRAGMENT staging registers.
 *
 * b:       command stream builder the load instructions are emitted into.
 * fbd_ptr: 64-bit register holding the address of a struct pan_fbd_layer;
 *          every load below uses it as the base address and only reads it.
 *
 * The loads are followed by cs_flush_loads(), presumably so the loaded values
 * have landed before a subsequent RUN_FRAGMENT2 consumes the staging
 * registers -- NOTE(review): confirm against the cs_builder helpers.
 */
static inline void
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
{
/* Emit the dynamic fragment state. This state may change per-layer. */
/* 32-bit fields: flags0, flags2 and z_clear. */
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
offsetof(struct pan_fbd_layer, flags0));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
offsetof(struct pan_fbd_layer, flags2));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
offsetof(struct pan_fbd_layer, z_clear));
/* 64-bit fields: tiler, rtd_pointer, dbd_pointer, frame_argument and
 * dcd_pointer. */
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, tiler));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, rtd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dbd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
offsetof(struct pan_fbd_layer, frame_argument));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dcd_pointer));
/* Flush the loads issued above before returning to the caller. */
cs_flush_loads(b);
}
#endif /* PAN_ARCH >= 14 */
#endif /* PANVK_CMD_BUFFER_H */

View file

@ -1245,8 +1245,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
uint32_t fbd_sz = calc_fbd_size(cmdbuf);
uint32_t fbds_sz = enabled_layer_count * fbd_sz;
cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
#if PAN_ARCH >= 14
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
#else
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
#endif
cmdbuf->state.gfx.render.fbds =
panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment);
if (!cmdbuf->state.gfx.render.fbds.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@ -1323,7 +1328,16 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
fbd_flags = new_fbd_flags;
}
#if PAN_ARCH >= 14
/* fbd_flags is unused on v14+. */
assert(!fbd_flags);
#endif
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) {
struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
@ -1366,16 +1380,21 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
/* Wait for IR info push to complete */
cs_wait_slot(b, SB_ID(LS));
#endif /* PAN_ARCH >= 14 */
bool unset_provoking_vertex =
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
if (copy_fbds) {
struct cs_index cur_tiler = cs_reg64(b, 38);
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
#if PAN_ARCH >= 14
struct cs_index dst_fbd_ptr = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
struct cs_index fbd_idx = cs_reg32(b, 47);
struct cs_index src_fbd_ptr = cs_reg64(b, 48);
struct cs_index remaining_layers_in_td = cs_reg32(b, 50);
#endif
struct cs_index fbd_idx = cs_reg32(b, 60);
struct cs_index src_fbd_ptr = cs_reg64(b, 64);
struct cs_index remaining_layers_in_td = cs_reg32(b, 61);
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
@ -1455,10 +1474,16 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
-(full_td_count * pan_size(TILER_CONTEXT)));
}
} else {
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
cs_update_frag_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
fbds.gpu | fbd_flags);
cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
cs_move64_to(b, fbd_pointer, fbds.gpu | fbd_flags);
cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR),
cmdbuf->state.gfx.render.tiler);
}
/* If we don't know what provoking vertex mode the application wants yet,
@ -3295,6 +3320,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
static void
setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
{
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
const bool has_zs_ext = pan_fb_has_zs(fb);
@ -3339,6 +3367,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
TILER_OOM_CTX_FIELD_OFFSET(layer_count));
cs_flush_stores(b);
#endif /* PAN_ARCH >= 14 */
}
static uint32_t
@ -3347,17 +3376,87 @@ pack_32_2x16(uint16_t lo, uint16_t hi)
return (((uint32_t)hi) << 16) | (uint32_t)lo;
}
#if PAN_ARCH >= 14
static void
cs_emit_static_fragment_state(struct cs_builder *b,
struct panvk_cmd_buffer *cmdbuf)
{
/* Emit the static fragment staging registers. These don't change per-layer. */
const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render;
const struct pan_fb_layout *fb = &render->fb.layout;
const uint8_t sample_count = render->fb.layout.sample_count;
const struct pan_fb_bbox fb_area_px =
pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px);
const struct pan_fb_bbox bbox_px =
pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px);
assert(pan_fb_bbox_is_valid(fb->tiling_area_px));
struct mali_fragment_bounding_box_packed bbox;
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
cfg.bound_min_x = bbox_px.min_x;
cfg.bound_min_y = bbox_px.min_y;
cfg.bound_max_x = bbox_px.max_x;
cfg.bound_max_y = bbox_px.max_y;
}
struct mali_frame_size_packed frame_size;
pan_pack(&frame_size, FRAME_SIZE, cfg) {
cfg.width = fb->width_px;
cfg.height = fb->height_px;
}
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
bbox.opaque[0] | (uint64_t)bbox.opaque[1] << 32);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
cs_move64_to(
b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
dev->sample_positions->addr.dev +
pan_sample_positions_offset(pan_sample_pattern(sample_count)));
/* Flags 1 */
struct mali_fragment_flags_1_packed flags1;
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
cfg.sample_count = fb->sample_count;
cfg.sample_pattern = pan_sample_pattern(fb->sample_count);
cfg.effective_tile_size = fb->tile_size_px;
cfg.point_sprite_coord_origin_max_y = false;
cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf);
assert(fb->rt_count > 0);
cfg.render_target_count = fb->rt_count;
cfg.color_buffer_allocation = fb->tile_rt_alloc_B;
}
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
}
#endif /* PAN_ARCH >= 14 */
static VkResult
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
{
#if PAN_ARCH < 14
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
#endif
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
/* Now initialize the fragment bits. */
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
cs_update_frag_ctx(b) {
cs_emit_static_fragment_state(b, cmdbuf);
cs_emit_layer_fragment_state(b, fbd_pointer);
}
#else
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
cs_update_frag_ctx(b) {
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
pack_32_2x16(fb->tiling_area_px.min_x,
@ -3366,6 +3465,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
pack_32_2x16(fb->tiling_area_px.max_x,
fb->tiling_area_px.max_y));
}
#endif
bool simul_use =
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
@ -3397,6 +3497,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* state for this renderpass, so it's safe to enable. */
struct cs_index addr_reg = cs_scratch_reg64(b, 0);
struct cs_index length_reg = cs_scratch_reg32(b, 2);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
handler_idx * dev->tiler_oom.handler_stride;
@ -3404,6 +3507,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
length_reg);
#endif
/* Wait for the tiling to be done before submitting the fragment job. */
wait_finish_tiling(cmdbuf);
@ -3418,8 +3522,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* up. */
cs_move64_to(b, addr_reg, 0);
cs_move32_to(b, length_reg, 0);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
length_reg);
#endif
/* Applications tend to forget to describe subpass dependencies, especially
* when it comes to write -> read dependencies on attachments. The
@ -3435,8 +3543,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
}
if (cmdbuf->state.gfx.render.layer_count <= 1) {
#if PAN_ARCH >= 14
cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
} else {
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4);
struct cs_index remaining_layers = cs_scratch_reg32(b, 4);
@ -3445,12 +3558,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) {
cs_add32(b, remaining_layers, remaining_layers, -1);
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(b, fbd_pointer);
cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz);
cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz);
}
}
@ -3464,8 +3583,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
struct cs_index completed_top = cs_scratch_reg64(b, 10);
struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
struct cs_index cur_tiler = cs_reg64(b, 38);
struct cs_index tiler_count = cs_reg32(b, 47);
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
struct cs_index tiler_count = cs_reg32(b, 60);
struct cs_index oq_chain = cs_scratch_reg64(b, 10);
struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);

View file

@ -13,8 +13,13 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
switch (reg) {
/* The bbox is set up by the fragment subqueue, we should not modify it. */
#if PAN_ARCH >= 14
case 28:
case 29:
#else
case 42:
case 43:
#endif
/* We should only load from the subqueue context. */
case PANVK_CS_REG_SUBQUEUE_CTX_START:
case PANVK_CS_REG_SUBQUEUE_CTX_END:
@ -42,8 +47,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8),
8 * sizeof(uint32_t));
#if PAN_ARCH >= 14
const size_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
#else
const size_t fbd_size = sizeof(struct mali_framebuffer_packed);
#endif
if (has_zs_ext) {
const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed);
const uint16_t dbd_offset = fbd_size;
/* Copy the whole DBD. */
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other,
@ -57,8 +68,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
}
const uint16_t rts_offset =
sizeof(struct mali_framebuffer_packed) +
(has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
fbd_size + (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
for (uint32_t rt = 0; rt < rt_count; rt++) {
const uint16_t rt_offset =
@ -110,12 +120,14 @@ generate_tiler_oom_handler(struct panvk_device *dev,
.tracebuf_addr_offset =
offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
};
struct mali_framebuffer_pointer_packed fb_tag;
#if PAN_ARCH < 14
struct mali_framebuffer_pointer_packed fb_tag;
pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) {
cfg.zs_crc_extension_present = has_zs_ext;
cfg.render_target_count = rt_count;
}
#endif
cs_function_def(&b, &handler, handler_ctx) {
struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
@ -140,7 +152,7 @@ generate_tiler_oom_handler(struct panvk_device *dev,
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4);
/* The tiler pointer is pre-filled. */
struct cs_index tiler_ptr = cs_reg64(&b, 38);
struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR);
cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx,
TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr));
@ -175,12 +187,22 @@ generate_tiler_oom_handler(struct panvk_device *dev,
/* Flush copies before the RUN_FRAGMENT. */
cs_wait_slot(&b, SB_ID(LS));
#if PAN_ARCH >= 14
/* Set FBD pointer to the scratch fbd */
struct cs_index fbd_pointer = cs_reg64(&b, PANVK_CS_REG_FBD_LAYER_PTR);
cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0);
cs_emit_layer_fragment_state(&b, fbd_pointer);
cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
/* Set FBD pointer to the scratch fbd */
cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER),
scratch_fbd_ptr_reg, fb_tag.opaque[0]);
cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
/* Serialize run fragments since we reuse FBD for the runs */
cs_wait_slots(&b, dev->csf.sb.all_iters_mask);

View file

@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue)
tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size;
alloc_info.size = get_fbd_size(true, MAX_RTS);
alloc_info.alignment = pan_alignment(FRAMEBUFFER);
#if PAN_ARCH >= 14
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
#else
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
#endif
alloc_info.alignment = fbds_alignment;
tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) {
result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,