pan/genxml: Implement RUN_FRAGMENT2

Add support for emitting and decoding RUN_FRAGMENT2 instructions.

Some existing decoding logic from decode.c is modified to be reusable
by the new RUN_FRAGMENT2 decoding logic.

Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
This commit is contained in:
Marc Alcala Prieto 2026-04-17 11:16:05 +02:00
parent 4a0477373d
commit b02aaa2e33
4 changed files with 291 additions and 48 deletions

View file

@ -824,7 +824,11 @@ cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
case MALI_CS_OPCODE_STORE_MULTIPLE:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2:
#else
case MALI_CS_OPCODE_RUN_FRAGMENT:
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN:
#if PAN_ARCH >= 12
case MALI_CS_OPCODE_RUN_IDVS2:
@ -1614,6 +1618,22 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
}
#endif
#if PAN_ARCH >= 14
/* Emit a RUN_FRAGMENT2 instruction (the PAN_ARCH >= 14 replacement for
 * RUN_FRAGMENT).
 *
 * @b:          command-stream builder the instruction is appended to
 * @enable_tem: whether the tile-enable map is used for this fragment run
 * @tile_order: tile iteration order (enum mali_tile_render_order)
 *
 * NOTE(review): unlike the pre-v14 cs_run_fragment, this variant takes no
 * flags_override — presumably the field was dropped from the v14 encoding;
 * confirm against the RUN_FRAGMENT2 instruction definition in the XML.
 */
static inline void
cs_run_fragment2(struct cs_builder *b, bool enable_tem,
                 enum mali_tile_render_order tile_order)
{
   /* Staging regs: make sure all pending loads into the fragment staging
    * registers have landed before the run instruction consumes them. */
   cs_flush_loads(b);

   /* Record that this queue needs the fragment resource. */
   b->req_resource_mask |= CS_FRAG_RES;

   cs_emit(b, RUN_FRAGMENT2, I) {
      I.enable_tem = enable_tem;
      I.tile_order = tile_order;
   }
}
#else
static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
enum mali_tile_render_order tile_order)
@ -1628,6 +1648,7 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
I.tile_order = tile_order;
}
}
#endif
static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
@ -2469,6 +2490,53 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
(int16_t)(offsetof(struct cs_##__type##_trace, __field) - \
sizeof(struct cs_##__type##_trace))
#if PAN_ARCH >= 14
/* Number of fragment staging registers captured per RUN_FRAGMENT2 trace
 * entry. */
#define CS_RUN_FRAGMENT2_SR_COUNT 56
/* Mask selecting staging registers r0..r55 for capture. */
#define CS_RUN_FRAGMENT2_SR_MASK BITFIELD64_RANGE(0, CS_RUN_FRAGMENT2_SR_COUNT)

/* On-GPU trace record written for each traced RUN_FRAGMENT2. The layout is
 * consumed by the trace decoder (pandecode_cs_trace), so field order and the
 * 64-byte alignment are part of the contract. */
struct cs_run_fragment2_trace {
   /* Instruction pointer of the RUN_FRAGMENT2 itself. */
   uint64_t ip;
   /* Snapshot of staging registers r0..r55 at execution time. */
   uint32_t sr[CS_RUN_FRAGMENT2_SR_COUNT];
} __attribute__((aligned(64)));

/* Emit a RUN_FRAGMENT2 plus, when tracing is enabled, the store sequence
 * that records a cs_run_fragment2_trace entry (IP and staging registers)
 * into the trace buffer.
 *
 * @scratch_regs: scratch register range; reg+0/+1 hold the trace-buffer
 *                address, reg+2/+3 hold the captured IP.
 */
static inline void
cs_trace_run_fragment2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                       struct cs_index scratch_regs, bool enable_tem,
                       enum mali_tile_render_order tile_order)
{
   /* Fast path: tracing disabled, just emit the run instruction. */
   if (likely(!ctx->enabled)) {
      cs_run_fragment2(b, enable_tem, tile_order);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_fragment2_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_fragment2(b, enable_tem, tile_order);

   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment2, ip));

   /* Store the staging registers in 16-register chunks (the store-multiple
    * mask is 16 bits wide), packing them contiguously into trace->sr. */
   ASSERTED unsigned sr_count = 0;
   unsigned sr_offset = cs_trace_field_offset(run_fragment2, sr);
   for (unsigned i = 0; i < CS_RUN_FRAGMENT2_SR_COUNT; i += 16) {
      unsigned mask = (CS_RUN_FRAGMENT2_SR_MASK >> i) & BITFIELD_MASK(16);

      if (!mask)
         continue;

      cs_store(b, cs_reg_tuple(b, i, util_last_bit(mask)), tracebuf_addr, mask,
               sr_offset);
      sr_offset += util_bitcount(mask) * sizeof(uint32_t);
      sr_count += util_bitcount(mask);
   }
   /* All 56 registers must have been captured exactly once. */
   assert(sr_count == CS_RUN_FRAGMENT2_SR_COUNT);

   cs_flush_stores(b);
}
#else
struct cs_run_fragment_trace {
uint64_t ip;
uint32_t sr[7];
@ -2500,6 +2568,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_trace_field_offset(run_fragment, sr));
cs_flush_stores(b);
}
#endif
#if PAN_ARCH >= 13
#define CS_RUN_FULLSCREEN_SR_MASK \

View file

@ -152,22 +152,22 @@ pandecode_rt(struct pandecode_context *ctx, unsigned index, uint64_t gpu_va)
}
static void
pandecode_rts(struct pandecode_context *ctx, uint64_t gpu_va,
const struct MALI_FRAMEBUFFER_PARAMETERS *fb)
void
GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
uint32_t render_target_count)
{
pandecode_log(ctx, "Color Render Targets @%" PRIx64 ":\n", gpu_va);
ctx->indent++;
for (int i = 0; i < (fb->render_target_count); i++)
for (int i = 0; i < render_target_count; i++)
pandecode_rt(ctx, i, gpu_va);
ctx->indent--;
pandecode_log(ctx, "\n");
}
static void
pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
void
GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va)
{
const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR(
ctx, zs_crc_packed, (uint64_t)gpu_va);
@ -223,22 +223,65 @@ pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
#if PAN_ARCH >= 6
static void
pandecode_sample_locations(struct pandecode_context *ctx, const void *fb)
void
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
uint64_t dcd_pointer, unsigned pre_frame_0,
unsigned pre_frame_1, unsigned post_frame,
unsigned job_type_param, uint64_t gpu_id)
{
pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params);
const unsigned dcd_size = pan_size(DRAW);
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, params.sample_locations);
if (pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (0 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", dcd_pointer,
pre_frame_0);
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n",
params.sample_locations);
if (pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (1 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
dcd_pointer + (1 * dcd_size));
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (2 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Post frame:\n");
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
}
/* Dump the sample position array at @sample_locations.
 *
 * Each entry is a pair of 16-bit values decoded by subtracting a 128 bias.
 * NOTE(review): 33 entries are printed — assumed to cover every supported
 * sample pattern; confirm the count against the hardware layout.
 */
void
GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
                                 uint64_t sample_locations)
{
   const uint16_t *PANDECODE_PTR_VAR(ctx, samples, sample_locations);

   pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", sample_locations);
   for (int i = 0; i < 33; i++) {
      pandecode_log(ctx, " (%d, %d),\n", samples[2 * i] - 128,
                    samples[2 * i + 1] - 128);
   }
}
#endif
#endif /* PAN_ARCH >= 6 */
#if PAN_ARCH < 14
struct pandecode_fbd
GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
bool is_fragment, uint64_t gpu_id)
@ -248,46 +291,17 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
DUMP_UNPACKED(ctx, FRAMEBUFFER_PARAMETERS, params, "Parameters:\n");
#if PAN_ARCH >= 6
pandecode_sample_locations(ctx, fb);
GENX(pandecode_sample_locations)(ctx, params.sample_locations);
unsigned dcd_size = pan_size(DRAW);
unsigned job_type_param = 0;
#if PAN_ARCH <= 9
job_type_param = MALI_JOB_TYPE_FRAGMENT;
#endif
if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (0 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n",
params.frame_shader_dcds, params.pre_frame_0);
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (1 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
params.frame_shader_dcds + (1 * dcd_size));
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (2 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Post frame:\n");
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
GENX(pandecode_frame_shader_dcds)(ctx, params.frame_shader_dcds,
params.pre_frame_0, params.pre_frame_1,
params.post_frame, job_type_param, gpu_id);
#else
DUMP_SECTION(ctx, FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n");
@ -312,13 +326,13 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
gpu_va += pan_size(FRAMEBUFFER);
if (params.has_zs_crc_extension) {
pandecode_zs_crc_ext(ctx, gpu_va);
GENX(pandecode_zs_crc_ext)(ctx, gpu_va);
gpu_va += pan_size(ZS_CRC_EXTENSION);
}
if (is_fragment)
pandecode_rts(ctx, gpu_va, &params);
GENX(pandecode_rts)(ctx, gpu_va, params.render_target_count);
return (struct pandecode_fbd){
.rt_count = params.render_target_count,
@ -336,6 +350,7 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
};
#endif
}
#endif /* PAN_ARCH < 14 */
#if PAN_ARCH >= 5
uint64_t

View file

@ -275,4 +275,22 @@ void GENX(pandecode_depth_stencil)(struct pandecode_context *ctx,
#endif
#if PAN_ARCH >= 6
void GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
uint64_t sample_locations);
void
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
uint64_t dcd_pointer, unsigned pre_frame_0,
unsigned pre_frame_1, unsigned post_frame,
unsigned job_type_param, uint64_t gpu_id);
#endif
#if PAN_ARCH >= 5
void GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
uint32_t render_target_count);
void GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va);
#endif
#endif /* __MMAP_TRACE_H__ */

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2022-2023 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -343,6 +344,23 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
static const char *tile_order[] = {
"zorder", "horizontal", "vertical", "unknown",
"unknown", "rev_horizontal", "rev_vertical", "unknown",
"unknown", "unknown", "unknown", "unknown",
"unknown", "unknown", "unknown", "unknown",
};
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
fprintf(fp, "RUN_FRAGMENT2%s.tile_order=%s",
I.enable_tem ? ".tile_enable_map_enable" : "",
tile_order[I.tile_order]);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
static const char *tile_order[] = {
"zorder", "horizontal", "vertical", "unknown",
@ -350,6 +368,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
"unknown", "unknown", "unknown", "unknown",
"unknown", "unknown", "unknown", "unknown",
};
cs_unpack(instr, CS_RUN_FRAGMENT, I);
fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s",
@ -358,6 +377,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
tile_order[I.tile_order]);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
cs_unpack(instr, CS_RUN_FULLSCREEN, I);
@ -1097,6 +1117,99 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
}
#endif
#if PAN_ARCH >= 14
/* Decode the register state consumed by a RUN_FRAGMENT2 instruction and dump
 * the descriptors it points at (tiler heap, render targets, depth buffer,
 * frame-shader DCDs, sample locations).
 *
 * Staging-register values are read from @qctx->regs. @fp and @I are kept for
 * signature symmetry with the other run decoders — the unpacked instruction
 * fields are not needed here, everything of interest lives in registers.
 */
static void
pandecode_run_fragment2(struct pandecode_context *ctx, FILE *fp,
                        struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT2 *I)
{
   /* NOTE(review): assumes staging registers are not meaningful while an
    * exception handler runs — confirm against the other run decoders. */
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

   pandecode_log(ctx, "Iter trace ID0: %" PRIu32 "\n",
                 cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID0));
   pandecode_log(ctx, "Iter trace ID1: %" PRIu32 "\n",
                 cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID1));

   /* Tile-enable map buffer (only meaningful when the instruction enables
    * TEM). */
   pandecode_log(ctx, "TEM pointer: %" PRIx64 "\n",
                 cs_get_u64(qctx, MALI_FRAGMENT_SR_TEM_POINTER));
   pandecode_log(ctx, "TEM row stride: %" PRIu32 "\n",
                 cs_get_u32(qctx, MALI_FRAGMENT_SR_TEM_ROW_STRIDE));

   /* 11 IRD buffer pointers, stored as consecutive 64-bit register pairs. */
   for (unsigned i = 0; i < 11; ++i) {
      const unsigned reg = MALI_FRAGMENT_SR_IRD_BUFFER_POINTER_0 + (i * 2);
      pandecode_log(ctx, "IRD buffer pointer %u: %" PRIx64 "\n", i,
                    cs_get_u64(qctx, reg));
   }

   DUMP_CL(ctx, FRAGMENT_FLAGS_3, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_3],
           "Flags 3:\n");
   DUMP_CL(ctx, FRAGMENT_BOUNDING_BOX, &qctx->regs[MALI_FRAGMENT_SR_BBOX_MIN],
           "Bounding Box:\n");
   DUMP_CL(ctx, FRAME_SIZE, &qctx->regs[MALI_FRAGMENT_SR_FRAME_SIZE],
           "Frame size:\n");

   /* Flags 0 and 1 are unpacked (not just dumped) because fields from them
    * are needed below: pre/post-frame shader modes and the RT count. */
   pan_unpack((const struct mali_fragment_flags_0_packed *)&qctx
                 ->regs[MALI_FRAGMENT_SR_FLAGS_0],
              FRAGMENT_FLAGS_0, flags0_unpacked);
   DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_0, flags0_unpacked, "Flags 0:\n");

   pan_unpack((const struct mali_fragment_flags_1_packed *)&qctx
                 ->regs[MALI_FRAGMENT_SR_FLAGS_1],
              FRAGMENT_FLAGS_1, flags1_unpacked);
   DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_1, flags1_unpacked, "Flags 1:\n");

   DUMP_CL(ctx, FRAGMENT_FLAGS_2, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_2],
           "Flags 2:\n");

   pandecode_log(ctx, "Z clear: %f\n",
                 uif(cs_get_u32(qctx, MALI_FRAGMENT_SR_Z_CLEAR)));

   const uint64_t tiler_pointer =
      cs_get_u64(qctx, MALI_FRAGMENT_SR_TILER_DESCRIPTOR_POINTER);
   pandecode_log(ctx, "Tiler descriptor pointer: 0x%" PRIx64 "\n",
                 tiler_pointer);

   const uint64_t rtd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_RTD_POINTER);
   pandecode_log(ctx, "RTD pointer: 0x%" PRIx64 "\n", rtd_pointer);

   const uint64_t dbd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_DBD_POINTER);
   pandecode_log(ctx, "DBD pointer: 0x%" PRIx64 "\n", dbd_pointer);

   pandecode_log(ctx, "Frame argument: %" PRIx64 "\n",
                 cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_ARG));

   const uint64_t sample_locations =
      cs_get_u64(qctx, MALI_FRAGMENT_SR_SAMPLE_POSITION_ARRAY_POINTER);
   pandecode_log(ctx, "Sample locations: 0x%" PRIx64 "\n", sample_locations);

   const uint64_t dcd_pointer =
      cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_SHADER_DCD_POINTER);
   pandecode_log(ctx, "Frame shader DCD pointer: 0x%" PRIx64 "\n", dcd_pointer);

   DUMP_CL(ctx, VRS_IMAGE, &qctx->regs[MALI_FRAGMENT_SR_VRS_IMAGE],
           "VRS image:\n");

   GENX(pandecode_sample_locations)(ctx, sample_locations);

   /* NOTE(review): job_type_param is 0 here, matching the PAN_ARCH > 9 path
    * of the pre-v14 FBD decoder — confirm no job type applies on v14. */
   const unsigned job_type_param = 0;
   GENX(pandecode_frame_shader_dcds)(ctx, dcd_pointer,
                                     flags0_unpacked.pre_frame_0,
                                     flags0_unpacked.pre_frame_1,
                                     flags0_unpacked.post_frame,
                                     job_type_param, qctx->gpu_id);

   /* Descriptors are only dumped when their pointers are non-null. */
   if (tiler_pointer)
      GENX(pandecode_tiler)(ctx, tiler_pointer);

   if (dbd_pointer)
      GENX(pandecode_zs_crc_ext)(ctx, dbd_pointer);

   if (rtd_pointer)
      GENX(pandecode_rts)(ctx, rtd_pointer, flags1_unpacked.render_target_count);

   ctx->indent--;
}
#else
static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
@ -1115,6 +1228,7 @@ pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
ctx->indent--;
}
#endif /* PAN_ARCH >= 14 */
static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
@ -1261,11 +1375,19 @@ interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
cs_unpack(bytes, CS_RUN_FRAGMENT2, I);
pandecode_run_fragment2(ctx, fp, qctx, &I);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
cs_unpack(bytes, CS_RUN_FRAGMENT, I);
pandecode_run_fragment(ctx, fp, qctx, &I);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
cs_unpack(bytes, CS_RUN_FULLSCREEN, I);
@ -2430,7 +2552,12 @@ print_cs_binary(struct pandecode_context *ctx, uint64_t bin,
#else
case MALI_CS_OPCODE_RUN_IDVS:
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2:
#else
case MALI_CS_OPCODE_RUN_FRAGMENT:
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
@ -2539,6 +2666,19 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
struct cs_run_fragment2_trace *frag_trace = trace_data;
assert(trace_size >= sizeof(*frag_trace));
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
memcpy(&regs[0], frag_trace->sr, sizeof(frag_trace->sr));
pandecode_run_fragment2(ctx, ctx->dump_stream, &qctx, &I);
trace_data = frag_trace + 1;
trace_size -= sizeof(*frag_trace);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
struct cs_run_fragment_trace *frag_trace = trace_data;
@ -2550,6 +2690,7 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
trace_size -= sizeof(*frag_trace);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
struct cs_run_fullscreen_trace *fs_trace = trace_data;