diff --git a/docs/drivers/panfrost.rst b/docs/drivers/panfrost.rst index 2e214ded1e9..d9e3a618128 100644 --- a/docs/drivers/panfrost.rst +++ b/docs/drivers/panfrost.rst @@ -34,6 +34,8 @@ The following hardware is currently supported: +--------------------+---------------+-----------+--------+--------+ | G725 | 5th Gen (v13) | 3.1 | 3.1 | 1.4 | +--------------------+---------------+-----------+--------+--------+ +| G1-Pro | 5th Gen (v14) | 3.1 | 3.1 | 1.4 | ++--------------------+---------------+-----------+--------+--------+ Other Midgard and Bifrost chips (e.g. G71) are not yet supported. diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index 5b3e5e41d97..ba243f5a4ed 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -41,7 +41,7 @@ compile_args_panfrost = [ '-Wno-pointer-arith' ] -panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13'] +panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_versions = [] foreach ver : panfrost_versions @@ -54,7 +54,7 @@ foreach ver : panfrost_versions ] if ver in ['4', '5', '6', '7', '9'] files_panfrost_vx += ['pan_jm.c'] - elif ver in ['10', '12', '13'] + elif ver in ['10', '12', '13', '14'] files_panfrost_vx += ['pan_csf.c'] endif libpanfrost_versions += static_library( diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 87a3cbbe7ea..aa32944195f 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -49,7 +49,7 @@ * functions. */ #if PAN_ARCH <= 9 #define JOBX(__suffix) GENX(jm_##__suffix) -#elif PAN_ARCH <= 13 +#elif PAN_ARCH <= 14 #define JOBX(__suffix) GENX(csf_##__suffix) #else #error "Unsupported arch" diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 2246804b85c..d328c3647aa 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2023 Collabora Ltd. + * Copyright (C) 2026 Arm Ltd. 
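For context on the dispatch pattern this hunk extends: each per-arch file is compiled once per entry in panfrost_versions, and the GENX()/JOBX() macros suffix generic names with the architecture at compile time. A minimal sketch of how the v14 addition binds call sites to the CSF backend (illustrative only; the real macros live in gen_macros.h and pan_cmdstream.c as shown in this diff):

```c
/* Sketch of the per-arch suffixing extended by this patch. Each file is
 * built once per PAN_ARCH; GENX() appends the arch suffix and JOBX()
 * routes generic job entry points to the JM or CSF backend. */
#define PAN_ARCH 14

#if PAN_ARCH == 14
#define GENX(X) X##_v14
#endif

#if PAN_ARCH <= 9
#define JOBX(__suffix) GENX(jm_##__suffix)  /* Job Manager GPUs */
#elif PAN_ARCH <= 14
#define JOBX(__suffix) GENX(csf_##__suffix) /* CSF GPUs, now including v14 */
#endif

/* JOBX(emit_fragment_job) expands to csf_emit_fragment_job_v14, so generic
 * code in pan_cmdstream.c picks up the new backend with no per-arch call
 * sites. */
```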
* SPDX-License-Identifier: MIT */ @@ -13,6 +14,7 @@ #include "pan_cmdstream.h" #include "pan_context.h" #include "pan_csf.h" +#include "pan_fb.h" #include "pan_fb_preload.h" #include "pan_job.h" #include "pan_trace.h" @@ -75,6 +77,87 @@ csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr) (PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct pan_ptr)) + \ offsetof(struct pan_ptr, gpu)) +#if PAN_ARCH >= 14 +static void +cs_emit_static_fragment_state(struct cs_builder *b, + struct panfrost_batch *batch, + const struct pan_fb_info *fb) +{ + struct mali_frame_size_packed frame_size; + pan_pack(&frame_size, FRAME_SIZE, cfg) { + cfg.width = fb->width; + cfg.height = fb->height; + } + + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]); + cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER), + fb->sample_positions); + + struct mali_fragment_flags_1_packed flags1; + pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) { + /* The force_samples setting dictates the sample-count that is used + * for rasterization, and works like D3D11's ForcedSampleCount + * feature: + * + * - If force_samples == 0: Let nr_samples dictate sample count + * - If force_samples == 1: force single-sampled rasterization + * - If force_samples >= 1: force multi-sampled rasterization + * + * This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the + * fragment shader, even when performing single-sampled rendering. + */ + if (fb->pls_enabled) { + cfg.sample_count = 4; + cfg.sample_pattern = pan_sample_pattern(1); + } else if (!fb->force_samples) { + cfg.sample_count = fb->nr_samples; + cfg.sample_pattern = pan_sample_pattern(fb->nr_samples); + } else if (fb->force_samples == 1) { + cfg.sample_count = fb->nr_samples; + cfg.sample_pattern = pan_sample_pattern(1); + } else { + cfg.sample_count = 1; + cfg.sample_pattern = pan_sample_pattern(fb->force_samples); + } + + cfg.effective_tile_size = fb->tile_size; + cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin; + cfg.first_provoking_vertex = fb->first_provoking_vertex; + cfg.render_target_count = MAX2(fb->rt_count, 1); + cfg.color_buffer_allocation = fb->cbuf_allocation; + } + + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]); + + /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */ +} + +static inline void +cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr) +{ + /* Emit the dynamic fragment state. This state may change per-layer. 
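The force_samples handling in cs_emit_static_fragment_state() above is easier to see as a standalone decision tree. A minimal restatement (the struct and helper names here are hypothetical, not driver API; the case values mirror the v14 code above, including the PLS path that forces 4 samples with a single-sample pattern):

```c
struct sample_cfg {
   unsigned sample_count;    /* rasterization sample count */
   unsigned pattern_samples; /* sample positions to fetch */
};

static struct sample_cfg
pick_sample_cfg(bool pls_enabled, unsigned nr_samples, unsigned force_samples)
{
   if (pls_enabled)
      return (struct sample_cfg){4, 1};
   if (force_samples == 0)                       /* normal MSAA */
      return (struct sample_cfg){nr_samples, nr_samples};
   if (force_samples == 1)                       /* forced single-sampled */
      return (struct sample_cfg){nr_samples, 1};
   return (struct sample_cfg){1, force_samples}; /* forced multi-sampled */
}
```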
*/ + + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr, + offsetof(struct pan_fbd_layer, flags0)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr, + offsetof(struct pan_fbd_layer, flags2)); + cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr, + offsetof(struct pan_fbd_layer, z_clear)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, tiler)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, rtd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, dbd_pointer)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr, + offsetof(struct pan_fbd_layer, frame_argument)); + cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr, + offsetof(struct pan_fbd_layer, dcd_pointer)); + + cs_flush_loads(b); +} +#endif /* PAN_ARCH >= 14 */ + static int csf_oom_handler_init(struct panfrost_context *ctx) { @@ -113,13 +196,14 @@ csf_oom_handler_init(struct panfrost_context *ctx) cs_function_def(&b, &handler, handler_ctx) { struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG); - struct cs_index counter = cs_reg32(&b, 47); - struct cs_index zero = cs_reg64(&b, 48); - struct cs_index flush_id = cs_reg32(&b, 48); - struct cs_index tiler_ctx = cs_reg64(&b, 50); - struct cs_index completed_top = cs_reg64(&b, 52); - struct cs_index completed_bottom = cs_reg64(&b, 54); - struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4); + struct cs_index counter = cs_reg32(&b, 31); + struct cs_index zero = cs_reg64(&b, 56); + struct cs_index flush_id = cs_reg32(&b, 58); + struct cs_index tiler_ctx = cs_reg64(&b, 60); + struct cs_index completed_top = cs_reg64(&b, 64); + struct cs_index completed_bottom = cs_reg64(&b, 66); + struct cs_index completed_chunks = cs_reg_tuple(&b, 64, 4); + struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER); /* Ensure that the OTHER endpoint is valid */ #if PAN_ARCH >= 11 @@ -133,12 +217,10 @@ csf_oom_handler_init(struct panfrost_context *ctx) cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter)); cs_wait_slot(&b, 0); cs_if(&b, MALI_CS_CONDITION_GREATER, counter) { - cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx, - FBD_OFFSET(MIDDLE)); + cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(MIDDLE)); } cs_else(&b) { - cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx, - FBD_OFFSET(FIRST)); + cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(FIRST)); } cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MIN), tiler_oom_ctx, @@ -147,11 +229,18 @@ csf_oom_handler_init(struct panfrost_context *ctx) FIELD_OFFSET(bbox_max)); cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0); cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0); +#if PAN_ARCH >= 14 + cs_emit_layer_fragment_state(&b, fbd_pointer); +#endif cs_wait_slot(&b, 0); /* Run the fragment job and wait */ cs_select_endpoint_sb(&b, 3); +#if PAN_ARCH >= 14 + cs_run_fragment2(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif cs_wait_slot(&b, 3); /* Increment counter */ @@ -218,6 +307,21 @@ GENX(csf_cleanup_batch)(struct panfrost_batch *batch) panfrost_pool_cleanup(&batch->csf.cs_chunk_pool); } +#if PAN_ARCH >= 14 +static inline struct pan_ptr +alloc_fbd(struct panfrost_batch *batch) +{ + const struct pan_desc_alloc_info fbd_layer = { + .size = 
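The loads in cs_emit_layer_fragment_state() above assume a per-layer FBD record; its real definition lives in pan_fb.h, which is not part of this diff. A hypothetical sketch, inferred from the offsetof() uses here and the assignments in pan_desc.c below (field types and ordering are assumptions, not authoritative):

```c
#include <stdint.h>

struct pan_fbd_layer {
   uint32_t flags0;         /* packed FRAGMENT_FLAGS_0 -> FLAGS_0 SR */
   uint32_t flags2;         /* packed FRAGMENT_FLAGS_2 -> FLAGS_2 SR */
   uint32_t z_clear;        /* depth clear as raw IEEE-754 bits -> Z_CLEAR */
   uint64_t tiler;          /* -> TILER_DESCRIPTOR_POINTER */
   uint64_t rtd_pointer;    /* -> RTD_POINTER (render target descriptors) */
   uint64_t dbd_pointer;    /* -> DBD_POINTER (ZS/CRC extension) */
   uint64_t frame_argument; /* -> FRAME_ARG, lands in r62:r63 */
   uint64_t dcd_pointer;    /* -> FRAME_SHADER_DCD_POINTER */
};
/* alloc_fbd() below pads this record to 64 bytes (ALIGN_POT(..., 64)) so
 * the trailing ZS/CRC and render target descriptors stay 64-byte aligned. */
```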
ALIGN_POT(sizeof(struct pan_fbd_layer), 64), + .align = alignof(struct pan_fbd_layer), + .nelems = 1, + }; + + return pan_pool_alloc_desc_aggregate( + &batch->pool.base, fbd_layer, PAN_DESC(ZS_CRC_EXTENSION), + PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); +} +#else static inline struct pan_ptr alloc_fbd(struct panfrost_batch *batch) { @@ -225,6 +329,7 @@ alloc_fbd(struct panfrost_batch *batch) &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION), PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); } +#endif /* PAN_ARCH >= 14 */ int GENX(csf_init_batch)(struct panfrost_batch *batch) @@ -758,7 +863,7 @@ GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb) (_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS] #define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx) \ GET_FBD(_ctx, _pass).gpu |= \ - GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu) + GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass)) void GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb, @@ -771,7 +876,7 @@ GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb, /* Default framebuffer descriptor */ batch->framebuffer.gpu |= - GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu); + GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer); if (batch->draw_count == 0) return; @@ -854,15 +959,21 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, cs_vt_end(b, cs_now()); } + struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); + /* Set up the fragment job */ - cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - batch->framebuffer.gpu); + cs_move64_to(b, fbd_pointer, batch->framebuffer.gpu); + cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), (batch->miny << 16) | batch->minx); cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX), ((batch->maxy - 1) << 16) | (batch->maxx - 1)); cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, TEM_POINTER), 0); cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, TEM_ROW_STRIDE), 0); +#if PAN_ARCH >= 14 + cs_emit_static_fragment_state(b, batch, pfb); + cs_emit_layer_fragment_state(b, fbd_pointer); +#endif /* Use different framebuffer descriptor if incremental rendering was * triggered while tiling */ @@ -871,13 +982,19 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0); cs_wait_slot(b, 0); cs_if(b, MALI_CS_CONDITION_GREATER, counter) { - cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), - GET_FBD(oom_ctx, LAST).gpu); + cs_move64_to(b, fbd_pointer, GET_FBD(oom_ctx, LAST).gpu); +#if PAN_ARCH >= 14 + cs_emit_layer_fragment_state(b, fbd_pointer); +#endif } } /* Run the fragment job and wait */ +#if PAN_ARCH >= 14 + cs_run_fragment2(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#else cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); +#endif cs_wait_slot(b, 2); /* Gather freed heap chunks and add them to the heap context free list diff --git a/src/gallium/drivers/panfrost/pan_csf.h b/src/gallium/drivers/panfrost/pan_csf.h index b7be8be2339..2ad51a4a33a 100644 --- a/src/gallium/drivers/panfrost/pan_csf.h +++ b/src/gallium/drivers/panfrost/pan_csf.h @@ -29,7 +29,8 @@ struct pan_csf_tiler_oom_ctx { /* Alternative framebuffer descriptors for incremental rendering */ struct pan_ptr fbds[PAN_INCREMENTAL_RENDERING_PASS_COUNT]; - /* Bounding Box (Register 42 and 43) */ + /* Bounding Box (Register MALI_FRAGMENT_SR_BBOX_MIN and + * 
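A worked example of the bounding-box packing used in csf_emit_fragment_job() above: each 32-bit staging register carries y in the high 16 bits and x in the low 16 bits, and BBOX_MAX holds inclusive maxima, hence the "- 1". The batch dimensions below are illustrative:

```c
#include <stdint.h>

/* For a 256x192 batch at the origin: */
uint32_t bbox_min = (0 << 16) | 0;                 /* miny:minx = 0:0 */
uint32_t bbox_max = ((192 - 1) << 16) | (256 - 1); /* 0x00BF00FF      */
```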
MALI_FRAGMENT_SR_BBOX_MAX) */ uint32_t bbox_min; uint32_t bbox_max; diff --git a/src/gallium/drivers/panfrost/pan_jm.c b/src/gallium/drivers/panfrost/pan_jm.c index 845c238853e..818846927fd 100644 --- a/src/gallium/drivers/panfrost/pan_jm.c +++ b/src/gallium/drivers/panfrost/pan_jm.c @@ -257,8 +257,8 @@ GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb, { PAN_TRACE_FUNC(PAN_TRACE_GL_JM); - batch->framebuffer.gpu |= GENX(pan_emit_fbd)( - fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu); + batch->framebuffer.gpu |= + GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer); } void diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index 86d28d2de7a..ede056ba82f 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -1175,6 +1175,9 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config, case 13: panfrost_cmdstream_screen_init_v13(screen); break; + case 14: + panfrost_cmdstream_screen_init_v14(screen); + break; default: debug_printf("panfrost: Unhandled architecture major %d", dev->arch); panfrost_destroy_screen(&(screen->base)); diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index 14eb7ea59fd..9e6b95d008d 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -155,6 +155,7 @@ void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v13(struct panfrost_screen *screen); +void panfrost_cmdstream_screen_init_v14(struct panfrost_screen *screen); #define perf_debug(ctx, ...) \ do { \ diff --git a/src/panfrost/clc/pan_compile.c b/src/panfrost/clc/pan_compile.c index b2e25e7c53b..3a34897c21b 100644 --- a/src/panfrost/clc/pan_compile.c +++ b/src/panfrost/clc/pan_compile.c @@ -275,7 +275,7 @@ main(int argc, const char **argv) unsigned target_arch = atoi(target_arch_str); - if (target_arch < 4 || target_arch > 13) { + if (target_arch < 4 || target_arch > 14) { fprintf(stderr, "Unsupported target arch %d\n", target_arch); return 1; } diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 2f08cddc49e..989a36b7046 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -703,8 +703,10 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr) assert(intr->intrinsic == nir_intrinsic_load_var_buf_pan || intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan); + const unsigned arch = b->shader->arch; + /* These are only available on Valhall+ */ - assert(b->shader->arch >= 9); + assert(arch >= 9); const bool flat = intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan; const nir_alu_type src_type = nir_intrinsic_src_type(intr); @@ -757,19 +759,36 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr) bool use_imm_form = false; if (nir_src_is_const(intr->src[0])) { imm_offset = nir_src_as_uint(intr->src[0]); - assert(imm_offset < pan_ld_var_buf_off_size(b->shader->arch)); + assert(imm_offset < pan_ld_var_buf_off_size(arch)); use_imm_form = true; } + /* On v14+, flat source formats are removed from LD_VAR_BUF/LD_VAR_BUF_IMM, + * so flat buffer varyings must use the dedicated LD_VAR_BUF_FLAT*. 
+ */ if (use_imm_form) { - bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format, + if (arch >= 14 && flat) { + bi_ld_var_buf_flat_imm_to(b, dest, regfmt, vecsize, imm_offset); + } else { + bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format, BI_UPDATE_STORE, vecsize, imm_offset); + } } else { bi_index offset = bi_src_index(&intr->src[0]); - bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample, - source_format, BI_UPDATE_STORE, vecsize); + if (arch >= 14 && flat) { + bi_ld_var_buf_flat_to(b, dest, offset, regfmt, vecsize); + } else { + bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample, + source_format, BI_UPDATE_STORE, vecsize); + } } + + /* LD_VAR_BUF_FLAT* only support register formats F16 and F32. */ + assert( + arch < 14 || !flat || + (regfmt == BI_REGISTER_FORMAT_F16 || regfmt == BI_REGISTER_FORMAT_F32)); + bi_split_def(b, &intr->def); } diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 47ba6928e89..d25571ed34d 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -939,6 +939,32 @@ + + + + + Fetches a given flat varying from hardware buffer + + + + + + + + + + + + + Fetches a given flat varying from hardware buffer + + + + + + + + Interpolates a given varying from hardware buffer diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp index 0b0a7654437..0ac71cc2f4f 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2021 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. * SPDX-License-Identifier: MIT */ @@ -9,9 +10,9 @@ #include -#define CASE(instr, expected) \ +#define CASE_ARCH(instr, arch, expected) \ do { \ - uint64_t _value = va_pack_instr(instr, 10); \ + uint64_t _value = va_pack_instr(instr, arch); \ if (_value != expected) { \ fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, \ (uint64_t)expected); \ @@ -21,6 +22,8 @@ } \ } while (0) +#define CASE(instr, expected) CASE_ARCH(instr, 10, expected) + class ValhallPacking : public testing::Test { protected: ValhallPacking() @@ -278,11 +281,41 @@ TEST_F(ValhallPacking, LdVarBufImmF16) BI_VECSIZE_V4, 0), 0x005d80843300003d); - CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID, - BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, - BI_VECSIZE_V4, 8), - 0x005d80443308003d); + CASE_ARCH(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, + BI_SAMPLE_CENTROID, BI_SOURCE_FORMAT_F16, + BI_UPDATE_STORE, BI_VECSIZE_V4, 8), + 10, 0x005d80443308003d); + + CASE_ARCH(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, + BI_SAMPLE_CENTROID, BI_SOURCE_FORMAT_F16, + BI_UPDATE_STORE, BI_VECSIZE_V4, 8), + 11, 0x005d80443300083d); +} + +TEST_F(ValhallPacking, LdVarBufFlatImmFormat) +{ + CASE_ARCH(bi_ld_var_buf_flat_imm_to(b, bi_register(0), + BI_REGISTER_FORMAT_F32, + BI_VECSIZE_V4, 0x12), + 14, 0x0040800832001200); + + CASE_ARCH(bi_ld_var_buf_flat_imm_to(b, bi_register(0), + BI_REGISTER_FORMAT_F16, + BI_VECSIZE_V4, 0x12), + 14, 0x0040800433001200); +} + +TEST_F(ValhallPacking, LdVarBufFlat) +{ + CASE_ARCH(bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4), + 14, 0x005f80083200003d); + + 
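The opcode selection in bi_emit_load_var_buf() above reduces to one predicate. A pseudo-helper restating it (not compiler API; the mapping comment summarizes the branches in the diff):

```c
/* v14 drops the flat source formats from LD_VAR_BUF/LD_VAR_BUF_IMM, so
 * flat varyings must take the dedicated FLAT opcodes, which also drop the
 * sample/update/source-format controls. */
static bool
use_flat_ld_var_buf(unsigned arch, bool flat)
{
   return arch >= 14 && flat;
}
/* use_flat_ld_var_buf() && imm  -> LD_VAR_BUF_FLAT_IMM
 * use_flat_ld_var_buf() && !imm -> LD_VAR_BUF_FLAT
 * otherwise                     -> LD_VAR_BUF[_IMM], as before v14,
 * with the FLAT forms restricted to F16/F32 register formats (see the
 * assert above). */
```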
CASE_ARCH(bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4), + 14, 0x005f80043300003d); } TEST_F(ValhallPacking, LeaBufImm) diff --git a/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c b/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c index 6fc81ebbb12..2d5ca159bd3 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c +++ b/src/panfrost/compiler/bifrost/valhall/va_gather_hsr_info.c @@ -77,6 +77,8 @@ walk_bir_shader(bi_context *ctx, struct pan_shader_info *info) if (instr->sample == BI_SAMPLE_CENTROID) info->fs.hsr.centroid_interpolation = true; FALLTHROUGH; + case BI_OPCODE_LD_VAR_BUF_FLAT: + case BI_OPCODE_LD_VAR_BUF_FLAT_IMM: case BI_OPCODE_LD_VAR_FLAT: case BI_OPCODE_LD_VAR_FLAT_IMM: if (!found_atest) diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index 0790005e49a..d57a7119a37 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -568,6 +568,10 @@ va_pack_alu(const bi_instr *I, unsigned arch) hex |= ((uint64_t)I->sample) << 38; break; + case BI_OPCODE_LD_VAR_BUF_FLAT_IMM: + hex |= ((uint64_t)I->index) << 8; + break; + case BI_OPCODE_LD_ATTR_IMM: hex |= ((uint64_t)I->table) << 16; hex |= ((uint64_t)I->attribute_index) << 20; diff --git a/src/panfrost/compiler/pan_compiler.c b/src/panfrost/compiler/pan_compiler.c index ef384514061..9d3a7a79be0 100644 --- a/src/panfrost/compiler/pan_compiler.c +++ b/src/panfrost/compiler/pan_compiler.c @@ -52,6 +52,7 @@ pan_get_nir_shader_compiler_options(unsigned arch, bool merge_wg) case 11: case 12: case 13: + case 14: return merge_wg ? &bifrost_nir_options_v11_merge_wg : &bifrost_nir_options_v11; default: diff --git a/src/panfrost/genxml/cs_builder.h b/src/panfrost/genxml/cs_builder.h index a109f4d113b..ae0653a1f84 100644 --- a/src/panfrost/genxml/cs_builder.h +++ b/src/panfrost/genxml/cs_builder.h @@ -824,7 +824,11 @@ cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask) case MALI_CS_OPCODE_STORE_MULTIPLE: case MALI_CS_OPCODE_RUN_COMPUTE: case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: +#else case MALI_CS_OPCODE_RUN_FRAGMENT: +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: #if PAN_ARCH >= 12 case MALI_CS_OPCODE_RUN_IDVS2: @@ -1614,6 +1618,22 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable, } #endif +#if PAN_ARCH >= 14 +static inline void +cs_run_fragment2(struct cs_builder *b, bool enable_tem, + enum mali_tile_render_order tile_order) +{ + /* Staging regs */ + cs_flush_loads(b); + + b->req_resource_mask |= CS_FRAG_RES; + + cs_emit(b, RUN_FRAGMENT2, I) { + I.enable_tem = enable_tem; + I.tile_order = tile_order; + } +} +#else static inline void cs_run_fragment(struct cs_builder *b, bool enable_tem, enum mali_tile_render_order tile_order) @@ -1628,6 +1648,7 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem, I.tile_order = tile_order; } } +#endif static inline void cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override, @@ -2469,6 +2490,53 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx, (int16_t)(offsetof(struct cs_##__type##_trace, __field) - \ sizeof(struct cs_##__type##_trace)) +#if PAN_ARCH >= 14 +#define CS_RUN_FRAGMENT2_SR_COUNT 56 +#define CS_RUN_FRAGMENT2_SR_MASK BITFIELD64_RANGE(0, CS_RUN_FRAGMENT2_SR_COUNT) +struct cs_run_fragment2_trace { + uint64_t ip; + uint32_t 
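The va_pack.c hunk below places the LD_VAR_BUF_FLAT_IMM varying offset at bit 8, and the new test vectors above let us cross-check that. A standalone check, assuming the remaining bits of the base encoding are as in the F32 test case (the F16 vector 0x0040800433001200 follows the same pattern):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint64_t hex = 0x0040800832000000ull; /* F32 encoding, offset field zero */
   hex |= (uint64_t)0x12 << 8;           /* I->index, per va_pack_alu() */
   assert(hex == 0x0040800832001200ull); /* matches the test expectation */
   return 0;
}
```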
sr[CS_RUN_FRAGMENT2_SR_COUNT]; +} __attribute__((aligned(64))); + +static inline void +cs_trace_run_fragment2(struct cs_builder *b, const struct cs_tracing_ctx *ctx, + struct cs_index scratch_regs, bool enable_tem, + enum mali_tile_render_order tile_order) +{ + if (likely(!ctx->enabled)) { + cs_run_fragment2(b, enable_tem, tile_order); + return; + } + + struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); + struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); + + cs_trace_preamble(b, ctx, scratch_regs, + sizeof(struct cs_run_fragment2_trace)); + + /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP + * won't point to the right instruction. */ + cs_load_ip_to(b, data); + cs_run_fragment2(b, enable_tem, tile_order); + cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment2, ip)); + + ASSERTED unsigned sr_count = 0; + unsigned sr_offset = cs_trace_field_offset(run_fragment2, sr); + for (unsigned i = 0; i < CS_RUN_FRAGMENT2_SR_COUNT; i += 16) { + unsigned mask = (CS_RUN_FRAGMENT2_SR_MASK >> i) & BITFIELD_MASK(16); + if (!mask) + continue; + + cs_store(b, cs_reg_tuple(b, i, util_last_bit(mask)), tracebuf_addr, mask, + sr_offset); + sr_offset += util_bitcount(mask) * sizeof(uint32_t); + sr_count += util_bitcount(mask); + } + assert(sr_count == CS_RUN_FRAGMENT2_SR_COUNT); + + cs_flush_stores(b); +} +#else struct cs_run_fragment_trace { uint64_t ip; uint32_t sr[7]; @@ -2500,6 +2568,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx, cs_trace_field_offset(run_fragment, sr)); cs_flush_stores(b); } +#endif #if PAN_ARCH >= 13 #define CS_RUN_FULLSCREEN_SR_MASK \ diff --git a/src/panfrost/genxml/decode.c b/src/panfrost/genxml/decode.c index 38a2e696e4d..fc6068b5228 100644 --- a/src/panfrost/genxml/decode.c +++ b/src/panfrost/genxml/decode.c @@ -152,22 +152,22 @@ pandecode_rt(struct pandecode_context *ctx, unsigned index, uint64_t gpu_va) } -static void -pandecode_rts(struct pandecode_context *ctx, uint64_t gpu_va, - const struct MALI_FRAMEBUFFER_PARAMETERS *fb) +void +GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va, + uint32_t render_target_count) { pandecode_log(ctx, "Color Render Targets @%" PRIx64 ":\n", gpu_va); ctx->indent++; - for (int i = 0; i < (fb->render_target_count); i++) + for (int i = 0; i < render_target_count; i++) pandecode_rt(ctx, i, gpu_va); ctx->indent--; pandecode_log(ctx, "\n"); } -static void -pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va) +void +GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va) { const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR( ctx, zs_crc_packed, (uint64_t)gpu_va); @@ -223,22 +223,65 @@ pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va) #if PAN_ARCH >= 6 -static void -pandecode_sample_locations(struct pandecode_context *ctx, const void *fb) +void +GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx, + uint64_t dcd_pointer, unsigned pre_frame_0, + unsigned pre_frame_1, unsigned post_frame, + unsigned job_type_param, uint64_t gpu_id) { - pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params); + const unsigned dcd_size = pan_size(DRAW); - const uint16_t *PANDECODE_PTR_VAR(ctx, samples, params.sample_locations); + if (pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const struct mali_draw_packed *PANDECODE_PTR_VAR( + ctx, dcd, dcd_pointer + (0 * dcd_size)); + pan_unpack(dcd, DRAW, draw) + ; + pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", dcd_pointer, + 
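The staging-register dump loop in cs_trace_run_fragment2() above consumes the 56-register mask 16 registers at a time. A standalone check of the chunking arithmetic (plain C; __builtin_popcount stands in for Mesa's util_bitcount):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
   const uint64_t sr_mask = (1ull << 56) - 1; /* BITFIELD64_RANGE(0, 56) */
   unsigned total = 0;
   for (unsigned i = 0; i < 56; i += 16) {
      unsigned mask = (unsigned)(sr_mask >> i) & 0xffff;
      /* i = 0, 16, 32 -> 0xffff (16 regs); i = 48 -> 0x00ff (8 regs) */
      total += __builtin_popcount(mask);
   }
   assert(total == 56); /* mirrors the sr_count assert in the builder */
   return 0;
}
```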
pre_frame_0); + ctx->indent++; + GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); + ctx->indent--; + } - pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", - params.sample_locations); + if (pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const struct mali_draw_packed *PANDECODE_PTR_VAR( + ctx, dcd, dcd_pointer + (1 * dcd_size)); + pan_unpack(dcd, DRAW, draw) + ; + pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n", + dcd_pointer + (1 * dcd_size)); + ctx->indent++; + GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); + ctx->indent--; + } + + if (post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const struct mali_draw_packed *PANDECODE_PTR_VAR( + ctx, dcd, dcd_pointer + (2 * dcd_size)); + pan_unpack(dcd, DRAW, draw) + ; + pandecode_log(ctx, "Post frame:\n"); + ctx->indent++; + GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); + ctx->indent--; + } +} + +void +GENX(pandecode_sample_locations)(struct pandecode_context *ctx, + uint64_t sample_locations) +{ + const uint16_t *PANDECODE_PTR_VAR(ctx, samples, sample_locations); + + pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", sample_locations); for (int i = 0; i < 33; i++) { pandecode_log(ctx, " (%d, %d),\n", samples[2 * i] - 128, samples[2 * i + 1] - 128); } } -#endif +#endif /* PAN_ARCH >= 6 */ +#if PAN_ARCH < 14 struct pandecode_fbd GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, bool is_fragment, uint64_t gpu_id) @@ -248,46 +291,17 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, DUMP_UNPACKED(ctx, FRAMEBUFFER_PARAMETERS, params, "Parameters:\n"); #if PAN_ARCH >= 6 - pandecode_sample_locations(ctx, fb); + GENX(pandecode_sample_locations)(ctx, params.sample_locations); - unsigned dcd_size = pan_size(DRAW); unsigned job_type_param = 0; #if PAN_ARCH <= 9 job_type_param = MALI_JOB_TYPE_FRAGMENT; #endif - if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const struct mali_draw_packed *PANDECODE_PTR_VAR( - ctx, dcd, params.frame_shader_dcds + (0 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", - params.frame_shader_dcds, params.pre_frame_0); - ctx->indent++; - GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); - ctx->indent--; - } - - if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const struct mali_draw_packed *PANDECODE_PTR_VAR( - ctx, dcd, params.frame_shader_dcds + (1 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n", - params.frame_shader_dcds + (1 * dcd_size)); - ctx->indent++; - GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); - ctx->indent--; - } - - if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const struct mali_draw_packed *PANDECODE_PTR_VAR( - ctx, dcd, params.frame_shader_dcds + (2 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log(ctx, "Post frame:\n"); - ctx->indent++; - GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id); - ctx->indent--; - } + GENX(pandecode_frame_shader_dcds)(ctx, params.frame_shader_dcds, + params.pre_frame_0, params.pre_frame_1, + params.post_frame, job_type_param, gpu_id); #else DUMP_SECTION(ctx, FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); @@ -312,13 +326,13 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, gpu_va += pan_size(FRAMEBUFFER); if (params.has_zs_crc_extension) { - pandecode_zs_crc_ext(ctx, gpu_va); + GENX(pandecode_zs_crc_ext)(ctx, gpu_va); gpu_va += pan_size(ZS_CRC_EXTENSION); } if 
(is_fragment) - pandecode_rts(ctx, gpu_va, ¶ms); + GENX(pandecode_rts)(ctx, gpu_va, params.render_target_count); return (struct pandecode_fbd){ .rt_count = params.render_target_count, @@ -336,6 +350,7 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va, }; #endif } +#endif /* PAN_ARCH < 14 */ #if PAN_ARCH >= 5 uint64_t diff --git a/src/panfrost/genxml/decode.h b/src/panfrost/genxml/decode.h index f7d83ca5525..47fe28f798f 100644 --- a/src/panfrost/genxml/decode.h +++ b/src/panfrost/genxml/decode.h @@ -132,6 +132,13 @@ void pandecode_cs_binary_v13(struct pandecode_context *ctx, uint64_t bin, void pandecode_cs_trace_v13(struct pandecode_context *ctx, uint64_t trace, uint32_t trace_size, uint64_t gpu_id); +void pandecode_interpret_cs_v14(struct pandecode_context *ctx, uint64_t queue, + uint32_t size, uint64_t gpu_id, uint32_t *regs); +void pandecode_cs_binary_v14(struct pandecode_context *ctx, uint64_t bin, + uint32_t bin_size); +void pandecode_cs_trace_v14(struct pandecode_context *ctx, uint64_t trace, + uint32_t trace_size, uint64_t gpu_id); + /* Logging infrastructure */ static void pandecode_make_indent(struct pandecode_context *ctx) @@ -275,4 +282,22 @@ void GENX(pandecode_depth_stencil)(struct pandecode_context *ctx, #endif +#if PAN_ARCH >= 6 +void GENX(pandecode_sample_locations)(struct pandecode_context *ctx, + uint64_t sample_locations); + +void + GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx, + uint64_t dcd_pointer, unsigned pre_frame_0, + unsigned pre_frame_1, unsigned post_frame, + unsigned job_type_param, uint64_t gpu_id); +#endif + +#if PAN_ARCH >= 5 +void GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va, + uint32_t render_target_count); + +void GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va); +#endif + #endif /* __MMAP_TRACE_H__ */ diff --git a/src/panfrost/genxml/decode_common.c b/src/panfrost/genxml/decode_common.c index 208d28a8cb5..399fec9f335 100644 --- a/src/panfrost/genxml/decode_common.c +++ b/src/panfrost/genxml/decode_common.c @@ -423,6 +423,9 @@ pandecode_interpret_cs(struct pandecode_context *ctx, uint64_t queue_gpu_va, case 13: pandecode_interpret_cs_v13(ctx, queue_gpu_va, size, gpu_id, regs); break; + case 14: + pandecode_interpret_cs_v14(ctx, queue_gpu_va, size, gpu_id, regs); + break; default: UNREACHABLE("Unsupported architecture"); } @@ -446,6 +449,9 @@ pandecode_cs_binary(struct pandecode_context *ctx, uint64_t bin_gpu_va, case 13: pandecode_cs_binary_v13(ctx, bin_gpu_va, size); break; + case 14: + pandecode_cs_binary_v14(ctx, bin_gpu_va, size); + break; default: UNREACHABLE("Unsupported architecture"); } @@ -469,6 +475,9 @@ pandecode_cs_trace(struct pandecode_context *ctx, uint64_t trace_gpu_va, case 13: pandecode_cs_trace_v13(ctx, trace_gpu_va, size, gpu_id); break; + case 14: + pandecode_cs_trace_v14(ctx, trace_gpu_va, size, gpu_id); + break; default: UNREACHABLE("Unsupported architecture"); } diff --git a/src/panfrost/genxml/decode_csf.c b/src/panfrost/genxml/decode_csf.c index ca3b4807950..b196c98943a 100644 --- a/src/panfrost/genxml/decode_csf.c +++ b/src/panfrost/genxml/decode_csf.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2022-2023 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
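The pandecode_frame_shader_dcds() helper factored out above walks three consecutive DRAW descriptors, each gated by its own MALI_PRE_POST_FRAME_SHADER_MODE value (pre-frame 0, pre-frame 1, post-frame). The address math, as a standalone sketch where dcd_size stands in for pan_size(DRAW):

```c
#include <stdint.h>

static uint64_t
frame_shader_dcd(uint64_t dcd_pointer, uint64_t dcd_size,
                 unsigned which /* 0, 1 = pre-frame; 2 = post-frame */)
{
   return dcd_pointer + (uint64_t)which * dcd_size;
}
```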
* SPDX-License-Identifier: MIT */ @@ -89,6 +90,12 @@ static const char *defer_modes_str[] = { #define defer_mode_str(I) "" #endif +#if PAN_ARCH <= 13 +#define assert_no_progress_inc(I) assert(!I.progress_increment) +#else +#define assert_no_progress_inc(I) do {} while (0) +#endif + static void print_cs_instr(FILE *fp, const uint64_t *instr) { @@ -117,28 +124,27 @@ print_cs_instr(FILE *fp, const uint64_t *instr) case MALI_CS_OPCODE_WAIT: { cs_unpack(instr, CS_WAIT, I); - fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "", - I.wait_mask); + assert_no_progress_inc(I); + fprintf(fp, "WAIT #%x", I.wait_mask); break; } case MALI_CS_OPCODE_RUN_COMPUTE: { const char *axes[4] = {"x_axis", "y_axis", "z_axis"}; cs_unpack(instr, CS_RUN_COMPUTE, I); + assert_no_progress_inc(I); /* Print the instruction. Ignore the selects and the flags override * since we'll print them implicitly later. */ #if PAN_ARCH >= 12 - fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u", - I.progress_increment ? ".progress_inc" : "", axes[I.task_axis], - I.srt_select, I.spd_select, I.tsd_select, I.fau_select, - I.task_increment, I.ep_limit); + fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u", + axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select, + I.fau_select, I.task_increment, I.ep_limit); #else - fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u", - I.progress_increment ? ".progress_inc" : "", axes[I.task_axis], - I.srt_select, I.spd_select, I.tsd_select, I.fau_select, - I.task_increment); + fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u", + axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select, + I.fau_select, I.task_increment); #endif break; } @@ -146,8 +152,8 @@ print_cs_instr(FILE *fp, const uint64_t *instr) #if PAN_ARCH == 10 case MALI_CS_OPCODE_RUN_TILING: { cs_unpack(instr, CS_RUN_TILING, I); - fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d", - I.progress_increment ? ".progress_inc" : "", I.srt_select, + assert_no_progress_inc(I); + fprintf(fp, "RUN_TILING.srt%d.spd%d.tsd%d.fau%d", I.srt_select, I.spd_select, I.tsd_select, I.fau_select); break; } @@ -156,10 +162,10 @@ print_cs_instr(FILE *fp, const uint64_t *instr) #if PAN_ARCH < 12 case MALI_CS_OPCODE_RUN_IDVS: { cs_unpack(instr, CS_RUN_IDVS, I); + assert_no_progress_inc(I); fprintf( fp, - "RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64, - I.progress_increment ? ".progress_inc" : "", + "RUN_IDVS%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64, I.malloc_enable ? "" : ".no_malloc", I.draw_id_register_enable ? ".draw_id_enable" : "", I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select, @@ -170,6 +176,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) #else case MALI_CS_OPCODE_RUN_IDVS2: { cs_unpack(instr, CS_RUN_IDVS2, I); + assert_no_progress_inc(I); const char *vertex_shading_str[] = { ".early", @@ -178,8 +185,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr) ".INVALID", }; - fprintf(fp, "RUN_IDVS2%s%s%s%s r%u, #%" PRIx64, - I.progress_increment ? ".progress_inc" : "", + fprintf(fp, "RUN_IDVS2%s%s%s r%u, #%" PRIx64, I.malloc_enable ? "" : ".no_malloc", I.draw_id_register_enable ? 
".draw_id_enable" : "", vertex_shading_str[I.vertex_shading_mode], I.draw_id, @@ -317,32 +323,37 @@ print_cs_instr(FILE *fp, const uint64_t *instr) case MALI_CS_OPCODE_SHARED_SB_INC: { cs_unpack(instr, CS_SHARED_SB_INC, I); - - const char *progress_increment_name[] = { - ".no_increment", - ".increment", - }; - - fprintf(fp, "SHARED_SB_INC%s%s #%u, #%u", - progress_increment_name[I.progress_increment], - defer_mode_str(I), I.sb_mask, I.shared_entry); + assert_no_progress_inc(I); + fprintf(fp, "SHARED_SB_INC%s #%u, #%u", defer_mode_str(I), I.sb_mask, + I.shared_entry); break; } case MALI_CS_OPCODE_SHARED_SB_DEC: { cs_unpack(instr, CS_SHARED_SB_DEC, I); - - const char *progress_increment_name[] = { - ".no_increment", - ".increment", - }; - - fprintf(fp, "SHARED_SB_DEC%s #%u", - progress_increment_name[I.progress_increment], I.shared_entry); + assert_no_progress_inc(I); + fprintf(fp, "SHARED_SB_DEC #%u", I.shared_entry); break; } #endif +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: { + static const char *tile_order[] = { + "zorder", "horizontal", "vertical", "unknown", + "unknown", "rev_horizontal", "rev_vertical", "unknown", + "unknown", "unknown", "unknown", "unknown", + "unknown", "unknown", "unknown", "unknown", + }; + + cs_unpack(instr, CS_RUN_FRAGMENT2, I); + + fprintf(fp, "RUN_FRAGMENT2%s.tile_order=%s", + I.enable_tem ? ".tile_enable_map_enable" : "", + tile_order[I.tile_order]); + break; + } +#else case MALI_CS_OPCODE_RUN_FRAGMENT: { static const char *tile_order[] = { "zorder", "horizontal", "vertical", "unknown", @@ -350,27 +361,27 @@ print_cs_instr(FILE *fp, const uint64_t *instr) "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", }; - cs_unpack(instr, CS_RUN_FRAGMENT, I); - fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s", - I.progress_increment ? ".progress_inc" : "", + cs_unpack(instr, CS_RUN_FRAGMENT, I); + assert_no_progress_inc(I); + fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s", I.enable_tem ? ".tile_enable_map_enable" : "", tile_order[I.tile_order]); break; } +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: { cs_unpack(instr, CS_RUN_FULLSCREEN, I); - fprintf(fp, "RUN_FULLSCREEN%s r%u, #%" PRIx64, - I.progress_increment ? ".progress_inc" : "", I.dcd, - I.flags_override); + assert_no_progress_inc(I); + fprintf(fp, "RUN_FULLSCREEN r%u, #%" PRIx64, I.dcd, I.flags_override); break; } case MALI_CS_OPCODE_FINISH_TILING: { cs_unpack(instr, CS_FINISH_TILING, I); - fprintf(fp, "FINISH_TILING%s", - I.progress_increment ? 
".progress_inc" : ""); + assert_no_progress_inc(I); + fprintf(fp, "FINISH_TILING"); break; } @@ -443,12 +454,6 @@ print_cs_instr(FILE *fp, const uint64_t *instr) break; } - case MALI_CS_OPCODE_PROGRESS_WAIT: { - cs_unpack(instr, CS_PROGRESS_WAIT, I); - fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue); - break; - } - case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: { cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I); fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length); @@ -547,29 +552,17 @@ print_cs_instr(FILE *fp, const uint64_t *instr) break; } - case MALI_CS_OPCODE_PROGRESS_STORE: { - cs_unpack(instr, CS_PROGRESS_STORE, I); - fprintf(fp, "PROGRESS_STORE d%u", I.source); - break; - } - - case MALI_CS_OPCODE_PROGRESS_LOAD: { - cs_unpack(instr, CS_PROGRESS_LOAD, I); - fprintf(fp, "PROGRESS_LOAD d%u", I.destination); - break; - } - case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: { cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I); + assert_no_progress_inc(I); #if PAN_ARCH >= 12 - fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u", - I.progress_increment ? ".progress_inc" : "", I.srt_select, - I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task, - I.ep_limit); + fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u, #%u", + I.srt_select, I.spd_select, I.tsd_select, I.fau_select, + I.workgroups_per_task, I.ep_limit); #else - fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u", - I.progress_increment ? ".progress_inc" : "", I.srt_select, - I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task); + fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u", + I.srt_select, I.spd_select, I.tsd_select, I.fau_select, + I.workgroups_per_task); #endif break; @@ -1097,6 +1090,99 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp, } #endif +#if PAN_ARCH >= 14 +static void +pandecode_run_fragment2(struct pandecode_context *ctx, FILE *fp, + struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT2 *I) +{ + if (qctx->in_exception_handler) + return; + + ctx->indent++; + + pandecode_log(ctx, "Iter trace ID0: %" PRIu32 "\n", + cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID0)); + pandecode_log(ctx, "Iter trace ID1: %" PRIu32 "\n", + cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID1)); + pandecode_log(ctx, "TEM pointer: %" PRIx64 "\n", + cs_get_u64(qctx, MALI_FRAGMENT_SR_TEM_POINTER)); + pandecode_log(ctx, "TEM row stride: %" PRIu32 "\n", + cs_get_u32(qctx, MALI_FRAGMENT_SR_TEM_ROW_STRIDE)); + + for (unsigned i = 0; i < 11; ++i) { + const unsigned reg = MALI_FRAGMENT_SR_IRD_BUFFER_POINTER_0 + (i * 2); + pandecode_log(ctx, "IRD buffer pointer %u: %" PRIx64 "\n", i, + cs_get_u64(qctx, reg)); + } + + DUMP_CL(ctx, FRAGMENT_FLAGS_3, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_3], + "Flags 3:\n"); + DUMP_CL(ctx, FRAGMENT_BOUNDING_BOX, &qctx->regs[MALI_FRAGMENT_SR_BBOX_MIN], + "Bounding Box:\n"); + DUMP_CL(ctx, FRAME_SIZE, &qctx->regs[MALI_FRAGMENT_SR_FRAME_SIZE], + "Frame size:\n"); + + pan_unpack((const struct mali_fragment_flags_0_packed *)&qctx + ->regs[MALI_FRAGMENT_SR_FLAGS_0], + FRAGMENT_FLAGS_0, flags0_unpacked); + DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_0, flags0_unpacked, "Flags 0:\n"); + + pan_unpack((const struct mali_fragment_flags_1_packed *)&qctx + ->regs[MALI_FRAGMENT_SR_FLAGS_1], + FRAGMENT_FLAGS_1, flags1_unpacked); + DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_1, flags1_unpacked, "Flags 1:\n"); + + DUMP_CL(ctx, FRAGMENT_FLAGS_2, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_2], + "Flags 2:\n"); + pandecode_log(ctx, "Z clear: %f\n", + uif(cs_get_u32(qctx, 
MALI_FRAGMENT_SR_Z_CLEAR))); + + const uint64_t tiler_pointer = + cs_get_u64(qctx, MALI_FRAGMENT_SR_TILER_DESCRIPTOR_POINTER); + pandecode_log(ctx, "Tiler descriptor pointer: 0x%" PRIx64 "\n", + tiler_pointer); + + const uint64_t rtd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_RTD_POINTER); + pandecode_log(ctx, "RTD pointer: 0x%" PRIx64 "\n", rtd_pointer); + + const uint64_t dbd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_DBD_POINTER); + pandecode_log(ctx, "DBD pointer: 0x%" PRIx64 "\n", dbd_pointer); + + pandecode_log(ctx, "Frame argument: %" PRIx64 "\n", + cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_ARG)); + + const uint64_t sample_locations = + cs_get_u64(qctx, MALI_FRAGMENT_SR_SAMPLE_POSITION_ARRAY_POINTER); + pandecode_log(ctx, "Sample locations: 0x%" PRIx64 "\n", sample_locations); + + const uint64_t dcd_pointer = + cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_SHADER_DCD_POINTER); + pandecode_log(ctx, "Frame shader DCD pointer: 0x%" PRIx64 "\n", dcd_pointer); + + DUMP_CL(ctx, VRS_IMAGE, &qctx->regs[MALI_FRAGMENT_SR_VRS_IMAGE], + "VRS image:\n"); + + GENX(pandecode_sample_locations)(ctx, sample_locations); + + const unsigned job_type_param = 0; + GENX(pandecode_frame_shader_dcds)(ctx, dcd_pointer, + flags0_unpacked.pre_frame_0, + flags0_unpacked.pre_frame_1, + flags0_unpacked.post_frame, + job_type_param, qctx->gpu_id); + + if (tiler_pointer) + GENX(pandecode_tiler)(ctx, tiler_pointer); + + if (dbd_pointer) + GENX(pandecode_zs_crc_ext)(ctx, dbd_pointer); + + if (rtd_pointer) + GENX(pandecode_rts)(ctx, rtd_pointer, flags1_unpacked.render_target_count); + + ctx->indent--; +} +#else static void pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp, struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I) @@ -1115,6 +1201,7 @@ pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp, ctx->indent--; } +#endif /* PAN_ARCH >= 14 */ static void pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp, @@ -1261,11 +1348,19 @@ interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx) } #endif +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: { + cs_unpack(bytes, CS_RUN_FRAGMENT2, I); + pandecode_run_fragment2(ctx, fp, qctx, &I); + break; + } +#else case MALI_CS_OPCODE_RUN_FRAGMENT: { cs_unpack(bytes, CS_RUN_FRAGMENT, I); pandecode_run_fragment(ctx, fp, qctx, &I); break; } +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: { cs_unpack(bytes, CS_RUN_FULLSCREEN, I); @@ -2192,18 +2287,6 @@ collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg, break; } - case MALI_CS_OPCODE_PROGRESS_LOAD: { - cs_unpack(instr, CS_PROGRESS_LOAD, I); - for (unsigned i = 0; i < 16; i++) { - if (BITSET_TEST(track_map, I.destination) || - BITSET_TEST(track_map, I.destination + 1)) { - ibranch->has_unknown_targets = true; - return; - } - } - break; - } - default: break; } @@ -2430,7 +2513,12 @@ print_cs_binary(struct pandecode_context *ctx, uint64_t bin, #else case MALI_CS_OPCODE_RUN_IDVS: #endif + +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: +#else case MALI_CS_OPCODE_RUN_FRAGMENT: +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: case MALI_CS_OPCODE_RUN_COMPUTE: case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: @@ -2539,6 +2627,19 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace, } #endif +#if PAN_ARCH >= 14 + case MALI_CS_OPCODE_RUN_FRAGMENT2: { + struct cs_run_fragment2_trace *frag_trace = trace_data; + + assert(trace_size >= sizeof(*frag_trace)); + cs_unpack(instr, CS_RUN_FRAGMENT2, I); + memcpy(®s[0], frag_trace->sr, sizeof(frag_trace->sr)); + 
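The Z clear decode above recovers a float from raw register bits: the emit side stores it with util_bitpack_float() (see pan_desc.c below) and the decoder reads it back with uif(). A minimal round-trip sketch with local stand-ins for those Mesa util helpers:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint32_t fui_bits(float f)    { uint32_t u; memcpy(&u, &f, 4); return u; }
static float    uif_bits(uint32_t u) { float f; memcpy(&f, &u, 4); return f; }

int main(void)
{
   assert(fui_bits(1.0f) == 0x3f800000u);       /* IEEE-754 single */
   assert(uif_bits(fui_bits(0.5f)) == 0.5f);    /* lossless round trip */
   return 0;
}
```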
pandecode_run_fragment2(ctx, ctx->dump_stream, &qctx, &I); + trace_data = frag_trace + 1; + trace_size -= sizeof(*frag_trace); + break; + } +#else case MALI_CS_OPCODE_RUN_FRAGMENT: { struct cs_run_fragment_trace *frag_trace = trace_data; @@ -2550,6 +2651,7 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace, trace_size -= sizeof(*frag_trace); break; } +#endif case MALI_CS_OPCODE_RUN_FULLSCREEN: { struct cs_run_fullscreen_trace *fs_trace = trace_data; diff --git a/src/panfrost/genxml/gen_macros.h b/src/panfrost/genxml/gen_macros.h index b9e856f8533..c1e8ab1fbae 100644 --- a/src/panfrost/genxml/gen_macros.h +++ b/src/panfrost/genxml/gen_macros.h @@ -61,6 +61,9 @@ #elif (PAN_ARCH == 13) #define GENX(X) X##_v13 #include "genxml/v13_pack.h" +#elif (PAN_ARCH == 14) +#define GENX(X) X##_v14 +#include "genxml/v14_pack.h" #else #error "Need to add suffixing macro for this architecture" #endif diff --git a/src/panfrost/genxml/meson.build b/src/panfrost/genxml/meson.build index 3712b84822d..ee4b4adea3f 100644 --- a/src/panfrost/genxml/meson.build +++ b/src/panfrost/genxml/meson.build @@ -3,7 +3,7 @@ # SPDX-License-Identifier: MIT pan_packers = [] -foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13'] +foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14'] pan_packers += custom_target( packer + '_pack.h', input : ['gen_pack.py', packer + '.xml'], @@ -20,7 +20,7 @@ idep_pan_packers = declare_dependency( libpanfrost_decode_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_decode_per_arch += static_library( 'pandecode-arch-v' + ver, ['decode.c', 'decode_jm.c', 'decode_csf.c', pan_packers], diff --git a/src/panfrost/genxml/v10.xml b/src/panfrost/genxml/v10.xml index 2fd4bb86637..95204c4a496 100644 --- a/src/panfrost/genxml/v10.xml +++ b/src/panfrost/genxml/v10.xml @@ -1,5 +1,6 @@ @@ -84,6 +85,7 @@ + @@ -132,6 +134,7 @@ + @@ -1163,6 +1166,13 @@ + + + + + + diff --git a/src/panfrost/genxml/v12.xml b/src/panfrost/genxml/v12.xml index 0d651f01b0d..e3716030601 100644 --- a/src/panfrost/genxml/v12.xml +++ b/src/panfrost/genxml/v12.xml @@ -1,5 +1,6 @@ @@ -84,6 +85,7 @@ + @@ -132,6 +134,7 @@ + @@ -1426,6 +1429,9 @@ + + + diff --git a/src/panfrost/genxml/v13.xml b/src/panfrost/genxml/v13.xml index c644d2bd49c..30285e4c351 100644 --- a/src/panfrost/genxml/v13.xml +++ b/src/panfrost/genxml/v13.xml @@ -1,5 +1,6 @@ @@ -84,6 +85,7 @@ + @@ -132,6 +134,7 @@ + @@ -1728,6 +1731,9 @@ + + + diff --git a/src/panfrost/genxml/v14.xml b/src/panfrost/genxml/v14.xml new file mode 100644 index 00000000000..30768156967 --- /dev/null +++ b/src/panfrost/genxml/v14.xml @@ -0,0 +1,2755 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/panfrost/genxml/v9.xml b/src/panfrost/genxml/v9.xml index d5bc4c1e110..3935d4dea99 100644 --- a/src/panfrost/genxml/v9.xml +++ b/src/panfrost/genxml/v9.xml @@ -1,5 +1,6 @@ @@ -103,6 +104,7 @@ + diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build index 8c5b3d5537d..a4572db619c 100644 --- a/src/panfrost/lib/meson.build +++ b/src/panfrost/lib/meson.build @@ -4,7 +4,7 @@ subdir('kmod') -pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13'] +pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_pixel_format = [] deps_for_libpanfrost = [dep_libdrm, idep_pan_packers, idep_mesautil, libpanfrost_model_dep] @@ -22,7 +22,7 @@ endforeach libpanfrost_per_arch = [] -foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13'] +foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14'] libpanfrost_per_arch += static_library( 'pan-arch-v' + ver, [ diff --git a/src/panfrost/lib/pan_afbc.h b/src/panfrost/lib/pan_afbc.h index 035b77011b5..f0328a0ba44 100644 --- a/src/panfrost/lib/pan_afbc.h +++ b/src/panfrost/lib/pan_afbc.h @@ -3,6 +3,7 @@ * Copyright (C) 2014 Broadcom * Copyright (C) 2018-2019 Alyssa Rosenzweig * Copyright (C) 2019-2020 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
* SPDX-License-Identifier: MIT */ @@ -711,6 +712,32 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode) case PAN_AFBC_MODE_R16G16B16A16: return MALI_AFBC_COMPRESSION_MODE_R16G16B16A16; #endif +#if PAN_ARCH >= 14 + case PAN_AFBC_MODE_YUV420_6C8: + return MALI_AFBC_COMPRESSION_MODE_Y8U8V8_420; + case PAN_AFBC_MODE_YUV420_2C8: + return MALI_AFBC_COMPRESSION_MODE_R8G8; + case PAN_AFBC_MODE_YUV420_1C8: + return MALI_AFBC_COMPRESSION_MODE_R8; + case PAN_AFBC_MODE_YUV420_6C10: + return MALI_AFBC_COMPRESSION_MODE_Y10U10V10_420; + case PAN_AFBC_MODE_YUV420_2C10: + return MALI_AFBC_COMPRESSION_MODE_R10G10; + case PAN_AFBC_MODE_YUV420_1C10: + return MALI_AFBC_COMPRESSION_MODE_R10; + case PAN_AFBC_MODE_YUV422_4C8: + return MALI_AFBC_COMPRESSION_MODE_Y8U8Y8V8_422; + case PAN_AFBC_MODE_YUV422_2C8: + return MALI_AFBC_COMPRESSION_MODE_R8G8; + case PAN_AFBC_MODE_YUV422_1C8: + return MALI_AFBC_COMPRESSION_MODE_R8; + case PAN_AFBC_MODE_YUV422_4C10: + return MALI_AFBC_COMPRESSION_MODE_Y10U10Y10V10_422; + case PAN_AFBC_MODE_YUV422_2C10: + return MALI_AFBC_COMPRESSION_MODE_R10G10; + case PAN_AFBC_MODE_YUV422_1C10: + return MALI_AFBC_COMPRESSION_MODE_R10; +#else case PAN_AFBC_MODE_YUV420_6C8: return MALI_AFBC_COMPRESSION_MODE_YUV420_6C8; case PAN_AFBC_MODE_YUV420_2C8: @@ -735,6 +762,7 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode) return MALI_AFBC_COMPRESSION_MODE_YUV422_2C10; case PAN_AFBC_MODE_YUV422_1C10: return MALI_AFBC_COMPRESSION_MODE_YUV422_1C10; +#endif /* PAN_ARCH >= 14 */ #if PAN_ARCH == 9 case PAN_AFBC_MODE_R16: case PAN_AFBC_MODE_R16G16: diff --git a/src/panfrost/lib/pan_afrc.h b/src/panfrost/lib/pan_afrc.h index 4a96eb374ea..306e48fb55e 100644 --- a/src/panfrost/lib/pan_afrc.h +++ b/src/panfrost/lib/pan_afrc.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2023 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. * SPDX-License-Identifier: MIT */ @@ -347,6 +348,25 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier, return (scan ? MALI_AFRC_FORMAT_R10G10B10A10_SCAN : MALI_AFRC_FORMAT_R10G10B10A10_ROT); +#if PAN_ARCH >= 14 + case PAN_AFRC_ICHANGE_FORMAT_YUV444: + case PAN_AFRC_ICHANGE_FORMAT_YUV422: + case PAN_AFRC_ICHANGE_FORMAT_YUV420: + if (info.bpc == 8) { + if (plane == 0 || info.num_planes == 3) + return (scan ? MALI_AFRC_FORMAT_R8_SCAN : MALI_AFRC_FORMAT_R8_ROT); + + return (scan ? MALI_AFRC_FORMAT_R8G8_SCAN : MALI_AFRC_FORMAT_R8G8_ROT); + } + + if (plane == 0 || info.num_planes == 3) + return (scan ? MALI_AFRC_FORMAT_R10_SCAN : MALI_AFRC_FORMAT_R10_ROT); + + assert(info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV422 || + info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV420); + return (scan ? MALI_AFRC_FORMAT_R10G10_SCAN + : MALI_AFRC_FORMAT_R10G10_ROT); +#else case PAN_AFRC_ICHANGE_FORMAT_YUV444: if (info.bpc == 8) { if (plane == 0 || info.num_planes == 3) @@ -394,6 +414,7 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier, return (scan ? MALI_AFRC_FORMAT_R10G10_420_SCAN : MALI_AFRC_FORMAT_R10G10_420_ROT); +#endif /* PAN_ARCH >= 14 */ default: return MALI_AFRC_FORMAT_INVALID; diff --git a/src/panfrost/lib/pan_desc.c b/src/panfrost/lib/pan_desc.c index 3df01de0090..cf9f08aae5b 100644 --- a/src/panfrost/lib/pan_desc.c +++ b/src/panfrost/lib/pan_desc.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2021 Collabora, Ltd. + * Copyright (C) 2026 Arm Ltd. 
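A spot-check of the v14 AFBC mapping above: AFBC YUV images are stored plane by plane, and on v14 each plane reuses the generic R8/R8G8/R10/R10G10 superblock layouts by component count instead of the dedicated YUV modes. For a two-plane 8-bit YUV420 image (luma plane plus interleaved chroma plane), compiled in a PAN_ARCH >= 14 translation unit, the mapping in the diff gives:

```c
/* 1-component luma plane */
assert(pan_afbc_compression_mode(PAN_AFBC_MODE_YUV420_1C8) ==
       MALI_AFBC_COMPRESSION_MODE_R8);
/* 2-component interleaved chroma plane */
assert(pan_afbc_compression_mode(PAN_AFBC_MODE_YUV420_2C8) ==
       MALI_AFBC_COMPRESSION_MODE_R8G8);
```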
diff --git a/src/panfrost/lib/pan_desc.c b/src/panfrost/lib/pan_desc.c
index 3df01de0090..cf9f08aae5b 100644
--- a/src/panfrost/lib/pan_desc.c
+++ b/src/panfrost/lib/pan_desc.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2021 Collabora, Ltd.
+ * Copyright (C) 2026 Arm Ltd.
  * SPDX-License-Identifier: MIT
  */
@@ -11,6 +12,7 @@
 #include "pan_afrc.h"
 #include "pan_desc.h"
 #include "pan_encoder.h"
+#include "pan_fb.h"
 #include "pan_props.h"
 #include "pan_texture.h"
 #include "pan_trace.h"
@@ -1172,11 +1174,156 @@ check_fb_attachments(const struct pan_fb_info *fb)
 #endif
 }
 
+#if PAN_ARCH >= 14
 unsigned
 GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
                    const struct pan_tls_info *tls,
-                   const struct pan_tiler_context *tiler_ctx, void *out)
+                   const struct pan_tiler_context *tiler_ctx,
+                   const struct pan_ptr framebuffer)
 {
+   void *out = framebuffer.cpu;
+
+   PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
+
+   check_fb_attachments(fb);
+
+   const int crc_rt = GENX(pan_select_crc_rt)(fb, fb->tile_size);
+   const bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0);
+   const struct pan_clean_tile clean_tile = pan_get_clean_tile_info(fb);
+
+   /* Emit to memory the state that might change per-layer. The static
+    * state is emitted directly to CSF registers by
+    * cs_emit_static_fragment_state().
+    */
+
+   struct pan_fbd_layer fbd_data = {0};
+   fbd_data.tiler = tiler_ctx->valhall.desc;
+
+   /* internal_layer_index in flags0 is used to select the right
+    * primitive list in the tiler context, and frame_arg is the value
+    * that's passed to the fragment shader through r62-r63, which we use
+    * to pass gl_Layer. Since the layer_idx only takes 8 bits, we might
+    * use the extra 56 bits we have in frame_argument to pass other
+    * information to the fragment shader at some point.
+    */
+   assert(layer_idx >= tiler_ctx->valhall.layer_offset);
+   fbd_data.frame_argument = layer_idx;
+
+   pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
+      cfg.pre_frame_0 =
+         pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0],
+                                   pan_clean_tile_write_any_set(clean_tile));
+      cfg.pre_frame_1 =
+         pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1],
+                                   pan_clean_tile_write_any_set(clean_tile));
+      cfg.post_frame = fb->bifrost.pre_post.modes[2];
+
+      const unsigned zs_bytes_per_pixel = pan_zsbuf_bytes_per_pixel(fb);
+      /* We can interleave HSR if we have space for two ZS tiles in
+       * the tile buffer. */
+      const unsigned max_zs_tile_size_interleave =
+         fb->z_tile_buf_budget >> util_logbase2_ceil(zs_bytes_per_pixel);
+      const bool hsr_can_interleave =
+         fb->tile_size <= max_zs_tile_size_interleave;
+
+      /* Enabling prepass without interleave is generally not good for
+       * performance, so disable HSR in that case. */
+      cfg.hsr_prepass_enable = fb->allow_hsr_prepass && hsr_can_interleave;
+      cfg.hsr_prepass_interleaving_enable = hsr_can_interleave;
+      cfg.hsr_prepass_filter_enable = true;
+      cfg.hsr_hierarchical_optimizations_enable = true;
+
+      cfg.internal_layer_index = layer_idx - tiler_ctx->valhall.layer_offset;
+   }
+
+   fbd_data.dcd_pointer = fb->bifrost.pre_post.dcds.gpu;
+
+   pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
+      cfg.s_clear = fb->zs.clear_value.stencil;
+      cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s);
+
+      /* Default to 24-bit depth if there's no surface. */
+      cfg.z_internal_format =
+         fb->zs.view.zs ? pan_get_z_internal_format(fb->zs.view.zs->format)
+                        : MALI_Z_INTERNAL_FORMAT_D24;
+      cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z);
+
+      if (crc_rt >= 0) {
+         bool *valid = fb->rts[crc_rt].crc_valid;
+         bool full = !fb->draw_extent.minx && !fb->draw_extent.miny &&
+                     fb->draw_extent.maxx == (fb->width - 1) &&
+                     fb->draw_extent.maxy == (fb->height - 1);
+
+         /* If the CRC was valid it stays valid; if it wasn't, we must
+          * ensure the render operation covers the full frame, and
+          * clean tiles are pushed to memory. */
+         bool new_valid = *valid | (full && pan_clean_tile_write_rt_enabled(
+                                               clean_tile, crc_rt));
+
+         cfg.crc_read_enable = *valid;
+
+         /* If the data is currently invalid, still write CRC
+          * data if we are doing a full write, so that it is
+          * valid for next time. */
+         cfg.crc_write_enable = new_valid;
+
+         *valid = new_valid;
+      }
+   }
+
+   fbd_data.z_clear = util_bitpack_float(fb->zs.clear_value.depth);
+
+   {
+      /* Set the DBD and RTD pointers. Both must be 64-byte aligned. */
+      uint64_t out_gpu_addr =
+         framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
+
+      if (has_zs_crc_ext) {
+         fbd_data.dbd_pointer = out_gpu_addr;
+         assert(fbd_data.dbd_pointer % 64 == 0);
+         out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
+      }
+
+      fbd_data.rtd_pointer = out_gpu_addr;
+      assert(fbd_data.rtd_pointer % 64 == 0);
+   }
+
+   memcpy(out, &fbd_data, sizeof(fbd_data));
+   out += ALIGN_POT(sizeof(fbd_data), 64);
+
+   if (has_zs_crc_ext) {
+      struct mali_zs_crc_extension_packed *zs_crc_ext = out;
+      pan_emit_zs_crc_ext(fb, layer_idx, crc_rt, zs_crc_ext, clean_tile);
+      out += pan_size(ZS_CRC_EXTENSION);
+   }
+
+   const unsigned rt_count = MAX2(fb->rt_count, 1);
+   unsigned cbuf_offset = 0;
+   for (unsigned i = 0; i < rt_count; i++) {
+      pan_emit_rt(fb, layer_idx, i, cbuf_offset, out, clean_tile);
+      out += pan_size(RENDER_TARGET);
+
+      if (!fb->rts[i].view)
+         continue;
+
+      cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) *
+                     fb->tile_size *
+                     pan_image_view_get_nr_samples(fb->rts[i].view);
+
+      if (i != crc_rt && fb->rts[i].crc_valid != NULL)
+         *(fb->rts[i].crc_valid) = false;
+   }
+
+   return 0;
+}
+#else
+unsigned
+GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
+                   const struct pan_tls_info *tls,
+                   const struct pan_tiler_context *tiler_ctx,
+                   const struct pan_ptr framebuffer)
+{
+   void *out = framebuffer.cpu;
+
    PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
 
    check_fb_attachments(fb);
@@ -1351,6 +1498,7 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
    }
 
    return tag.opaque[0];
 }
+#endif /* PAN_ARCH >= 14 */
 
 #else /* PAN_ARCH == 4 */
 
 static enum mali_color_format pan_sfbd_raw_format(unsigned bits)
@@ -1378,8 +1526,11 @@ GENX(pan_select_tile_size)(struct pan_fb_info *fb)
 unsigned
 GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
                    const struct pan_tls_info *tls,
-                   const struct pan_tiler_context *tiler_ctx, void *fbd)
+                   const struct pan_tiler_context *tiler_ctx,
+                   const struct pan_ptr framebuffer)
 {
+   void *fbd = framebuffer.cpu;
+
    PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
 
    assert(fb->rt_count <= 1);
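The HSR interleave test in the v14 pan_emit_fbd() above is easy to misread, so here
is a worked instance with made-up numbers (the real budget comes from
fb->z_tile_buf_budget):

   #include "util/u_math.h"

   /* Illustrative only: 32 KiB ZS tile-buffer budget, D24S8 at 4 B/px. */
   static inline bool
   hsr_can_interleave_example(void)
   {
      const unsigned z_tile_buf_budget = 32768; /* assumed, bytes */
      const unsigned zs_bytes_per_pixel = 4;    /* D24S8 */
      const unsigned tile_size = 8192;          /* 64x128 px effective tile */

      const unsigned max_zs_tile_size_interleave =
         z_tile_buf_budget >> util_logbase2_ceil(zs_bytes_per_pixel);
      /* 32768 >> 2 == 8192 px: this tile still interleaves; a 128x128
       * tile (16384 px) would not, which also disables the HSR prepass. */
      return tile_size <= max_zs_tile_size_interleave;
   }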
diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h
index db5b6588ad3..7cc7639c897 100644
--- a/src/panfrost/lib/pan_desc.h
+++ b/src/panfrost/lib/pan_desc.h
@@ -341,7 +341,7 @@ void GENX(pan_emit_afrc_color_attachment)(const struct pan_attachment_info *att,
 unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
                             const struct pan_tls_info *tls,
                             const struct pan_tiler_context *tiler_ctx,
-                            void *out);
+                            const struct pan_ptr framebuffer);
 
 #if PAN_ARCH >= 6
 unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height,
diff --git a/src/panfrost/lib/pan_fb.c b/src/panfrost/lib/pan_fb.c
index f9b6c85b2ce..3b3c6c86c5f 100644
--- a/src/panfrost/lib/pan_fb.c
+++ b/src/panfrost/lib/pan_fb.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2026 Collabora, Ltd.
+ * Copyright (C) 2026 Arm Ltd.
  * SPDX-License-Identifier: MIT
  */
 
 #include "pan_fb.h"
@@ -669,9 +670,124 @@ pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode,
 }
 #endif
 
+#if PAN_ARCH >= 14
 uint32_t
-GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
+GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
+                       const struct pan_ptr framebuffer)
 {
+   /* Emit the dynamic framebuffer state. That is, state that may change
+    * per-layer. */
+
+   void *out = framebuffer.cpu;
+   const struct pan_fb_layout *fb = info->fb;
+   const struct pan_fb_load *load = info->load;
+   const struct pan_fb_store *store = info->store;
+   const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info);
+   const bool has_zs_crc_ext = pan_fb_has_zs(fb);
+
+   struct pan_fbd_layer fbd_data = {0};
+   fbd_data.tiler = info->tiler_ctx->valhall.desc;
+
+   /* layer_index in flags0 is used to select the right primitive list in
+    * the tiler context, and frame_arg is the value that's passed to the
+    * fragment shader through r62-r63, which we use to pass gl_Layer. Since
+    * the layer_idx only takes 8 bits, we might use the extra 56 bits we
+    * have in frame_argument to pass other information to the fragment
+    * shader at some point.
+    */
+   assert(info->layer >= info->tiler_ctx->valhall.layer_offset);
+   fbd_data.frame_argument = info->layer;
+
+   pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
+      cfg.pre_frame_0 = pan_fix_frame_shader_mode(info->frame_shaders.modes[0],
+                                                  ct.rts || ct.zs || ct.s);
+      cfg.pre_frame_1 = pan_fix_frame_shader_mode(info->frame_shaders.modes[1],
+                                                  ct.rts || ct.zs || ct.s);
+      cfg.post_frame = info->frame_shaders.modes[2];
+
+      /* Enabling prepass without pipelining is generally not good for
+       * performance, so disable HSR in that case.
+       */
+      cfg.hsr_prepass_enable = info->allow_hsr_prepass &&
+                               pan_fb_can_pipeline_zs(fb);
+      cfg.hsr_prepass_interleaving_enable = pan_fb_can_pipeline_zs(fb);
+      cfg.hsr_prepass_filter_enable = true;
+      cfg.hsr_hierarchical_optimizations_enable = true;
+
+      cfg.internal_layer_index =
+         info->layer - info->tiler_ctx->valhall.layer_offset;
+   }
+
+   pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
+      if (fb->s_format != PIPE_FORMAT_NONE) {
+         cfg.s_clear = load && target_has_clear(&load->s) ?
+                       load->s.clear.stencil : 0;
+         cfg.s_write_enable = store && store->s.store;
+      }
+
+      if (fb->z_format != PIPE_FORMAT_NONE) {
+         cfg.z_internal_format = pan_get_z_internal_format(fb->z_format);
+         cfg.z_write_enable = store && store->zs.store;
+      } else {
+         cfg.z_internal_format = MALI_Z_INTERNAL_FORMAT_D24;
+         assert(!store || !store->zs.store);
+      }
+   }
+
+   fbd_data.z_clear =
+      util_bitpack_float(fb->z_format != PIPE_FORMAT_NONE && load &&
+                         target_has_clear(&load->z)
+                            ? load->z.clear.depth
+                            : 0);
+
+   fbd_data.dcd_pointer = info->frame_shaders.dcd_pointer;
+
+   {
+      /* Set the DBD and RTD pointers. Both must be 64-byte aligned. */
+      uint64_t out_gpu_addr =
+         framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
+
+      if (has_zs_crc_ext) {
+         fbd_data.dbd_pointer = out_gpu_addr;
+         assert(fbd_data.dbd_pointer % 64 == 0);
+         out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
+      }
+
+      fbd_data.rtd_pointer = out_gpu_addr;
+      assert(fbd_data.rtd_pointer % 64 == 0);
+   }
+
+   memcpy(out, &fbd_data, sizeof(fbd_data));
+   out += ALIGN_POT(sizeof(fbd_data), 64);
+
+   if (has_zs_crc_ext) {
+      struct mali_zs_crc_extension_packed zs_crc;
+      emit_zs_crc_desc(info, ct, &zs_crc);
+      memcpy(out, &zs_crc, sizeof(zs_crc));
+      out += sizeof(zs_crc);
+   }
+
+   uint32_t tile_rt_offset_B = 0;
+   for (unsigned rt = 0; rt < fb->rt_count; rt++) {
+      struct mali_rgb_render_target_packed rgb_rt;
+      emit_rgb_rt_desc(info, ct, rt, tile_rt_offset_B, &rgb_rt);
+      memcpy(out, &rgb_rt, sizeof(rgb_rt));
+      out += sizeof(rgb_rt);
+
+      if (fb->rt_formats[rt] != PIPE_FORMAT_NONE) {
+         tile_rt_offset_B += pan_bytes_per_pixel_tib(fb->rt_formats[rt]) *
+                             fb->tile_size_px * fb->sample_count;
+      }
+   }
+   assert(tile_rt_offset_B <= fb->tile_rt_alloc_B);
+
+   return 0;
+}
+#else /* PAN_ARCH < 14 */
+uint32_t
+GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
+                       const struct pan_ptr framebuffer)
+{
+   void *out = framebuffer.cpu;
    const struct pan_fb_layout *fb = info->fb;
    const struct pan_fb_load *load = info->load;
    const struct pan_fb_store *store = info->store;
@@ -823,4 +939,5 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
    }
    return tag.opaque[0];
 }
-#endif
+#endif /* PAN_ARCH >= 14 */
+#endif /* PAN_ARCH >= 5 */
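Note that z_clear above is the raw IEEE-754 bit pattern of the clear value rather
than a normalized integer; util_bitpack_float() only reinterprets the float. A
minimal sketch:

   #include "util/bitpack_helpers.h"

   /* z_clear holds the bit pattern of the depth clear value. */
   uint32_t z_clear = util_bitpack_float(1.0f); /* 0x3f800000 */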
diff --git a/src/panfrost/lib/pan_fb.h b/src/panfrost/lib/pan_fb.h
index c4635f3f4c2..48bfc888b1c 100644
--- a/src/panfrost/lib/pan_fb.h
+++ b/src/panfrost/lib/pan_fb.h
@@ -1,14 +1,20 @@
 /*
  * Copyright (C) 2026 Collabora, Ltd.
+ * Copyright (C) 2026 Arm Ltd.
  * SPDX-License-Identifier: MIT
  */
 
 #ifndef __PAN_FB_H
 #define __PAN_FB_H
 
+#if PAN_ARCH >= 14
+#include "genxml/cs_builder.h"
+#endif
+
+#include "compiler/shader_enums.h"
 #include "genxml/gen_macros.h"
 #include "util/format/u_formats.h"
-#include "compiler/shader_enums.h"
+#include "pan_pool.h"
 
 struct nir_shader;
 struct nir_shader_compiler_options;
@@ -481,7 +487,7 @@ void GENX(pan_fill_fb_info)(const struct pan_fb_desc_info *info,
                             struct pan_fb_info *fbinfo);
 
 uint32_t GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
-                                void *out);
+                                const struct pan_ptr framebuffer);
 #endif
 
 enum ENUM_PACKED pan_fb_shader_op {
@@ -620,4 +626,35 @@ GENX(pan_get_fb_shader)(const struct pan_fb_shader_key *key,
                         const struct nir_shader_compiler_options *nir_options);
 #endif
 
+#if PAN_ARCH >= 14
+/* Framebuffer per-layer state. Keep this structure 64-byte aligned, since
+ * we want the adjacent ZS_CRC_EXTENSION and RENDER_TARGET descriptors
+ * aligned. */
+struct pan_fbd_layer {
+   /** GPU address of the tiler descriptor. */
+   uint64_t tiler;
+
+   /** Frame argument. */
+   uint64_t frame_argument;
+
+   /** An instance of Fragment Flags 0. */
+   struct mali_fragment_flags_0_packed flags0;
+
+   /** An instance of Fragment Flags 2. */
+   struct mali_fragment_flags_2_packed flags2;
+
+   /** Z clear value. */
+   uint32_t z_clear;
+
+   /** GPU address of the draw call descriptors. It may be 0. */
+   uint64_t dcd_pointer;
+
+   /** GPU address of the ZS_CRC_EXTENSION descriptor. It may be 0. */
+   uint64_t dbd_pointer;
+
+   /** GPU address of the RENDER_TARGET descriptors. */
+   uint64_t rtd_pointer;
+} __attribute__((aligned(64)));
+#endif /* PAN_ARCH >= 14 */
+
 #endif /* __PAN_FB_H */
diff --git a/src/panfrost/lib/pan_format.c b/src/panfrost/lib/pan_format.c
index f67a3528ebb..7db35f5ac78 100644
--- a/src/panfrost/lib/pan_format.c
+++ b/src/panfrost/lib/pan_format.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2019 Collabora, Ltd.
+ * Copyright (C) 2026 Arm Ltd.
  * SPDX-License-Identifier: MIT
  */
 
@@ -184,7 +185,27 @@ const struct pan_blendable_format
 const struct pan_format GENX(pan_pipe_format)[PIPE_FORMAT_COUNT] = {
    FMT(NONE, CONSTANT, 0000, L, VTR_IB),
 
-#if PAN_ARCH >= 7
+#if PAN_ARCH >= 14
+   /* Multiplane formats */
+   FMT_YUV(R8G8_R8B8_UNORM, Y8U8Y8V8_422, UVYA, NO_SWAP, CENTER_422, _T____),
+   FMT_YUV(G8R8_B8R8_UNORM, U8Y8V8Y8_422, UYVA, SWAP, CENTER_422, _T____),
+   FMT_YUV(R8B8_R8G8_UNORM, Y8U8Y8V8_422, VYUA, NO_SWAP, CENTER_422, _T____),
+   FMT_YUV(B8R8_G8R8_UNORM, U8Y8V8Y8_422, VUYA, SWAP, CENTER_422, _T____),
+   FMT_YUV(R8_G8B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
+   FMT_YUV(R8_B8G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
+   FMT_YUV(R8_G8_B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
+   FMT_YUV(R8_B8_G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
+
+   FMT_YUV(R8_G8B8_422_UNORM, Y8U8Y8V8_422, YUVA, NO_SWAP, CENTER_422, _T____),
+   FMT_YUV(R8_B8G8_422_UNORM, U8Y8V8Y8_422, YVUA, NO_SWAP, CENTER_422, _T____),
+
+   FMT_YUV(R10_G10B10_420_UNORM, YUYAAYVYAA_420, YUVA, NO_SWAP, CENTER, _T____),
+   FMT_YUV(R10_G10B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, YUVA, NO_SWAP, CENTER_422, _T____),
+
+   /* special internal formats */
+   FMT_YUV(R8G8B8_420_UNORM_PACKED, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
+   FMT_YUV(R10G10B10_420_UNORM_PACKED, Y10U10V10_420, YUVA, NO_SWAP, CENTER, _T____),
+   FMT_YUV(X6R10X6G10_X6R10X6B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, UVYA, NO_SWAP, CENTER_422, _T____),
+#elif PAN_ARCH >= 7
    /* Multiplane formats */
    FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_422, _T____),
    FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP, CENTER_422, _T____),
diff --git a/src/panfrost/lib/pan_format.h b/src/panfrost/lib/pan_format.h
index 7c641c24105..770d8a1bf56 100644
--- a/src/panfrost/lib/pan_format.h
+++ b/src/panfrost/lib/pan_format.h
@@ -168,6 +168,8 @@ extern const struct pan_blendable_format
    pan_blendable_formats_v12[PIPE_FORMAT_COUNT];
 extern const struct pan_blendable_format
    pan_blendable_formats_v13[PIPE_FORMAT_COUNT];
+extern const struct pan_blendable_format
+   pan_blendable_formats_v14[PIPE_FORMAT_COUNT];
 
 uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
 
@@ -184,6 +186,7 @@ pan_blendable_format_table(unsigned arch)
       FMT_TABLE(10);
       FMT_TABLE(12);
       FMT_TABLE(13);
+      FMT_TABLE(14);
 #undef FMT_TABLE
    default:
       assert(!"Unsupported architecture");
@@ -199,6 +202,7 @@ extern const struct pan_format pan_pipe_format_v9[PIPE_FORMAT_COUNT];
 extern const struct pan_format pan_pipe_format_v10[PIPE_FORMAT_COUNT];
 extern const struct pan_format pan_pipe_format_v12[PIPE_FORMAT_COUNT];
 extern const struct pan_format pan_pipe_format_v13[PIPE_FORMAT_COUNT];
+extern const struct pan_format pan_pipe_format_v14[PIPE_FORMAT_COUNT];
 
 static inline const struct pan_format *
 pan_format_table(unsigned arch)
@@ -213,6 +217,7 @@ pan_format_table(unsigned arch)
       FMT_TABLE(10);
      FMT_TABLE(12);
      FMT_TABLE(13);
+     FMT_TABLE(14);
 #undef FMT_TABLE
    default:
       assert(!"Unsupported architecture");
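Assuming natural C layout and that the two packed flag words are one 32-bit word
each, pan_fbd_layer packs into 56 bytes and aligned(64) pads it to exactly one
64-byte chunk, which the 64-byte copy loops later in this patch rely on. A sanity
sketch, illustrative rather than normative:

   #include <assert.h>
   #include <stddef.h>

   /* Assumed offsets; they follow from natural alignment if the packed
    * flag words are 4 bytes each. */
   static_assert(offsetof(struct pan_fbd_layer, frame_argument) == 8, "");
   static_assert(offsetof(struct pan_fbd_layer, z_clear) == 24, "");
   static_assert(offsetof(struct pan_fbd_layer, dcd_pointer) == 32, "");
   static_assert(sizeof(struct pan_fbd_layer) == 64, "padded by aligned(64)");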
diff --git a/src/panfrost/lib/pan_mod.h b/src/panfrost/lib/pan_mod.h
index 25ecaa25d50..1bd9a759a44 100644
--- a/src/panfrost/lib/pan_mod.h
+++ b/src/panfrost/lib/pan_mod.h
@@ -84,6 +84,7 @@ const struct pan_mod_handler *pan_mod_get_handler_v9(uint64_t modifier);
 const struct pan_mod_handler *pan_mod_get_handler_v10(uint64_t modifier);
 const struct pan_mod_handler *pan_mod_get_handler_v12(uint64_t modifier);
 const struct pan_mod_handler *pan_mod_get_handler_v13(uint64_t modifier);
+const struct pan_mod_handler *pan_mod_get_handler_v14(uint64_t modifier);
 
 static inline const struct pan_mod_handler *
 pan_mod_get_handler(unsigned arch, uint64_t modifier)
@@ -105,6 +106,8 @@ pan_mod_get_handler(unsigned arch, uint64_t modifier)
       return pan_mod_get_handler_v12(modifier);
    case 13:
       return pan_mod_get_handler_v13(modifier);
+   case 14:
+      return pan_mod_get_handler_v14(modifier);
    default:
       UNREACHABLE("Unsupported arch");
    }
diff --git a/src/panfrost/lib/pan_texture.c b/src/panfrost/lib/pan_texture.c
index 286b5c18b67..58a413278cf 100644
--- a/src/panfrost/lib/pan_texture.c
+++ b/src/panfrost/lib/pan_texture.c
@@ -223,6 +223,25 @@ pan_clump_format(enum pipe_format format)
    /* YUV-sampling has special cases */
    if (pan_format_is_yuv(format)) {
       switch (format) {
+#if PAN_ARCH >= 14
+      case PIPE_FORMAT_R8G8_R8B8_UNORM:
+      case PIPE_FORMAT_G8R8_B8R8_UNORM:
+      case PIPE_FORMAT_R8B8_R8G8_UNORM:
+      case PIPE_FORMAT_B8R8_G8R8_UNORM:
+      case PIPE_FORMAT_R8_G8B8_422_UNORM:
+      case PIPE_FORMAT_R8_B8G8_422_UNORM:
+      case PIPE_FORMAT_R8_G8B8_420_UNORM:
+      case PIPE_FORMAT_R8_B8G8_420_UNORM:
+      case PIPE_FORMAT_R8_G8_B8_420_UNORM:
+      case PIPE_FORMAT_R8_B8_G8_420_UNORM:
+      case PIPE_FORMAT_R8G8B8_420_UNORM_PACKED:
+         return MALI_CLUMP_FORMAT_RAW8;
+      case PIPE_FORMAT_R10_G10B10_420_UNORM:
+      case PIPE_FORMAT_R10G10B10_420_UNORM_PACKED:
+      case PIPE_FORMAT_R10_G10B10_422_UNORM:
+      case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
+         return MALI_CLUMP_FORMAT_R10_PACKED;
+#else
       case PIPE_FORMAT_R8G8_R8B8_UNORM:
       case PIPE_FORMAT_G8R8_B8R8_UNORM:
       case PIPE_FORMAT_R8B8_R8G8_UNORM:
@@ -242,6 +261,7 @@ pan_clump_format(enum pipe_format format)
       case PIPE_FORMAT_R10_G10B10_422_UNORM:
      case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
         return MALI_CLUMP_FORMAT_Y10_UV10_422;
+#endif /* PAN_ARCH >= 14 */
       default:
          UNREACHABLE("unhandled clump format");
       }
diff --git a/src/panfrost/libpan/libpan.h b/src/panfrost/libpan/libpan.h
index ed7c5c66f29..cc79ea92b74 100644
--- a/src/panfrost/libpan/libpan.h
+++ b/src/panfrost/libpan/libpan.h
@@ -28,6 +28,8 @@
 #include "libpan_v12.h"
 #elif (PAN_ARCH == 13)
 #include "libpan_v13.h"
+#elif (PAN_ARCH == 14)
+#include "libpan_v14.h"
 #else
 #error "Unsupported architecture for libpan"
 #endif
diff --git a/src/panfrost/libpan/libpan_shaders.h b/src/panfrost/libpan/libpan_shaders.h
index 5154cef68d7..d51761abf64 100644
--- a/src/panfrost/libpan/libpan_shaders.h
+++ b/src/panfrost/libpan/libpan_shaders.h
@@ -26,6 +26,8 @@
 #include "libpan_shaders_v12.h"
 #elif (PAN_ARCH == 13)
 #include "libpan_shaders_v13.h"
+#elif (PAN_ARCH == 14)
+#include "libpan_shaders_v14.h"
 #else
 #error "Unsupported architecture for libpan"
 #endif
diff --git a/src/panfrost/libpan/meson.build b/src/panfrost/libpan/meson.build
index 734660b5735..dfe40fff9c1 100644
--- a/src/panfrost/libpan/meson.build
+++ b/src/panfrost/libpan/meson.build
@@ -11,7 +11,7 @@ libpan_shader_files = files(
 
 idep_libpan_per_arch = {}
 
-foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
+foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14']
   libpan_spv = custom_target(
     input : libpan_shader_files,
     output : 'libpan_v' + ver + '.spv',
diff --git a/src/panfrost/model/pan_model.c b/src/panfrost/model/pan_model.c
index f9861ace8dc..4b28c4067fb 100644
--- a/src/panfrost/model/pan_model.c
+++ b/src/panfrost/model/pan_model.c
@@ -95,6 +95,10 @@ const struct pan_model pan_model_list[] = {
                    MODEL_RATES(4, 8, 128)),
    FIFTHGEN_MODEL(PAN_PROD_ID(13, 8, 0), 4, "G725", "TKRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
                    MODEL_RATES(4, 8, 128)),
+   FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 1, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
+                   MODEL_RATES(4, 8, 64)),
+   FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 4, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
+                   MODEL_RATES(4, 8, 128)),
 };
 
 /* clang-format on */
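The header changes below also redraw the fragment-subqueue register map: on v14,
RUN_FRAGMENT2 consumes staging registers r0-r55, so everything the driver keeps
live across a run has to move above that window. Summarizing the layout implied by
the enum that follows (register numbers as defined by this patch):

   /* v14 fragment-queue register budget:
    *   r0..r55  RUN_FRAGMENT2 staging registers
    *   r58:r59  PANVK_CS_REG_TILER_DESC_PTR (first tiler descriptor)
    * Pre-v14 kept RUN_FRAGMENT staging in r38..r46; the bookkeeping
    * registers in get_fb_descs() accordingly move from r47/r48/r50 up to
    * r60/r61/r64 so they stay clear of the staging window. */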
diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index 7e7e8922c88..65d08df53a1 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -74,7 +74,11 @@ static inline uint32_t
 get_fbd_size(bool has_zs_ext, uint32_t rt_count)
 {
    assert(rt_count >= 1 && rt_count <= MAX_RTS);
+#if PAN_ARCH >= 14
+   uint32_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
+#else
    uint32_t fbd_size = pan_size(FRAMEBUFFER);
+#endif
    if (has_zs_ext)
       fbd_size += pan_size(ZS_CRC_EXTENSION);
    fbd_size += pan_size(RENDER_TARGET) * rt_count;
@@ -209,13 +213,25 @@ enum panvk_cs_regs {
    PANVK_CS_REG_RUN_IDVS_SR_END = 60,
 #endif
 
+#if PAN_ARCH >= 14
+   /* RUN_FRAGMENT2 staging regs.
+    * SW ABI:
+    * - r58:59 contain the pointer to the first tiler descriptor. This is
+    *   needed to gather completed heap chunks after a run_fragment2.
+    */
+   PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0,
+   PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55,
+   PANVK_CS_REG_TILER_DESC_PTR = 58,
+#else
    /* RUN_FRAGMENT staging regs.
     * SW ABI:
-    * - r38:39 contain the pointer to the first tiler descriptor. This is
+    * - r58:59 contain the pointer to the first tiler descriptor. This is
     *   needed to gather completed heap chunks after a run_fragment.
     */
    PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38,
    PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
+   PANVK_CS_REG_TILER_DESC_PTR = 58,
+#endif
 
    /* RUN_COMPUTE staging regs. */
    PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
@@ -870,4 +886,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages,
 void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf,
                                   struct panvk_cs_deps deps);
 
+#if PAN_ARCH >= 14
+static inline void
+cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
+{
+   /* Emit the dynamic fragment state. This state may change per-layer. */
+
+   cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
+                offsetof(struct pan_fbd_layer, flags0));
+   cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
+                offsetof(struct pan_fbd_layer, flags2));
+   cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
+                offsetof(struct pan_fbd_layer, z_clear));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, tiler));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, rtd_pointer));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, dbd_pointer));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
+                offsetof(struct pan_fbd_layer, frame_argument));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, dcd_pointer));
+
+   cs_flush_loads(b);
+}
+#endif /* PAN_ARCH >= 14 */
+
 #endif /* PANVK_CMD_BUFFER_H */
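This helper replaces the pre-v14 trick of re-pointing a tagged FBD between layers:
each RUN_FRAGMENT2 is preceded by a reload of the per-layer staging registers from
memory. The multi-layer loop in issue_fragment_jobs() later in this patch uses it
roughly like this (all names from this patch):

   /* Per-layer fragment submission on v14. */
   cs_emit_layer_fragment_state(b, fbd_pointer);
   cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false,
                          MALI_TILE_RENDER_ORDER_Z_ORDER);
   /* Step to the next layer's pan_fbd_layer block. */
   cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz);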
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 794da0f16d7..c75dd998f7e 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -51,6 +51,7 @@
 #include "vk_render_pass.h"
 
 #include "poly/geometry.h"
 
+#if PAN_ARCH < 14
 static enum cs_reg_perm
 provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg)
 {
@@ -202,6 +203,7 @@ panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev)
    panvk_priv_bo_unref(dev->draw_ctx->fns_bo);
    vk_free(&dev->vk.alloc, dev->draw_ctx);
 }
+#endif /* PAN_ARCH < 14 */
 
 static void
 emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf,
@@ -1245,8 +1247,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
    uint32_t fbds_sz = enabled_layer_count * fbd_sz;
 
-   cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
-      cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
+#if PAN_ARCH >= 14
+   const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
+#else
+   const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
+#endif
+   cmdbuf->state.gfx.render.fbds =
+      panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment);
    if (!cmdbuf->state.gfx.render.fbds.gpu)
       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 
@@ -1316,14 +1323,23 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
       tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
 
       uint32_t new_fbd_flags =
-         GENX(pan_emit_fb_desc)(&fbd_info, fbds.cpu + fbd_sz * i);
+         GENX(pan_emit_fb_desc)(&fbd_info, pan_ptr_offset(fbds, fbd_sz * i));
 
       /* Make sure all FBDs have the same flags. */
       assert(i == 0 || new_fbd_flags == fbd_flags);
       fbd_flags = new_fbd_flags;
    }
 
+#if PAN_ARCH >= 14
+   /* fbd_flags is unused on v14+. */
+   assert(!fbd_flags);
+#endif
+
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
+
+#if PAN_ARCH >= 14
+   // TODO: Implement IR support for v14.
+#else
    for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) {
       struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem(
         cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
@@ -1335,7 +1351,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
 
       for (uint32_t i = 0; i < enabled_layer_count; i++) {
          uint32_t layer_idx = multiview ? u_bit_scan(&ir_view_mask_temp) : i;
-         void *ir_fbd = (void *)((uint8_t *)ir_fbds.cpu + (i * fbd_sz));
 
         fbd_info.layer = layer_idx;
         tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
@@ -1353,8 +1368,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
         if (result != VK_SUCCESS)
            return result;
 
-        ASSERTED uint32_t new_fbd_flags =
-           GENX(pan_emit_fb_desc)(&fbd_info, ir_fbd);
+        ASSERTED uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)(
+           &fbd_info, pan_ptr_offset(ir_fbds, fbd_sz * i));
 
         /* Make sure all FBDs have the same flags. */
         assert(new_fbd_flags == fbd_flags);
@@ -1367,16 +1382,14 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
 
    /* Wait for IR info push to complete */
    cs_wait_slot(b, SB_ID(LS));
-
-   bool unset_provoking_vertex =
-      cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
+#endif /* PAN_ARCH >= 14 */
 
    if (copy_fbds) {
-      struct cs_index cur_tiler = cs_reg64(b, 38);
+      struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
       struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
-      struct cs_index fbd_idx = cs_reg32(b, 47);
-      struct cs_index src_fbd_ptr = cs_reg64(b, 48);
-      struct cs_index remaining_layers_in_td = cs_reg32(b, 50);
+      struct cs_index fbd_idx = cs_reg32(b, 60);
+      struct cs_index src_fbd_ptr = cs_reg64(b, 64);
+      struct cs_index remaining_layers_in_td = cs_reg32(b, 61);
       uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
                                        MAX_LAYERS_PER_TILER_DESC);
 
@@ -1400,10 +1413,27 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
       * framebuffer size is aligned on 64 bytes. */
       assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
 
+#if PAN_ARCH >= 14
+      for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
+         cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
+                    BITFIELD_MASK(16), fbd_off);
+
+         /* Patch the Tiler pointer. */
+         if (fbd_off == 0)
+            cs_add64(b, cs_scratch_reg64(b, 0), cur_tiler, 0);
+
+         cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
+                  BITFIELD_MASK(16), fbd_off);
+      }
+#else
+      bool unset_provoking_vertex =
+         cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
+
       for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
          if (fbd_off == 0) {
             cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr,
                        BITFIELD_MASK(14), fbd_off);
+
+            /* Patch the Tiler pointer. */
             cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
 
             /* If we don't know what provoking vertex mode the
@@ -1423,6 +1453,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
             cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
                      BITFIELD_MASK(16), fbd_off);
       }
+#endif
 
       /* Finish stores to pass_dst_fbd_ptr. */
       cs_flush_stores(b);
@@ -1459,9 +1490,11 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
    cs_update_frag_ctx(b) {
       cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
                    fbds.gpu | fbd_flags);
-      cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
+      cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR),
+                   cmdbuf->state.gfx.render.tiler);
    }
 
+#if PAN_ARCH < 14
    /* If we don't know what provoking vertex mode the application wants yet,
    * leave space to patch it later */
    if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
@@ -1483,6 +1516,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
       cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
          cs_call(b, addr_reg, length_reg);
    }
+#endif
 }
 
    return VK_SUCCESS;
@@ -3299,6 +3333,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
 static void
 setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
 {
+#if PAN_ARCH >= 14
+   // TODO: Implement IR support for v14.
+#else
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
    const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
    const bool has_zs_ext = pan_fb_has_zs(fb);
@@ -3343,6 +3380,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
                TILER_OOM_CTX_FIELD_OFFSET(layer_count));
 
    cs_flush_stores(b);
+#endif /* PAN_ARCH >= 14 */
 }
 
 static uint32_t
@@ -3351,24 +3389,106 @@ pack_32_2x16(uint16_t lo, uint16_t hi)
    return (((uint32_t)hi) << 16) | (uint32_t)lo;
 }
 
+#if PAN_ARCH >= 14
+static void
+cs_emit_static_fragment_state(struct cs_builder *b,
+                              struct panvk_cmd_buffer *cmdbuf)
+{
+   /* Emit the static fragment staging registers. These don't change
+    * per-layer. */
+
+   const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render;
+   const struct pan_fb_layout *fb = &render->fb.layout;
+
+   const uint8_t sample_count = render->fb.layout.sample_count;
+
+   const struct pan_fb_bbox fb_area_px =
+      pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px);
+   const struct pan_fb_bbox bbox_px =
+      pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px);
+
+   assert(pan_fb_bbox_is_valid(fb->tiling_area_px));
+
+   struct mali_fragment_bounding_box_packed bbox;
+   pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
+      cfg.bound_min_x = bbox_px.min_x;
+      cfg.bound_min_y = bbox_px.min_y;
+      cfg.bound_max_x = bbox_px.max_x;
+      cfg.bound_max_y = bbox_px.max_y;
+   }
+
+   struct mali_frame_size_packed frame_size;
+   pan_pack(&frame_size, FRAME_SIZE, cfg) {
+      cfg.width = fb->width_px;
+      cfg.height = fb->height_px;
+   }
+
+   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), bbox.opaque[0]);
+   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX), bbox.opaque[1]);
+   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
+   cs_move64_to(
+      b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
+      dev->sample_positions->addr.dev +
+         pan_sample_positions_offset(pan_sample_pattern(sample_count)));
+
+   /* Flags 1 */
+   struct mali_fragment_flags_1_packed flags1;
+   pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
+      cfg.sample_count = fb->sample_count;
+      cfg.sample_pattern = pan_sample_pattern(fb->sample_count);
+      cfg.effective_tile_size = fb->tile_size_px;
+      cfg.point_sprite_coord_origin_max_y = false;
+      cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf);
+
+      assert(fb->rt_count > 0);
+      cfg.render_target_count = fb->rt_count;
+      cfg.color_buffer_allocation = fb->tile_rt_alloc_B;
+   }
+   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
+
+   /* If we don't know what provoking vertex mode the application wants yet,
+    * leave space to patch it later */
+   if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
+      cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
+      {
+         /* The provoking_vertex flag is bit 14 of Fragment Flags 1;
+          * subtracting (1 << 14) clears a set bit without touching the
+          * other fields. */
+         cs_add32(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1),
+                  cs_sr_reg32(b, FRAGMENT, FLAGS_1), -(1 << 14));
+      }
+   }
+
+   /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
+}
+#endif /* PAN_ARCH >= 14 */
+
 static VkResult
 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
 {
+#if PAN_ARCH < 14
    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+#endif
    const struct cs_tracing_ctx *tracing_ctx =
       &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
-   const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
    bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
 
    /* Now initialize the fragment bits. */
+   struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
    cs_update_frag_ctx(b) {
+#if PAN_ARCH >= 14
+      cs_emit_static_fragment_state(b, cmdbuf);
+      cs_emit_layer_fragment_state(b, fbd_pointer);
+#else
+      const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
       cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
                    pack_32_2x16(fb->tiling_area_px.min_x,
                                 fb->tiling_area_px.min_y));
       cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX),
                    pack_32_2x16(fb->tiling_area_px.max_x,
                                 fb->tiling_area_px.max_y));
+#endif
    }
 
    bool simul_use =
@@ -3401,6 +3521,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
     * state for this renderpass, so it's safe to enable. */
    struct cs_index addr_reg = cs_scratch_reg64(b, 0);
    struct cs_index length_reg = cs_scratch_reg32(b, 2);
+#if PAN_ARCH >= 14
+   // TODO: Implement IR support for v14.
+#else
    uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
    uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
                            handler_idx * dev->tiler_oom.handler_stride;
@@ -3408,6 +3531,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
    cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM,
                             addr_reg, length_reg);
+#endif
 
    /* Wait for the tiling to be done before submitting the fragment job. */
    wait_finish_tiling(cmdbuf);
@@ -3422,8 +3546,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    * up. */
    cs_move64_to(b, addr_reg, 0);
    cs_move32_to(b, length_reg, 0);
+#if PAN_ARCH >= 14
+   // TODO: Implement IR support for v14.
+#else
    cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM,
                             addr_reg, length_reg);
+#endif
 
    /* Applications tend to forget to describe subpass dependencies, especially
    * when it comes to write -> read dependencies on attachments. The
@@ -3439,8 +3567,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    }
 
    if (cmdbuf->state.gfx.render.layer_count <= 1) {
+#if PAN_ARCH >= 14
+      cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                             false, MALI_TILE_RENDER_ORDER_Z_ORDER);
+#else
       cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
                             false, MALI_TILE_RENDER_ORDER_Z_ORDER);
+#endif
    } else {
       struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4);
       struct cs_index remaining_layers = cs_scratch_reg32(b, 4);
@@ -3449,12 +3582,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
       cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) {
          cs_add32(b, remaining_layers, remaining_layers, -1);
 
+#if PAN_ARCH >= 14
+         cs_emit_layer_fragment_state(b, fbd_pointer);
+         cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false,
+                                MALI_TILE_RENDER_ORDER_Z_ORDER);
+#else
          cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false,
                                MALI_TILE_RENDER_ORDER_Z_ORDER);
+#endif
 
          cs_update_frag_ctx(b)
-            cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
-                     cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz);
+            cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz);
       }
    }
 
@@ -3468,8 +3606,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
    struct cs_index completed_top = cs_scratch_reg64(b, 10);
    struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
-   struct cs_index cur_tiler = cs_reg64(b, 38);
-   struct cs_index tiler_count = cs_reg32(b, 47);
+   struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
+   struct cs_index tiler_count = cs_reg32(b, 60);
 
    struct cs_index oq_chain = cs_scratch_reg64(b, 10);
    struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
    struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);
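A behavioural difference worth spelling out: before v14 the FBD pointer handed to
RUN_FRAGMENT carries layout tag bits in its low bits, while on v14 it is a plain
address (hence the assert(!fbd_flags) in get_fb_descs() above and the literal 0
added to the scratch FBD pointer in the OOM handler below). A sketch of the tagging
this removes, with illustrative variables:

   /* Pre-v14: the descriptor layout rides in the pointer's low bits. */
   struct mali_framebuffer_pointer_packed fb_tag;
   pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) {
      cfg.zs_crc_extension_present = has_zs_ext; /* illustrative */
      cfg.render_target_count = rt_count;        /* illustrative */
   }
   uint64_t tagged_fbd = fbd_addr | fb_tag.opaque[0];

   /* v14: RUN_FRAGMENT2 takes the untagged address; the layout now comes
    * from the FRAGMENT staging registers (FLAGS_1.render_target_count,
    * the DBD/RTD pointers, ...). */
   uint64_t untagged_fbd = fbd_addr;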
diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
index b4cf6855184..72e805dc5ac 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
@@ -13,8 +13,9 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
 {
    switch (reg) {
    /* The bbox is set up by the fragment subqueue, we should not modify it. */
-   case 42:
-   case 43:
+   case MALI_FRAGMENT_SR_BBOX_MIN:
+   case MALI_FRAGMENT_SR_BBOX_MAX:
+
    /* We should only load from the subqueue context. */
    case PANVK_CS_REG_SUBQUEUE_CTX_START:
    case PANVK_CS_REG_SUBQUEUE_CTX_END:
@@ -42,8 +43,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
    cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8),
             8 * sizeof(uint32_t));
 
+#if PAN_ARCH >= 14
+   const size_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
+#else
+   const size_t fbd_size = sizeof(struct mali_framebuffer_packed);
+#endif
+
    if (has_zs_ext) {
-      const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed);
+      const uint16_t dbd_offset = fbd_size;
 
       /* Copy the whole DBD. */
       cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other,
@@ -57,8 +64,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
    }
 
    const uint16_t rts_offset =
-      sizeof(struct mali_framebuffer_packed) +
-      (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
+      fbd_size + (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
 
    for (uint32_t rt = 0; rt < rt_count; rt++) {
       const uint16_t rt_offset =
@@ -110,12 +116,14 @@ generate_tiler_oom_handler(struct panvk_device *dev,
       .tracebuf_addr_offset =
         offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
    };
 
+#if PAN_ARCH < 14
    struct mali_framebuffer_pointer_packed fb_tag;
 
    pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) {
       cfg.zs_crc_extension_present = has_zs_ext;
       cfg.render_target_count = rt_count;
    }
+#endif
 
    cs_function_def(&b, &handler, handler_ctx) {
      struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
@@ -140,7 +148,7 @@ generate_tiler_oom_handler(struct panvk_device *dev,
      struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4);
 
      /* The tiler pointer is pre-filled. */
-     struct cs_index tiler_ptr = cs_reg64(&b, 38);
+     struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR);
 
      cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr));
@@ -176,11 +184,17 @@ generate_tiler_oom_handler(struct panvk_device *dev,
      cs_wait_slot(&b, SB_ID(LS));
 
      /* Set FBD pointer to the scratch fbd */
-     cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER),
-              scratch_fbd_ptr_reg, fb_tag.opaque[0]);
-
+     struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER);
+#if PAN_ARCH >= 14
+     cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0);
+     cs_emit_layer_fragment_state(&b, fbd_pointer);
+     cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false,
+                            MALI_TILE_RENDER_ORDER_Z_ORDER);
+#else
+     cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, fb_tag.opaque[0]);
      cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false,
                            MALI_TILE_RENDER_ORDER_Z_ORDER);
+#endif
 
      /* Serialize run fragments since we reuse FBD for the runs */
      cs_wait_slots(&b, dev->csf.sb.all_iters_mask);
diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c
index c4848fe575b..b738be274d3 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c
@@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue)
    tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size;
 
    alloc_info.size = get_fbd_size(true, MAX_RTS);
-   alloc_info.alignment = pan_alignment(FRAMEBUFFER);
+#if PAN_ARCH >= 14
+   const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
+#else
+   const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
+#endif
+   alloc_info.alignment = fbds_alignment;
 
    tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
    if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) {
       result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c
index 0579034aea2..9879ca8b112 100644
--- a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c
@@ -181,7 +181,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
          fbd_info.layer = layer_id;
          fbd_info.frame_shaders = fs;
         fbd_info.frame_shaders.dcd_pointer += layer_id * 3 * pan_size(DRAW);
-        tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd.cpu);
+        tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd);
 
         result = panvk_cmd_prepare_fragment_job(cmdbuf, tagged_fbd_ptr);
         if (result != VK_SUCCESS)
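For sizing intuition, init_tiler() above allocates the worst-case OOM FBD via
get_fbd_size(true, MAX_RTS). Expanded for v14, with the descriptor sizes left
symbolic since pan_size() values are arch-specific:

   /* Worst-case OOM FBD on v14, per get_fbd_size() in panvk_cmd_buffer.h:
    *    ALIGN_POT(sizeof(struct pan_fbd_layer), 64)  // 64 B layer block
    *  + pan_size(ZS_CRC_EXTENSION)                   // ZS/CRC descriptor
    *  + MAX_RTS * pan_size(RENDER_TARGET)            // one RTD per RT
    * pan_emit_fbd() asserts that the DBD and RTD land 64-byte aligned,
    * which constrains the first two terms to multiples of 64 B. */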
diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build
index d79bcf885a7..ce06192d50a 100644
--- a/src/panfrost/vulkan/meson.build
+++ b/src/panfrost/vulkan/meson.build
@@ -14,6 +14,7 @@ panvk_entrypoints = custom_target(
     '--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7',
     '--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10',
     '--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13',
+    '--device-prefix', 'panvk_v14',
     '--beta', with_vulkan_beta.to_string()
   ],
   depend_files : vk_entrypoints_gen_depend_files,
@@ -65,7 +66,7 @@ valhall_archs = [9, 10]
 valhall_inc_dir = ['valhall']
 valhall_files = []
 
-fifthgen_archs = [12, 13]
+fifthgen_archs = [12, 13, 14]
 fifthgen_inc_dir = ['fifthgen']
 fifthgen_files = []
 
@@ -83,7 +84,7 @@ jm_files = [
   'jm/panvk_vX_gpu_queue.c',
 ]
 
-csf_archs = [10, 12, 13]
+csf_archs = [10, 12, 13, 14]
 csf_inc_dir = ['csf']
 csf_files = [
   'csf/panvk_vX_bind_queue.c',
@@ -126,7 +127,7 @@ common_per_arch_files = [
   sha1_h,
 ]
 
-foreach arch : [6, 7, 10, 12, 13]
+foreach arch : [6, 7, 10, 12, 13, 14]
   per_arch_files = common_per_arch_files
   inc_panvk_per_arch = []
diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h
index 8de69cfdb42..7c11787fd44 100644
--- a/src/panfrost/vulkan/panvk_cmd_draw.h
+++ b/src/panfrost/vulkan/panvk_cmd_draw.h
@@ -243,7 +243,7 @@ struct panvk_cmd_graphics_state {
       }                                                                       \
    } while (0)
 
-#if PAN_ARCH >= 10
+#if PAN_ARCH >= 10 && PAN_ARCH < 14
 struct panvk_device_draw_context {
    struct panvk_priv_bo *fns_bo;
    uint64_t fn_set_fbds_provoking_vertex_stride;
@@ -376,8 +376,7 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
       gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS);                        \
    } while (0)
 
-
-#if PAN_ARCH >= 10
+#if PAN_ARCH >= 10 && PAN_ARCH < 14
 VkResult panvk_per_arch(device_draw_context_init)(struct panvk_device *dev);
diff --git a/src/panfrost/vulkan/panvk_macros.h b/src/panfrost/vulkan/panvk_macros.h
index 940d00522bb..09253ffdb93 100644
--- a/src/panfrost/vulkan/panvk_macros.h
+++ b/src/panfrost/vulkan/panvk_macros.h
@@ -61,6 +61,9 @@ panvk_catch_indirect_alloc_failure(VkResult error)
    case 13:                                                                   \
       panvk_arch_name(name, v13)(__VA_ARGS__);                                \
       break;                                                                  \
+   case 14:                                                                   \
+      panvk_arch_name(name, v14)(__VA_ARGS__);                                \
+      break;                                                                  \
    default:                                                                   \
       UNREACHABLE("Unsupported architecture");                                \
    }                                                                          \
@@ -84,6 +87,9 @@ panvk_catch_indirect_alloc_failure(VkResult error)
    case 13:                                                                   \
       ret = panvk_arch_name(name, v13)(__VA_ARGS__);                          \
       break;                                                                  \
+   case 14:                                                                   \
+      ret = panvk_arch_name(name, v14)(__VA_ARGS__);                          \
+      break;                                                                  \
    default:                                                                   \
       UNREACHABLE("Unsupported architecture");                                \
    }                                                                          \
@@ -102,6 +108,8 @@ panvk_catch_indirect_alloc_failure(VkResult error)
 #define panvk_per_arch(name) panvk_arch_name(name, v12)
 #elif PAN_ARCH == 13
 #define panvk_per_arch(name) panvk_arch_name(name, v13)
+#elif PAN_ARCH == 14
+#define panvk_per_arch(name) panvk_arch_name(name, v14)
 #else
 #error "Unsupported arch"
 #endif
diff --git a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c
index 1e95c5c9390..bb18df6b49a 100644
--- a/src/panfrost/vulkan/panvk_physical_device.c
+++ b/src/panfrost/vulkan/panvk_physical_device.c
@@ -64,6 +64,7 @@ PER_ARCH_FUNCS(7);
 PER_ARCH_FUNCS(10);
 PER_ARCH_FUNCS(12);
 PER_ARCH_FUNCS(13);
+PER_ARCH_FUNCS(14);
 
 static VkResult
 create_kmod_dev(struct panvk_physical_device *device,
@@ -411,6 +412,7 @@ panvk_physical_device_init(struct panvk_physical_device *device,
    switch (arch) {
    case 6:
    case 7:
+   case 14:
      if (!os_get_option("PAN_I_WANT_A_BROKEN_VULKAN_DRIVER")) {
         result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
                               "WARNING: panvk is not well-tested on v%d, "
diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c
index c32d2f279e8..93b8a8e21af 100644
--- a/src/panfrost/vulkan/panvk_vX_device.c
+++ b/src/panfrost/vulkan/panvk_vX_device.c
@@ -550,7 +550,7 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
       goto err_free_precomp;
    }
 
-#if PAN_ARCH >= 10
+#if PAN_ARCH >= 10 && PAN_ARCH < 14
    result = panvk_per_arch(device_draw_context_init)(device);
    if (result != VK_SUCCESS)
       goto err_free_mem_cache;
@@ -616,7 +616,7 @@ err_finish_queues:
    panvk_meta_cleanup(device);
 
 err_free_draw_ctx:
-#if PAN_ARCH >= 10
+#if PAN_ARCH >= 10 && PAN_ARCH < 14
    panvk_per_arch(device_draw_context_cleanup)(device);
 
 err_free_mem_cache:
 #endif
@@ -679,7 +679,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device,
    }
 
    panvk_precomp_cleanup(device);
-#if PAN_ARCH >= 10
+#if PAN_ARCH >= 10 && PAN_ARCH < 14
    panvk_per_arch(device_draw_context_cleanup)(device);
 #endif
    panvk_meta_cleanup(device);