diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c
index 357d4cfeff0..d6e143de182 100644
--- a/src/gallium/drivers/panfrost/pan_csf.c
+++ b/src/gallium/drivers/panfrost/pan_csf.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2023 Collabora Ltd.
+ * Copyright (C) 2026 Arm Ltd.
  * SPDX-License-Identifier: MIT
  */
 
@@ -13,6 +14,7 @@
 #include "pan_cmdstream.h"
 #include "pan_context.h"
 #include "pan_csf.h"
+#include "pan_fb.h"
 #include "pan_fb_preload.h"
 #include "pan_job.h"
 #include "pan_trace.h"
@@ -75,6 +77,99 @@ csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr)
     (PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct pan_ptr)) +    \
     offsetof(struct pan_ptr, gpu))
 
+#if PAN_ARCH >= 14
+static void
+cs_emit_static_fragment_state(struct cs_builder *b,
+                              struct panfrost_batch *batch,
+                              const struct pan_fb_info *fb)
+{
+   struct mali_fragment_bounding_box_packed bbox;
+   pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
+      cfg.bound_min_x = batch->minx;
+      cfg.bound_min_y = batch->miny;
+      cfg.bound_max_x = batch->maxx - 1;
+      cfg.bound_max_y = batch->maxy - 1;
+   }
+
+   struct mali_frame_size_packed frame_size;
+   pan_pack(&frame_size, FRAME_SIZE, cfg) {
+      cfg.width = fb->width;
+      cfg.height = fb->height;
+   }
+
+   cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
+                bbox.opaque[0] | ((uint64_t)bbox.opaque[1] << 32));
+   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
+   cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
+                fb->sample_positions);
+
+   struct mali_fragment_flags_1_packed flags1;
+   pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
+      /* The force_samples setting dictates the sample count that is used
+       * for rasterization, and works like D3D11's ForcedSampleCount
+       * feature:
+       *
+       * - If force_samples == 0: Let nr_samples dictate sample count
+       * - If force_samples == 1: force single-sampled rasterization
+       * - If force_samples >= 2: force multi-sampled rasterization
+       *
+       * This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the
+       * fragment shader, even when performing single-sampled rendering.
+       */
+      if (fb->pls_enabled) {
+         cfg.sample_count = 4;
+         cfg.sample_pattern = pan_sample_pattern(1);
+      } else if (!fb->force_samples) {
+         cfg.sample_count = fb->nr_samples;
+         cfg.sample_pattern = pan_sample_pattern(fb->nr_samples);
+      } else if (fb->force_samples == 1) {
+         cfg.sample_count = fb->nr_samples;
+         cfg.sample_pattern = pan_sample_pattern(1);
+      } else {
+         cfg.sample_count = 1;
+         cfg.sample_pattern = pan_sample_pattern(fb->force_samples);
+      }
+
+      cfg.effective_tile_size = fb->tile_size;
+      cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin;
+      cfg.first_provoking_vertex = fb->first_provoking_vertex;
+      cfg.render_target_count = MAX2(fb->rt_count, 1);
+      cfg.color_buffer_allocation = fb->cbuf_allocation;
+   }
+
+   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
+
+   /* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
+}
+
+#define PAN_CS_REG_FBD_LAYER_PTR 54
+
+static inline void
+cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
+{
+   /* Emit the dynamic fragment state. This state may change per-layer. */
+
+   cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
+                offsetof(struct pan_fbd_layer, flags0));
+   cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
+                offsetof(struct pan_fbd_layer, flags2));
+   cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
+                offsetof(struct pan_fbd_layer, z_clear));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, tiler));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, rtd_pointer));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, dbd_pointer));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
+                offsetof(struct pan_fbd_layer, frame_argument));
+   cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
+                offsetof(struct pan_fbd_layer, dcd_pointer));
+
+   cs_flush_loads(b);
+}
+#endif /* PAN_ARCH >= 14 */
+
 static int
 csf_oom_handler_init(struct panfrost_context *ctx)
 {
@@ -113,13 +208,18 @@ csf_oom_handler_init(struct panfrost_context *ctx)
 
    cs_function_def(&b, &handler, handler_ctx) {
       struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG);
-      struct cs_index counter = cs_reg32(&b, 47);
-      struct cs_index zero = cs_reg64(&b, 48);
-      struct cs_index flush_id = cs_reg32(&b, 48);
-      struct cs_index tiler_ctx = cs_reg64(&b, 50);
-      struct cs_index completed_top = cs_reg64(&b, 52);
-      struct cs_index completed_bottom = cs_reg64(&b, 54);
-      struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4);
+      struct cs_index counter = cs_reg32(&b, 31);
+      struct cs_index zero = cs_reg64(&b, 56);
+      struct cs_index flush_id = cs_reg32(&b, 58);
+      struct cs_index tiler_ctx = cs_reg64(&b, 60);
+      struct cs_index completed_top = cs_reg64(&b, 64);
+      struct cs_index completed_bottom = cs_reg64(&b, 66);
+      struct cs_index completed_chunks = cs_reg_tuple(&b, 64, 4);
+#if PAN_ARCH >= 14
+      struct cs_index fbd_pointer = cs_reg64(&b, PAN_CS_REG_FBD_LAYER_PTR);
+#else
+      struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER);
+#endif
 
       /* Ensure that the OTHER endpoint is valid */
 #if PAN_ARCH >= 11
@@ -133,25 +233,31 @@ csf_oom_handler_init(struct panfrost_context *ctx)
       cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
       cs_wait_slot(&b, 0);
       cs_if(&b, MALI_CS_CONDITION_GREATER, counter) {
-         cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
-                      FBD_OFFSET(MIDDLE));
+         cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(MIDDLE));
       }
       cs_else(&b) {
-         cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
-                      FBD_OFFSET(FIRST));
+         cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(FIRST));
       }
 
+#if PAN_ARCH >= 14
+      cs_emit_layer_fragment_state(&b, fbd_pointer);
+#else
       cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MIN), tiler_oom_ctx,
                    FIELD_OFFSET(bbox_min));
       cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MAX), tiler_oom_ctx,
                    FIELD_OFFSET(bbox_max));
       cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0);
       cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0);
+#endif
       cs_wait_slot(&b, 0);
 
       /* Run the fragment job and wait */
       cs_select_endpoint_sb(&b, 3);
+#if PAN_ARCH >= 14
+      cs_run_fragment2(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
+#else
       cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
+#endif
       cs_wait_slot(&b, 3);
 
       /* Increment counter */
@@ -218,6 +324,21 @@ GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
    panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
 }
 
+#if PAN_ARCH >= 14
+static inline struct pan_ptr
+alloc_fbd(struct panfrost_batch *batch)
+{
+   const struct pan_desc_alloc_info fbd_layer = {
+      .size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64),
+      .align = alignof(struct pan_fbd_layer),
+      .nelems = 1,
+   };
+
+   return pan_pool_alloc_desc_aggregate(
+      &batch->pool.base, fbd_layer, PAN_DESC(ZS_CRC_EXTENSION),
+      PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
+}
+#else
 static inline struct pan_ptr
 alloc_fbd(struct panfrost_batch *batch)
 {
@@ -225,6 +346,7 @@ alloc_fbd(struct panfrost_batch *batch)
       &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
       PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
 }
+#endif /* PAN_ARCH >= 14 */
 
 int
 GENX(csf_init_batch)(struct panfrost_batch *batch)
@@ -854,15 +976,26 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
       cs_vt_end(b, cs_now());
    }
 
+#if PAN_ARCH >= 14
+   struct cs_index fbd_pointer = cs_reg64(b, PAN_CS_REG_FBD_LAYER_PTR);
+#else
+   struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
+#endif
+
    /* Set up the fragment job */
-   cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
-                batch->framebuffer.gpu);
+   cs_move64_to(b, fbd_pointer, batch->framebuffer.gpu);
+
+#if PAN_ARCH >= 14
+   cs_emit_static_fragment_state(b, batch, pfb);
+   cs_emit_layer_fragment_state(b, fbd_pointer);
+#else
    cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
                 (batch->miny << 16) | batch->minx);
    cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX),
                 ((batch->maxy - 1) << 16) | (batch->maxx - 1));
   cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, TEM_POINTER), 0);
   cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, TEM_ROW_STRIDE), 0);
+#endif
 
    /* Use different framebuffer descriptor if incremental rendering was
     * triggered while tiling */
@@ -871,13 +1004,19 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
       cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0);
       cs_wait_slot(b, 0);
       cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
-         cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
-                      GET_FBD(oom_ctx, LAST).gpu);
+         cs_move64_to(b, fbd_pointer, GET_FBD(oom_ctx, LAST).gpu);
+#if PAN_ARCH >= 14
+         cs_emit_layer_fragment_state(b, fbd_pointer);
+#endif
       }
    }
 
    /* Run the fragment job and wait */
+#if PAN_ARCH >= 14
+   cs_run_fragment2(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
+#else
    cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
+#endif
    cs_wait_slot(b, 2);
 
    /* Gather freed heap chunks and add them to the heap context free list