From 1f1c36094bb4be5748ec414c5a6370817e1cde96 Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Mon, 17 Feb 2025 09:29:56 +0100 Subject: [PATCH] panfrost: Add v12 support to the Gallium driver Signed-off-by: Mary Guillemard Reviewed-by: Boris Brezillon Reviewed-by: Lars-Ivar Hesselberg Simonsen Part-of: --- src/gallium/drivers/panfrost/meson.build | 4 +- src/gallium/drivers/panfrost/pan_cmdstream.c | 81 ++++++++++++++++++- src/gallium/drivers/panfrost/pan_cmdstream.h | 2 + src/gallium/drivers/panfrost/pan_csf.c | 46 ++++++++++- src/gallium/drivers/panfrost/pan_fb_preload.c | 9 ++- src/gallium/drivers/panfrost/pan_job.h | 3 + src/gallium/drivers/panfrost/pan_screen.c | 3 + src/gallium/drivers/panfrost/pan_screen.h | 1 + 8 files changed, 140 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index 61db5780f58..4623d24595f 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -41,7 +41,7 @@ compile_args_panfrost = [ '-Wno-pointer-arith' ] -panfrost_versions = ['4', '5', '6', '7', '9', '10'] +panfrost_versions = ['4', '5', '6', '7', '9', '10', '12'] libpanfrost_versions = [] foreach ver : panfrost_versions @@ -53,7 +53,7 @@ foreach ver : panfrost_versions ] if ver in ['4', '5', '6', '7', '9'] files_panfrost_vx += ['pan_jm.c'] - elif ver in ['10'] + elif ver in ['10', '12'] files_panfrost_vx += ['pan_csf.c'] endif libpanfrost_versions += static_library( diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 61fdb5ef48e..6056ba7700b 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -64,7 +64,7 @@ * functions. 
*/ #if PAN_ARCH <= 9 #define JOBX(__suffix) GENX(jm_##__suffix) -#elif PAN_ARCH <= 10 +#elif PAN_ARCH <= 12 #define JOBX(__suffix) GENX(csf_##__suffix) #else #error "Unsupported arch" @@ -707,6 +707,71 @@ panfrost_emit_frag_shader_meta(struct panfrost_batch *batch) } #endif +#if PAN_ARCH >= 12 /* v12+ variant: packs a VIEWPORT descriptor into batch->avalon_viewport and a SCISSOR descriptor into batch->scissor; always returns 0 */ +static uint64_t +panfrost_emit_viewport(struct panfrost_batch *batch) +{ + struct panfrost_context *ctx = batch->ctx; + const struct pipe_viewport_state *vp = &ctx->pipe_viewport; + const struct pipe_scissor_state *ss = &ctx->scissor; + const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + + /* Derive min/max from translate/scale. Note since |x| >= 0 by + * definition, we have that -|x| <= |x| hence translate - |scale| <= + * translate + |scale|, so the ordering is correct here. */ + float vp_minx = vp->translate[0] - fabsf(vp->scale[0]); + float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); + float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); + float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); + + float minz, maxz; + util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); + + /* Viewport clamped to the framebuffer */ + unsigned minx = MIN2(batch->key.width, MAX2((int)vp_minx, 0)); + unsigned maxx = MIN2(batch->key.width, MAX2((int)vp_maxx, 0)); + unsigned miny = MIN2(batch->key.height, MAX2((int)vp_miny, 0)); + unsigned maxy = MIN2(batch->key.height, MAX2((int)vp_maxy, 0)); + + if (ss && rast->scissor) { + minx = MAX2(ss->minx, minx); + miny = MAX2(ss->miny, miny); + maxx = MIN2(ss->maxx, maxx); + maxy = MIN2(ss->maxy, maxy); + } + + /* Set the range to [1, 1) so the unsigned max values do not wrap around when decremented below */ + if (maxx == 0 || maxy == 0) + maxx = maxy = minx = miny = 1; + + panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy); + batch->scissor_culls_everything = (minx >= maxx || miny >= maxy); + + pan_cast_and_pack(&batch->avalon_viewport, VIEWPORT, cfg) { + /* Clamp viewport to valid range: x/y to [0, UINT16_MAX], depth to [0, 1] */ + cfg.min_x = CLAMP(minx, 0, UINT16_MAX); + cfg.min_y
= CLAMP(miny, 0, UINT16_MAX); + cfg.max_x = CLAMP(maxx, 0, UINT16_MAX); + cfg.max_y = CLAMP(maxy, 0, UINT16_MAX); + + cfg.min_depth = CLAMP(minz, 0.0f, 1.0f); + cfg.max_depth = CLAMP(maxz, 0.0f, 1.0f); + } + + /* [minx, maxx) and [miny, maxy) are half-open ranges up to this point; the max values are decremented before packing the scissor (NOTE(review): presumably the hardware scissor maximum is inclusive, confirm) */ + maxx--; + maxy--; + + pan_cast_and_pack(&batch->scissor, SCISSOR, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + + return 0; +} +#else static uint64_t panfrost_emit_viewport(struct panfrost_batch *batch) { @@ -783,6 +848,7 @@ panfrost_emit_viewport(struct panfrost_batch *batch) return 0; #endif } +#endif #if PAN_ARCH >= 9 /** @@ -4002,9 +4068,14 @@ prepare_shader(struct panfrost_compiled_shader *state, return; bool vs = (state->info.stage == MESA_SHADER_VERTEX); - bool secondary_enable = (vs && state->info.vs.secondary_enable); +#if PAN_ARCH >= 12 + unsigned nr_variants = vs ? 2 : 1; +#else + bool secondary_enable = (vs && state->info.vs.secondary_enable); unsigned nr_variants = secondary_enable ? 3 : vs ?
2 : 1; +#endif + struct panfrost_ptr ptr = pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM); @@ -4018,8 +4089,10 @@ prepare_shader(struct panfrost_compiled_shader *state, if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT) cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL; +#if PAN_ARCH < 12 else if (vs) cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF; +#endif cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); @@ -4037,7 +4110,9 @@ prepare_shader(struct panfrost_compiled_shader *state, /* IDVS/triangles */ pan_pack(&programs[1], SHADER_PROGRAM, cfg) { cfg.stage = pan_shader_stage(&state->info); +#if PAN_ARCH < 12 cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF; +#endif cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; @@ -4045,6 +4120,7 @@ prepare_shader(struct panfrost_compiled_shader *state, cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); } +#if PAN_ARCH < 12 /* v12+ never has a secondary variant (nr_variants is at most 2 there), so the rest is compiled out */ if (!secondary_enable) return; @@ -4059,6 +4135,7 @@ prepare_shader(struct panfrost_compiled_shader *state, cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); } #endif +#endif } static void diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.h b/src/gallium/drivers/panfrost/pan_cmdstream.h index 0519d05da48..2a7f24fc9e1 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.h +++ b/src/gallium/drivers/panfrost/pan_cmdstream.h @@ -256,11 +256,13 @@ panfrost_get_position_shader(struct panfrost_batch *batch, return vs_ptr; } +#if PAN_ARCH < 12 /* returns the address two SHADER_PROGRAM descriptors past batch->rsd[PIPE_SHADER_VERTEX]; compiled out on v12+ */ static inline uint64_t panfrost_get_varying_shader(struct panfrost_batch *batch) { return batch->rsd[PIPE_SHADER_VERTEX] + (2 * pan_size(SHADER_PROGRAM)); } +#endif static inline unsigned panfrost_vertex_attribute_stride(struct panfrost_compiled_shader *vs, diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 380619ee5d9..a46ac6ea219 100644 ---
a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -139,6 +139,13 @@ csf_oom_handler_init(struct panfrost_context *ctx) struct cs_index completed_bottom = cs_reg64(&b, 54); struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4); + /* Ensure that the OTHER endpoint is valid: v11+ sets the SB_SEL_OTHER state to 0, older versions call cs_set_scoreboard_entry(0, 0) */ +#if PAN_ARCH >= 11 + cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_OTHER, 0); +#else + cs_set_scoreboard_entry(&b, 0, 0); +#endif + /* Use different framebuffer descriptor depending on whether incremental * rendering has already been triggered */ cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter)); @@ -161,7 +168,7 @@ csf_oom_handler_init(struct panfrost_context *ctx) cs_wait_slot(&b, 0, false); /* Run the fragment job and wait */ - cs_set_scoreboard_entry(&b, 3, 0); + cs_select_sb_entries_for_async_ops(&b, 3); cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); cs_wait_slot(&b, 3, false); @@ -191,7 +198,7 @@ csf_oom_handler_init(struct panfrost_context *ctx) cs_wait_slot(&b, 0, false); - cs_set_scoreboard_entry(&b, 2, 0); + cs_select_sb_entries_for_async_ops(&b, 2); } assert(cs_is_valid(&b)); @@ -274,7 +281,7 @@ GENX(csf_init_batch)(struct panfrost_batch *batch) /* Set up entries */ struct cs_builder *b = batch->csf.cs.builder; - cs_set_scoreboard_entry(b, 2, 0); + cs_select_sb_entries_for_async_ops(b, 2); batch->framebuffer = alloc_fbd(batch); if (!batch->framebuffer.gpu) @@ -698,6 +705,10 @@ csf_emit_tiler_desc(struct panfrost_batch *batch, const struct pan_fb_info *fb) ; tiler.hierarchy_mask &= ~BITFIELD_MASK(disable_hierarchies); +#if PAN_ARCH >= 12 /* v12+: the tiler descriptor also receives the framebuffer tile size */ + tiler.effective_tile_size = fb->tile_size; +#endif + tiler.fb_width = batch->key.width; tiler.fb_height = batch->key.height; tiler.heap = batch->ctx->csf.heap.desc_bo->ptr.gpu; @@ -873,7 +884,12 @@ csf_emit_shader_regs(struct panfrost_batch *batch, enum pipe_shader_type stage, assert(stage == PIPE_SHADER_VERTEX || stage == PIPE_SHADER_FRAGMENT || stage ==
PIPE_SHADER_COMPUTE); +#if PAN_ARCH >= 12 /* v12+: the fragment stage uses offset 2 instead of 4 */ + unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 2 : 0; +#else unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 4 : 0; +#endif + unsigned fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2); struct cs_builder *b = batch->csf.cs.builder; @@ -1092,7 +1108,7 @@ csf_emit_draw_state(struct panfrost_batch *batch, } csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX, - panfrost_get_position_shader(batch, info)); + panfrost_get_position_shader(batch, info)); if (fs_required) { csf_emit_shader_regs(batch, PIPE_SHADER_FRAGMENT, @@ -1103,12 +1119,18 @@ csf_emit_draw_state(struct panfrost_batch *batch, cs_move64_to(b, cs_sr_reg64(b, IDVS, FRAGMENT_SPD), 0); } +#if PAN_ARCH >= 12 /* v12+: the vertex and fragment TSD registers both point at batch->tls.gpu; the varying SPD and TSD_0 are only written on older versions */ + cs_move64_to(b, cs_reg64(b, MALI_IDVS_SR_VERTEX_TSD), batch->tls.gpu); + cs_move64_to(b, cs_reg64(b, MALI_IDVS_SR_FRAGMENT_TSD), batch->tls.gpu); +#else if (secondary_shader) { cs_move64_to(b, cs_sr_reg64(b, IDVS, VERTEX_VARY_SPD), panfrost_get_varying_shader(batch)); } cs_move64_to(b, cs_sr_reg64(b, IDVS, TSD_0), batch->tls.gpu); +#endif + cs_move32_to(b, cs_sr_reg32(b, IDVS, GLOBAL_ATTRIBUTE_OFFSET), 0); cs_move32_to(b, cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), 0); cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD2), 0); @@ -1120,10 +1142,16 @@ csf_emit_draw_state(struct panfrost_batch *batch, uint64_t *sbd = (uint64_t *)&batch->scissor[0]; cs_move64_to(b, cs_sr_reg64(b, IDVS, SCISSOR_BOX), *sbd); +#if PAN_ARCH >= 12 /* v12+: write the packed viewport as two 64-bit words instead of the LOW/HIGH_DEPTH_CLAMP registers. NOTE(review): word 0 goes to VIEWPORT_HIGH and word 1 to VIEWPORT_LOW; confirm against the register layout */ + uint64_t *avalon_viewport = (uint64_t *)batch->avalon_viewport; + cs_move64_to(b, cs_sr_reg64(b, IDVS, VIEWPORT_HIGH), avalon_viewport[0]); + cs_move64_to(b, cs_sr_reg64(b, IDVS, VIEWPORT_LOW), avalon_viewport[1]); +#else cs_move32_to(b, cs_sr_reg32(b, IDVS, LOW_DEPTH_CLAMP), fui(batch->minimum_z)); cs_move32_to(b, cs_sr_reg32(b, IDVS, HIGH_DEPTH_CLAMP), fui(batch->maximum_z)); +#endif if (ctx->occlusion_query && ctx->active_queries) { struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc); @@ -1336,8 +1364,13 @@
GENX(csf_launch_draw)(struct panfrost_batch *batch, cs_move32_to(b, cs_sr_reg32(b, IDVS, INDEX_BUFFER_SIZE), 0); } +#if PAN_ARCH >= 12 /* v12+: cs_run_idvs2() is used instead of cs_run_idvs() and takes no cs_shader_res_sel() arguments */ + cs_run_idvs2(b, flags_override, false, true, drawid, + MALI_IDVS_SHADING_MODE_EARLY); +#else cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), drawid); +#endif } void @@ -1378,8 +1411,13 @@ GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch, } cs_wait_slot(b, 0, false); +#if PAN_ARCH >= 12 /* v12+: cs_run_idvs2() is used instead of cs_run_idvs() and takes no cs_shader_res_sel() arguments */ + cs_run_idvs2(b, flags_override, false, true, drawid, + MALI_IDVS_SHADING_MODE_EARLY); +#else cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), drawid); +#endif cs_add64(b, address, address, indirect->stride); cs_add32(b, counter, counter, (unsigned int)-1); diff --git a/src/gallium/drivers/panfrost/pan_fb_preload.c b/src/gallium/drivers/panfrost/pan_fb_preload.c index 391194a0f79..be43bd2621d 100644 --- a/src/gallium/drivers/panfrost/pan_fb_preload.c +++ b/src/gallium/drivers/panfrost/pan_fb_preload.c @@ -1204,11 +1204,18 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool, cfg.flags_1.sample_mask = 0xFFFF; cfg.flags_0.multisample_enable = ms; cfg.flags_0.evaluate_per_sample = ms; - cfg.maximum_z = 1.0; cfg.flags_0.clean_fragment_write = clean_fragment_write; + +#if PAN_ARCH >= 12 /* v12+: fragment_resources, fragment_shader and thread_storage replace the cfg.shader fields, and maximum_z is only set on older versions */ + cfg.fragment_resources = T.gpu | nr_tables; + cfg.fragment_shader = spd.gpu; + cfg.thread_storage = tsd; +#else + cfg.maximum_z = 1.0; cfg.shader.resources = T.gpu | nr_tables; cfg.shader.shader = spd.gpu; cfg.shader.thread_storage = tsd; +#endif } #endif } diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index a8b1bc02f52..8ab9bca2147 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -151,6 +151,9 @@ struct panfrost_batch { unsigned scissor[2]; float minimum_z, maximum_z; + /* Avalon: struct mali_viewport_packed, filled by panfrost_emit_viewport() when PAN_ARCH >= 12 */ +
unsigned avalon_viewport[4]; + /* Used on Valhall only. Midgard includes attributes in-band with * attributes, wildly enough. */ diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index 45582cf23c9..131884a7218 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -939,6 +939,9 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config, case 10: panfrost_cmdstream_screen_init_v10(screen); break; + case 12: + panfrost_cmdstream_screen_init_v12(screen); + break; default: debug_printf("panfrost: Unhandled architecture major %d", dev->arch); panfrost_destroy_screen(&(screen->base)); diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index e8fe9b17f3d..2e57c598a9c 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -160,6 +160,7 @@ void panfrost_cmdstream_screen_init_v6(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v7(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen); +void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen); #define perf_debug(ctx, ...) \ do { \