From fa98cf6af0398490f149a2f934aa7dcebfcd440b Mon Sep 17 00:00:00 2001 From: Olivia Lee Date: Mon, 24 Mar 2025 16:23:20 -0700 Subject: [PATCH] panvk/csf: fix case where vk_meta is used before PROVOKING_VERTEX_MODE_LAST In this case, we need to emit the FBDs and TDs for the meta command before we know what provoking vertex mode the application is going to use. To handle this, we make a guess for which provoking vertex mode we need. Then we use cs_maybe to leave space to flip the provoking vertex bit if the guess was wrong. This case is still unhandled on JM. Fixes: 7a9f14d3c2b ("panvk: advertise VK_EXT_provoking_vertex") Signed-off-by: Olivia Lee Tested-by: Mary Guillemard Reviewed-by: Mary Guillemard Reviewed-by: Ryan Mckeever (cherry picked from commit 885805560f97c0b777d47db550ba51234b603a50) Part-of: --- .pick_status.json | 2 +- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 247 +++++++++++++++++++- src/panfrost/vulkan/panvk_cmd_draw.h | 21 ++ src/panfrost/vulkan/panvk_device.h | 3 + src/panfrost/vulkan/panvk_vX_cmd_draw.c | 4 + src/panfrost/vulkan/panvk_vX_device.c | 16 +- 6 files changed, 279 insertions(+), 14 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index be3bd2cad97..ab68329e4c4 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -4404,7 +4404,7 @@ "description": "panvk/csf: fix case where vk_meta is used before PROVOKING_VERTEX_MODE_LAST", "nominated": true, "nomination_type": 2, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "7a9f14d3c2b1470703fb68216c40fc0572f94815", "notes": null diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 8ea5de33b27..0c006fb970d 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -13,6 +13,8 @@ #include #include "genxml/gen_macros.h" +#include "drm-uapi/panthor_drm.h" + #include "panvk_buffer.h" #include "panvk_cmd_alloc.h" #include "panvk_cmd_buffer.h" @@ -43,6 +45,153 @@ #include "vk_pipeline_layout.h" #include "vk_render_pass.h" +static enum cs_reg_perm +provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg) +{ + return CS_REG_RW; +} + +#define PROVOKING_VERTEX_FN_MAX_SIZE 512 + +static size_t +generate_fn_set_fbds_provoking_vertex(struct panvk_device *dev, + struct cs_buffer fn_mem, bool has_zs_ext, + uint32_t rt_count, + uint32_t *dump_region_size) +{ + const struct drm_panthor_csif_info *csif_info = + panthor_kmod_get_csif_props(dev->kmod.dev); + + struct cs_builder b; + struct cs_builder_conf conf = { + .nr_registers = csif_info->cs_reg_count, + .nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4), + .reg_perm = provoking_vertex_fn_reg_perm_cb, + }; + cs_builder_init(&b, &conf, fn_mem); + + struct cs_function function; + struct cs_function_ctx function_ctx = { + .ctx_reg = cs_subqueue_ctx_reg(&b), + .dump_addr_offset = + offsetof(struct panvk_cs_subqueue_context, reg_dump_addr), + }; + + cs_function_def(&b, &function, function_ctx) { + uint32_t fbd_sz = get_fbd_size(has_zs_ext, rt_count); + + /* argument passed in by the caller */ + struct cs_index fbd_count = cs_scratch_reg32(&b, 0); + + /* normal scratch regs */ + struct cs_index scratch_reg = cs_scratch_reg32(&b, 1); + struct cs_index fbd_addr = cs_scratch_reg64(&b, 2); + + cs_add64(&b, fbd_addr, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), 0); + + cs_while(&b, MALI_CS_CONDITION_GREATER, fbd_count) { + /* provoking_vertex flag is bit 14 of word 11 */ + unsigned offset = 11 * 4; + cs_load32_to(&b, scratch_reg, fbd_addr, offset); + cs_wait_slot(&b, SB_ID(LS), false); + cs_add32(&b, scratch_reg, scratch_reg, -(1 << 14)); + cs_store32(&b, scratch_reg, fbd_addr, offset); + cs_wait_slot(&b, SB_ID(LS), false); + + cs_add32(&b, fbd_count, fbd_count, -1); + cs_add64(&b, fbd_addr, fbd_addr, fbd_sz); + } + } + + assert(cs_is_valid(&b)); + cs_finish(&b); + + *dump_region_size = function.dump_size; + + return function.length * sizeof(uint64_t); +} + +static uint32_t +get_fn_set_fbds_provoking_vertex_idx(bool has_zs_ext, uint32_t rt_count) +{ + assert(rt_count >= 1 && rt_count <= MAX_RTS); + uint32_t idx = has_zs_ext * MAX_RTS + (rt_count - 1); + assert(idx < 2 * MAX_RTS); + return idx; +} + +static uint32_t +calc_fn_set_fbds_provoking_vertex_idx(struct panvk_cmd_buffer *cmdbuf) +{ + const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info; + bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s; + uint32_t rt_count = MAX2(fb->rt_count, 1); + + return get_fn_set_fbds_provoking_vertex_idx(has_zs_ext, rt_count); +} + +VkResult +panvk_per_arch(device_draw_context_init)(struct panvk_device *dev) +{ + dev->draw_ctx = vk_alloc(&dev->vk.alloc, + sizeof(struct panvk_device_draw_context), + _Alignof(struct panvk_device_draw_context), + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (dev->draw_ctx == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + VkResult result = panvk_priv_bo_create( + dev, PROVOKING_VERTEX_FN_MAX_SIZE * 2 * MAX_RTS, 0, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &dev->draw_ctx->fns_bo); + if (result != VK_SUCCESS) + goto free_draw_ctx; + + for (uint32_t has_zs_ext = 0; has_zs_ext <= 1; has_zs_ext++) { + for (uint32_t rt_count = 1; rt_count <= MAX_RTS; rt_count++) { + uint32_t idx = + get_fn_set_fbds_provoking_vertex_idx(has_zs_ext, rt_count); + /* Check that we have calculated a fn_stride if we need it to offset + * addresses. */ + assert(idx == 0 || + dev->draw_ctx->fn_set_fbds_provoking_vertex_stride != 0); + size_t offset = + idx * dev->draw_ctx->fn_set_fbds_provoking_vertex_stride; + + struct cs_buffer fn_mem = { + .cpu = dev->draw_ctx->fns_bo->addr.host + offset, + .gpu = dev->draw_ctx->fns_bo->addr.dev + offset, + .capacity = PROVOKING_VERTEX_FN_MAX_SIZE / sizeof(uint64_t), + }; + + uint32_t dump_region_size; + size_t fn_length = + generate_fn_set_fbds_provoking_vertex(dev, fn_mem, has_zs_ext, + rt_count, &dump_region_size); + + /* All functions must have the same length */ + assert(idx == 0 || + fn_length == dev->draw_ctx->fn_set_fbds_provoking_vertex_stride); + dev->draw_ctx->fn_set_fbds_provoking_vertex_stride = fn_length; + dev->dump_region_size[PANVK_SUBQUEUE_VERTEX_TILER] = + MAX2(dev->dump_region_size[PANVK_SUBQUEUE_VERTEX_TILER], + dump_region_size); + } + } + + return VK_SUCCESS; + +free_draw_ctx: + vk_free(&dev->vk.alloc, dev->draw_ctx); + return result; +} + +void +panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev) +{ + panvk_priv_bo_unref(dev->draw_ctx->fns_bo); + vk_free(&dev->vk.alloc, dev->draw_ctx); +} + static void emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf, uint32_t attrib_idx, uint32_t vb_desc_offset, @@ -802,6 +951,8 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size, cs_wait_slot(b, SB_ID(LS), false); } +static bool get_first_provoking_vertex(struct panvk_cmd_buffer *cmdbuf); + static VkResult get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) { @@ -851,8 +1002,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples); - cfg.first_provoking_vertex = - cmdbuf->state.gfx.render.first_provoking_vertex != U_TRISTATE_NO; + cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf); /* This will be overloaded. */ cfg.layer_count = 1; @@ -902,6 +1052,15 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) BITFIELD_MASK(4), offsetof(struct panvk_cs_subqueue_context, render.tiler_heap)); + /* If we don't know what provoking vertex mode the application wants yet, + * leave space to patch it later */ + if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) { + cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_tds_provoking_vertex) + /* provoking_vertex flag is bit 18 of word 2 */ + cs_add32(b, cs_scratch_reg32(b, 2), cs_scratch_reg32(b, 2), + -(1 << 18)); + } + /* Fill extra fields with zeroes so we can reset the completed * top/bottom and private states. */ cs_move64_to(b, cs_scratch_reg64(b, 10), 0); @@ -1150,8 +1309,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) fbinfo->sample_positions = dev->sample_positions->addr.dev + panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples)); - fbinfo->first_provoking_vertex = - cmdbuf->state.gfx.render.first_provoking_vertex != U_TRISTATE_NO; + fbinfo->first_provoking_vertex = get_first_provoking_vertex(cmdbuf); VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo); if (result != VK_SUCCESS) @@ -1192,6 +1350,9 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + bool unset_provoking_vertex = + cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET; + if (copy_fbds) { struct cs_index cur_tiler = cs_reg64(b, 38); struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER); @@ -1232,6 +1393,15 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), pass_src_fbd_ptr, BITFIELD_MASK(14), fbd_off); cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0); + + /* If we don't know what provoking vertex mode the + * application wants yet, leave space to patch it later. */ + if (unset_provoking_vertex) { + /* provoking_vertex flag is bit 14 of word 11 */ + struct cs_index word = cs_scratch_reg32(b, 11); + cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex) + cs_add32(b, word, word, -(1 << 14)); + } } else { cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off); @@ -1281,11 +1451,52 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) fbds.gpu | fbd_flags); cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler); } + + /* If we don't know what provoking vertex mode the application wants yet, + * leave space to patch it later */ + if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) { + uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) * + (1 + PANVK_IR_PASS_COUNT); + /* passed to fn_set_fbds_provoking_vertex */ + struct cs_index fbd_count_reg = cs_scratch_reg32(b, 0); + cs_move32_to(b, fbd_count_reg, fbd_count); + + struct cs_index length_reg = cs_scratch_reg32(b, 1); + struct cs_index addr_reg = cs_scratch_reg64(b, 2); + uint32_t fn_idx = calc_fn_set_fbds_provoking_vertex_idx(cmdbuf); + uint32_t fn_stride = + dev->draw_ctx->fn_set_fbds_provoking_vertex_stride; + uint32_t fn_addr = + dev->draw_ctx->fns_bo->addr.dev + fn_idx * fn_stride; + cs_move64_to(b, addr_reg, fn_addr); + cs_move32_to(b, length_reg, fn_stride); + + cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex) + cs_call(b, addr_reg, length_reg); + } } return VK_SUCCESS; } +static bool +get_first_provoking_vertex(struct panvk_cmd_buffer *cmdbuf) +{ + switch (cmdbuf->state.gfx.render.first_provoking_vertex) { + case U_TRISTATE_NO: + return false; + case U_TRISTATE_YES: + return true; + /* If we don't know the provoking vertex mode yet, guess that it will + * be PROVOKING_VERTEX_MODE_FIRST. This is the vulkan default, and so + * likely to be right more often. */ + case U_TRISTATE_UNSET: + return true; + default: + unreachable("Invalid u_tristate"); + } +} + static void set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf, enum u_tristate first_provoking_vertex) @@ -1301,14 +1512,24 @@ set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf, state->render.first_provoking_vertex = first_provoking_vertex; } - /* Once we emit the first FBDs/TDs, we need to commit to a state. If we - * choose the wrong one, we will fail the assert when the next application - * draw happens (with a different state). Use PROVOKING_VERTEX_MODE_FIRST - * because it's the vulkan default, and so likely to be right more often. - * - * TODO: handle this case better */ - if (state->render.first_provoking_vertex == U_TRISTATE_UNSET) - state->render.first_provoking_vertex = U_TRISTATE_YES; + /* If the application uses PROVOKING_VERTEX_MODE_LAST after we previously + * emitted FBDs/TDs with the wrong mode set, patch the CS to flip the + * provoking vertex mode bits. */ + if (state->render.first_provoking_vertex == U_TRISTATE_NO && + state->render.maybe_set_tds_provoking_vertex) + { + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + cs_patch_maybe(b, state->render.maybe_set_tds_provoking_vertex); + state->render.maybe_set_tds_provoking_vertex = NULL; + } + if (state->render.first_provoking_vertex == U_TRISTATE_NO && + state->render.maybe_set_fbds_provoking_vertex) { + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + cs_patch_maybe(b, state->render.maybe_set_fbds_provoking_vertex); + state->render.maybe_set_fbds_provoking_vertex = NULL; + } } static VkResult @@ -2414,6 +2635,8 @@ panvk_per_arch(cmd_inherit_render_state)( struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; cmdbuf->state.gfx.render.first_provoking_vertex = U_TRISTATE_UNSET; + cmdbuf->state.gfx.render.maybe_set_tds_provoking_vertex = NULL; + cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex = NULL; cmdbuf->state.gfx.render.suspended = false; cmdbuf->state.gfx.render.flags = inheritance_info->flags; diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 138be5494d8..8f0062156b8 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -83,6 +83,11 @@ struct panvk_rendering_state { /* True if the last render pass was suspended. */ bool suspended; + /* Blocks that can patch to flip the provoking vertex mode if we need to + * emit FBDs/TDs before we know which mode the application is using */ + struct cs_maybe *maybe_set_tds_provoking_vertex; + struct cs_maybe *maybe_set_fbds_provoking_vertex; + struct { /* != 0 if the render pass contains one or more occlusion queries to * signal. */ @@ -208,6 +213,13 @@ struct panvk_cmd_graphics_state { } \ } while (0) +#if PAN_ARCH >= 10 +struct panvk_device_draw_context { + struct panvk_priv_bo *fns_bo; + uint64_t fn_set_fbds_provoking_vertex_stride; +}; +#endif + static inline uint32_t panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev, const struct panvk_cmd_graphics_state *state, @@ -314,6 +326,15 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state, gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \ } while (0) + +#if PAN_ARCH >= 10 +VkResult +panvk_per_arch(device_draw_context_init)(struct panvk_device *dev); + +void +panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev); +#endif + void panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf, const VkRenderingInfo *pRenderingInfo); diff --git a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h index cda025c2a91..267309eaa30 100644 --- a/src/panfrost/vulkan/panvk_device.h +++ b/src/panfrost/vulkan/panvk_device.h @@ -30,6 +30,7 @@ #define PANVK_MAX_QUEUE_FAMILIES 1 struct panvk_precomp_cache; +struct panvk_device_draw_context; struct panvk_device { struct vk_device vk; @@ -79,6 +80,8 @@ struct panvk_device { #endif } utrace; + struct panvk_device_draw_context* draw_ctx; + struct { struct pandecode_context *decode_ctx; } debug; diff --git a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c index 397a48ab321..378a081c65e 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c @@ -221,6 +221,10 @@ panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf, #endif state->render.first_provoking_vertex = U_TRISTATE_UNSET; +#if PAN_ARCH >= 10 + state->render.maybe_set_tds_provoking_vertex = NULL; + state->render.maybe_set_fbds_provoking_vertex = NULL; +#endif memset(state->render.fb.crc_valid, 0, sizeof(state->render.fb.crc_valid)); memset(&state->render.color_attachments, 0, sizeof(state->render.color_attachments)); diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c index a7268794639..9158e152b31 100644 --- a/src/panfrost/vulkan/panvk_vX_device.c +++ b/src/panfrost/vulkan/panvk_vX_device.c @@ -17,6 +17,7 @@ #include "panvk_cmd_alloc.h" #include "panvk_cmd_buffer.h" #include "panvk_device.h" +#include "panvk_cmd_draw.h" #include "panvk_entrypoints.h" #include "panvk_instance.h" #include "panvk_macros.h" @@ -373,9 +374,15 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device, if (result != VK_SUCCESS) goto err_free_priv_bos; - result = panvk_meta_init(device); +#if PAN_ARCH >= 10 + result = panvk_per_arch(device_draw_context_init)(device); if (result != VK_SUCCESS) goto err_free_precomp; +#endif + + result = panvk_meta_init(device); + if (result != VK_SUCCESS) + goto err_free_draw_ctx; for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { const VkDeviceQueueCreateInfo *queue_create = @@ -425,7 +432,11 @@ err_finish_queues: panvk_meta_cleanup(device); +err_free_draw_ctx: +#if PAN_ARCH >= 10 + panvk_per_arch(device_draw_context_cleanup)(device); err_free_precomp: +#endif panvk_precomp_cleanup(device); err_free_priv_bos: if (device->printf.bo) @@ -468,6 +479,9 @@ panvk_per_arch(destroy_device)(struct panvk_device *device, } panvk_precomp_cleanup(device); +#if PAN_ARCH >= 10 + panvk_per_arch(device_draw_context_cleanup)(device); +#endif panvk_meta_cleanup(device); u_printf_destroy(&device->printf.ctx); panvk_priv_bo_unref(device->printf.bo);