diff --git a/src/panfrost/.clang-format b/src/panfrost/.clang-format index dafd1889248..4ca86fc5831 100644 --- a/src/panfrost/.clang-format +++ b/src/panfrost/.clang-format @@ -10,5 +10,10 @@ ForEachMacros: [ 'cs_case', 'cs_default', 'cs_match', + 'cs_update_compute_ctx', + 'cs_update_frag_ctx', + 'cs_update_progress_seqno', + 'cs_update_vt_ctx', 'cs_while', + 'panvk_cs_reg_upd_ctx', ] diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h new file mode 100644 index 00000000000..601198394d2 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -0,0 +1,460 @@ +/* + * Copyright © 2024 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +#ifndef PANVK_CMD_BUFFER_H +#define PANVK_CMD_BUFFER_H + +#ifndef PAN_ARCH +#error "PAN_ARCH must be defined" +#endif + +#include + +#include "genxml/cs_builder.h" + +#include "panvk_cmd_desc_state.h" +#include "panvk_cmd_push_constant.h" +#include "panvk_queue.h" + +#include "vk_command_buffer.h" + +#include "util/list.h" + +#define MAX_VBS 16 +#define MAX_RTS 8 + +struct panvk_cs_sync32 { + uint32_t seqno; + uint32_t error; +}; + +struct panvk_cs_sync64 { + uint64_t seqno; + uint32_t error; + uint32_t pad; +}; + +struct panvk_cs_desc_ringbuf { + uint64_t syncobj; + uint64_t ptr; + uint32_t pos; + uint32_t pad; +}; + +/* 512k of render descriptors that can be used when + * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT is set on the command buffer. */ +#define RENDER_DESC_RINGBUF_SIZE (512 * 1024) + +struct panvk_cs_subqueue_context { + uint64_t syncobjs; + uint32_t iter_sb; + uint32_t pad; + struct { + struct panvk_cs_desc_ringbuf desc_ringbuf; + uint64_t tiler_heap; + uint64_t geom_buf; + } render; + uint64_t debug_syncobjs; +} __attribute__((aligned(64))); + +struct panvk_cache_flush_info { + enum mali_cs_flush_mode l2; + enum mali_cs_flush_mode lsc; + bool others; +}; + +struct panvk_cs_deps { + bool needs_draw_flush; + struct { + uint32_t wait_sb_mask; + struct panvk_cache_flush_info cache_flush; + } src[PANVK_SUBQUEUE_COUNT]; + + struct { + uint32_t wait_subqueue_mask; + } dst[PANVK_SUBQUEUE_COUNT]; +}; + +enum panvk_sb_ids { + PANVK_SB_LS = 0, + PANVK_SB_IMM_FLUSH = 0, + PANVK_SB_DEFERRED_SYNC = 1, + PANVK_SB_DEFERRED_FLUSH = 2, + PANVK_SB_ITER_START = 3, + PANVK_SB_ITER_COUNT = 5, +}; + +#define SB_IMM_MASK 0 +#define SB_MASK(nm) BITFIELD_BIT(PANVK_SB_##nm) +#define SB_ID(nm) PANVK_SB_##nm +#define SB_ITER(x) (PANVK_SB_ITER_START + (x)) +#define SB_WAIT_ITER(x) BITFIELD_BIT(PANVK_SB_ITER_START + (x)) +#define SB_ALL_MASK BITFIELD_MASK(8) + +static inline uint32_t +next_iter_sb(uint32_t sb) +{ + return sb + 1 < PANVK_SB_ITER_COUNT ? sb + 1 : 0; +} + +enum panvk_cs_regs { + /* RUN_IDVS staging regs. */ + PANVK_CS_REG_RUN_IDVS_SR_START = 0, + PANVK_CS_REG_RUN_IDVS_SR_END = 60, + + /* RUN_FRAGMENT staging regs. */ + PANVK_CS_REG_RUN_FRAGMENT_SR_START = 40, + PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46, + + /* RUN_COMPUTE staging regs. */ + PANVK_CS_REG_RUN_COMPUTE_SR_START = 0, + PANVK_CS_REG_RUN_COMPUTE_SR_END = 39, + + /* Range of registers that can be used to store temporary data on + * all queues. Note that some queues have extra space they can use + * as scratch space.*/ + PANVK_CS_REG_SCRATCH_START = 66, + PANVK_CS_REG_SCRATCH_END = 83, + + /* Driver context. 
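+    *
+    * The two ranges below hold one 64-bit progress seqno per subqueue
+    * (PANVK_SUBQUEUE_COUNT = 3, so 3 x 2 = 6 registers) plus a 64-bit
+    * pointer to the panvk_cs_subqueue_context. Purely illustrative use
+    * of the helpers defined further down in this header (b being a
+    * struct cs_builder *):
+    *
+    *    struct cs_index ctx = cs_subqueue_ctx_reg(b);
+    *    struct cs_index frag_seqno =
+    *       cs_progress_seqno_reg(b, PANVK_SUBQUEUE_FRAGMENT);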
*/ + PANVK_CS_REG_PROGRESS_SEQNO_START = 84, + PANVK_CS_REG_PROGRESS_SEQNO_END = 89, + PANVK_CS_REG_SUBQUEUE_CTX_START = 90, + PANVK_CS_REG_SUBQUEUE_CTX_END = 91, +}; + +static inline struct cs_index +cs_scratch_reg_tuple(struct cs_builder *b, unsigned start, unsigned count) +{ + assert(PANVK_CS_REG_SCRATCH_START + start + count - 1 <= + PANVK_CS_REG_SCRATCH_END); + return cs_reg_tuple(b, PANVK_CS_REG_SCRATCH_START + start, count); +} + +static inline struct cs_index +cs_scratch_reg32(struct cs_builder *b, unsigned reg) +{ + return cs_scratch_reg_tuple(b, reg, 1); +} + +static inline struct cs_index +cs_scratch_reg64(struct cs_builder *b, unsigned reg) +{ + assert(reg % 2 == 0); + return cs_scratch_reg_tuple(b, reg, 2); +} + +static inline struct cs_index +cs_sr_reg_tuple(struct cs_builder *b, unsigned start, unsigned count) +{ + assert(start + count - 1 < PANVK_CS_REG_SCRATCH_START); + return cs_reg_tuple(b, start, count); +} + +static inline struct cs_index +cs_sr_reg32(struct cs_builder *b, unsigned reg) +{ + return cs_sr_reg_tuple(b, reg, 1); +} + +static inline struct cs_index +cs_sr_reg64(struct cs_builder *b, unsigned reg) +{ + assert(reg % 2 == 0); + return cs_sr_reg_tuple(b, reg, 2); +} + +static inline struct cs_index +cs_subqueue_ctx_reg(struct cs_builder *b) +{ + return cs_reg64(b, PANVK_CS_REG_SUBQUEUE_CTX_START); +} + +static inline struct cs_index +cs_progress_seqno_reg(struct cs_builder *b, enum panvk_subqueue_id subqueue) +{ + assert(PANVK_CS_REG_PROGRESS_SEQNO_START + (subqueue * 2) < + PANVK_CS_REG_PROGRESS_SEQNO_END); + return cs_reg64(b, PANVK_CS_REG_PROGRESS_SEQNO_START + (subqueue * 2)); +} + +struct panvk_cs_reg_upd_context { + reg_perm_cb_t reg_perm; + struct panvk_cs_reg_upd_context *next; +}; + +struct panvk_cs_state { + struct cs_builder builder; + + struct cs_load_store_tracker ls_tracker; + + /* Used to debug register writes in invalid contexts. */ + struct { + struct panvk_cs_reg_upd_context *upd_ctx_stack; + reg_perm_cb_t base_perm; + } reg_access; + + /* Sync point relative to the beginning of the command buffer. + * Needs to be offset with the subqueue sync point. */ + int32_t relative_sync_point; +}; + +static inline struct panvk_cs_reg_upd_context * +panvk_cs_reg_ctx_push(struct cs_builder *b, + struct panvk_cs_reg_upd_context *ctx, + reg_perm_cb_t reg_perm) +{ + struct panvk_cs_state *cs_state = + container_of(b, struct panvk_cs_state, builder); + + ctx->reg_perm = reg_perm; + ctx->next = cs_state->reg_access.upd_ctx_stack; + cs_state->reg_access.upd_ctx_stack = ctx; + return ctx; +} + +static inline void +panvk_cs_reg_ctx_pop(struct cs_builder *b, struct panvk_cs_reg_upd_context *ctx) +{ + struct panvk_cs_state *cs_state = + container_of(b, struct panvk_cs_state, builder); + + assert(cs_state->reg_access.upd_ctx_stack == ctx); + + cs_state->reg_access.upd_ctx_stack = ctx->next; +} + +struct panvk_cs_reg_range { + unsigned start; + unsigned end; +}; + +#define PANVK_CS_REG_RANGE(__name) \ + { \ + .start = PANVK_CS_REG_##__name##_START, \ + .end = PANVK_CS_REG_##__name##_END, \ + } + +#define panvk_cs_reg_blacklist(__name, ...) 
\ + static inline enum cs_reg_perm panvk_cs_##__name##_reg_perm( \ + struct cs_builder *b, unsigned reg) \ + { \ + const struct panvk_cs_reg_range ranges[] = { \ + __VA_ARGS__, \ + }; \ + \ + for (unsigned i = 0; i < ARRAY_SIZE(ranges); i++) { \ + if (reg >= ranges[i].start && reg <= ranges[i].end) \ + return CS_REG_RD; \ + } \ + \ + return CS_REG_RW; \ + } + +panvk_cs_reg_blacklist(vt, PANVK_CS_REG_RANGE(RUN_IDVS_SR), + PANVK_CS_REG_RANGE(PROGRESS_SEQNO), + PANVK_CS_REG_RANGE(SUBQUEUE_CTX)); +panvk_cs_reg_blacklist(frag, PANVK_CS_REG_RANGE(RUN_FRAGMENT_SR), + PANVK_CS_REG_RANGE(PROGRESS_SEQNO), + PANVK_CS_REG_RANGE(SUBQUEUE_CTX)); +panvk_cs_reg_blacklist(compute, PANVK_CS_REG_RANGE(RUN_COMPUTE_SR), + PANVK_CS_REG_RANGE(PROGRESS_SEQNO), + PANVK_CS_REG_RANGE(SUBQUEUE_CTX)); + +#define panvk_cs_reg_whitelist(__name, ...) \ + static inline enum cs_reg_perm panvk_cs_##__name##_reg_perm( \ + struct cs_builder *b, unsigned reg) \ + { \ + const struct panvk_cs_reg_range ranges[] = { \ + __VA_ARGS__, \ + }; \ + \ + for (unsigned i = 0; i < ARRAY_SIZE(ranges); i++) { \ + if (reg >= ranges[i].start && reg <= ranges[i].end) \ + return CS_REG_RW; \ + } \ + \ + return CS_REG_RD; \ + } + +#define panvk_cs_reg_upd_ctx(__b, __name) \ + for (struct panvk_cs_reg_upd_context __reg_upd_ctx, \ + *reg_upd_ctxp = panvk_cs_reg_ctx_push(__b, &__reg_upd_ctx, \ + panvk_cs_##__name##_reg_perm); \ + reg_upd_ctxp; \ + panvk_cs_reg_ctx_pop(__b, &__reg_upd_ctx), reg_upd_ctxp = NULL) + +panvk_cs_reg_whitelist(progress_seqno, PANVK_CS_REG_RANGE(PROGRESS_SEQNO)); +#define cs_update_progress_seqno(__b) panvk_cs_reg_upd_ctx(__b, progress_seqno) + +panvk_cs_reg_whitelist(compute_ctx, PANVK_CS_REG_RANGE(RUN_COMPUTE_SR)); +#define cs_update_compute_ctx(__b) panvk_cs_reg_upd_ctx(__b, compute_ctx) + +panvk_cs_reg_whitelist(frag_ctx, PANVK_CS_REG_RANGE(RUN_FRAGMENT_SR)); +#define cs_update_frag_ctx(__b) panvk_cs_reg_upd_ctx(__b, frag_ctx) + +panvk_cs_reg_whitelist(vt_ctx, PANVK_CS_REG_RANGE(RUN_IDVS_SR)); +#define cs_update_vt_ctx(__b) panvk_cs_reg_upd_ctx(__b, vt_ctx) + +struct panvk_tls_state { + struct panfrost_ptr desc; + struct pan_tls_info info; + unsigned max_wg_count; +}; + +struct panvk_cmd_compute_state { + struct panvk_descriptor_state desc_state; + const struct panvk_shader *shader; + struct panvk_compute_sysvals sysvals; + mali_ptr push_uniforms; + struct { + struct panvk_shader_desc_state desc; + } cs; +}; + +struct panvk_attrib_buf { + mali_ptr address; + unsigned size; +}; + +struct panvk_resolve_attachment { + VkResolveModeFlagBits mode; + struct panvk_image_view *src_iview; + struct panvk_image_view *dst_iview; +}; + +struct panvk_cmd_graphics_state { + struct panvk_descriptor_state desc_state; + + struct { + struct vk_vertex_input_state vi; + struct vk_sample_locations_state sl; + } dynamic; + + struct panvk_graphics_sysvals sysvals; + + struct panvk_shader_link link; + bool linked; + + struct { + const struct panvk_shader *shader; + struct panvk_shader_desc_state desc; + mali_ptr spd; + } fs; + + struct { + const struct panvk_shader *shader; + struct panvk_shader_desc_state desc; + struct { + mali_ptr pos, var; + } spds; + } vs; + + struct { + struct panvk_attrib_buf bufs[MAX_VBS]; + unsigned count; + bool dirty; + } vb; + + /* Index buffer */ + struct { + struct panvk_buffer *buffer; + uint64_t offset; + uint8_t index_size; + uint32_t first_vertex, base_vertex, base_instance; + bool dirty; + } ib; + + struct { + struct panvk_blend_info info; + } cb; + + struct { + VkRenderingFlags flags; + uint32_t layer_count; 
+ + enum vk_rp_attachment_flags bound_attachments; + struct { + VkFormat fmts[MAX_RTS]; + uint8_t samples[MAX_RTS]; + struct panvk_resolve_attachment resolve[MAX_RTS]; + } color_attachments; + + struct pan_image_view zs_pview; + + struct { + struct panvk_resolve_attachment resolve; + } z_attachment, s_attachment; + + struct { + struct pan_fb_info info; + bool crc_valid[MAX_RTS]; + } fb; + + struct panfrost_ptr fbds; + mali_ptr tiler; + bool dirty; + } render; + + mali_ptr push_uniforms; +}; + +struct panvk_cmd_buffer { + struct vk_command_buffer vk; + VkCommandBufferUsageFlags flags; + struct panvk_pool cs_pool; + struct panvk_pool desc_pool; + struct panvk_pool tls_pool; + struct list_head push_sets; + + uint32_t flush_id; + + struct { + struct panvk_cmd_graphics_state gfx; + struct panvk_cmd_compute_state compute; + struct panvk_push_constant_state push_constants; + struct panvk_cs_state cs[PANVK_SUBQUEUE_COUNT]; + struct panvk_tls_state tls; + } state; +}; + +VK_DEFINE_HANDLE_CASTS(panvk_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) + +static inline struct cs_builder * +panvk_get_cs_builder(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue) +{ + return &cmdbuf->state.cs[subqueue].builder; +} + +static inline struct panvk_descriptor_state * +panvk_cmd_get_desc_state(struct panvk_cmd_buffer *cmdbuf, + VkPipelineBindPoint bindpoint) +{ + switch (bindpoint) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + return &cmdbuf->state.gfx.desc_state; + + case VK_PIPELINE_BIND_POINT_COMPUTE: + return &cmdbuf->state.compute.desc_state; + + default: + assert(!"Unsupported bind point"); + return NULL; + } +} + +extern const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops); + +void panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf); + +void panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf, + enum panvk_subqueue_id subqueue); + +void panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf, + const VkDependencyInfo *in, + struct panvk_cs_deps *out); + +#endif /* PANVK_CMD_BUFFER_H */ diff --git a/src/panfrost/vulkan/csf/panvk_queue.h b/src/panfrost/vulkan/csf/panvk_queue.h new file mode 100644 index 00000000000..e5df67337c4 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_queue.h @@ -0,0 +1,72 @@ +/* + * Copyright © 2021 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#ifndef PANVK_QUEUE_H +#define PANVK_QUEUE_H + +#ifndef PAN_ARCH +#error "PAN_ARCH must be defined" +#endif + +#include "genxml/gen_macros.h" + +#include + +#include "panvk_device.h" + +#include "vk_queue.h" + +enum panvk_subqueue_id { + PANVK_SUBQUEUE_VERTEX_TILER = 0, + PANVK_SUBQUEUE_FRAGMENT, + PANVK_SUBQUEUE_COMPUTE, + PANVK_SUBQUEUE_COUNT, +}; + +struct panvk_tiler_heap { + uint32_t chunk_size; + struct panvk_priv_mem desc; + struct { + uint32_t handle; + mali_ptr dev_addr; + } context; +}; + +struct panvk_subqueue { + struct panvk_priv_mem context; +}; + +struct panvk_desc_ringbuf { + struct panvk_priv_mem syncobj; + struct pan_kmod_bo *bo; + struct { + uint64_t dev; + void *host; + } addr; +}; + +struct panvk_queue { + struct vk_queue vk; + + uint32_t group_handle; + uint32_t syncobj_handle; + + struct panvk_tiler_heap tiler_heap; + struct panvk_desc_ringbuf render_desc_ringbuf; + struct panvk_priv_mem syncobjs; + struct panvk_priv_mem debug_syncobjs; + + struct panvk_subqueue subqueues[PANVK_SUBQUEUE_COUNT]; +}; + +VK_DEFINE_HANDLE_CASTS(panvk_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) + +void panvk_per_arch(queue_finish)(struct panvk_queue *queue); + +VkResult panvk_per_arch(queue_init)(struct panvk_device *device, + struct panvk_queue *queue, int idx, + const VkDeviceQueueCreateInfo *create_info); + +#endif diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c new file mode 100644 index 00000000000..920e91172a4 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c @@ -0,0 +1,703 @@ +/* + * Copyright © 2021 Collabora Ltd. + * + * Derived from tu_cmd_buffer.c which is: + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "genxml/gen_macros.h" + +#include "panvk_buffer.h" +#include "panvk_cmd_alloc.h" +#include "panvk_cmd_buffer.h" +#include "panvk_cmd_desc_state.h" +#include "panvk_cmd_pool.h" +#include "panvk_cmd_push_constant.h" +#include "panvk_device.h" +#include "panvk_entrypoints.h" +#include "panvk_instance.h" +#include "panvk_physical_device.h" +#include "panvk_priv_bo.h" + +#include "pan_blitter.h" +#include "pan_desc.h" +#include "pan_encoder.h" +#include "pan_props.h" +#include "pan_samples.h" + +#include "vk_descriptor_update_template.h" +#include "vk_format.h" +#include "vk_synchronization.h" + +static void +emit_tls(struct panvk_cmd_buffer *cmdbuf) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_physical_device *phys_dev = + to_panvk_physical_device(dev->vk.physical); + unsigned core_id_range; + panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range); + + if (cmdbuf->state.tls.info.tls.size) { + unsigned thread_tls_alloc = + panfrost_query_thread_tls_alloc(&phys_dev->kmod.props); + unsigned size = panfrost_get_total_stack_size( + cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range); + + cmdbuf->state.tls.info.tls.ptr = + panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu; + } + + assert(!cmdbuf->state.tls.info.wls.size); + + if (cmdbuf->state.tls.desc.cpu) { + GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu); + } +} + +static void +finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_instance *instance = + to_panvk_instance(dev->vk.physical->instance); + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue); + + cs_update_progress_seqno(b) { + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + uint32_t rel_sync_point = cmdbuf->state.cs[i].relative_sync_point; + + if (!rel_sync_point) + continue; + + cs_add64(b, cs_progress_seqno_reg(b, i), cs_progress_seqno_reg(b, i), + rel_sync_point); + } + } + + /* If we're decoding the CS or dumping memory mappings, we need a flush + * to make sure all data have been pushed to memory. */ + if (instance->debug_flags & (PANVK_DEBUG_DUMP | PANVK_DEBUG_TRACE)) { + struct cs_index flush_id = cs_scratch_reg32(b, 0); + + cs_move32_to(b, flush_id, 0); + cs_wait_slots(b, SB_ALL_MASK, false); + cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, + false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); + cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + } + + /* If we're in sync/trace more, we signal the debug object. */ + if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) { + struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0); + struct cs_index one = cs_scratch_reg32(b, 2); + struct cs_index error = cs_scratch_reg32(b, 3); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 2); + + cs_move32_to(b, one, 1); + cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, debug_syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + cs_add64(b, debug_sync_addr, debug_sync_addr, + sizeof(struct panvk_cs_sync32) * subqueue); + cs_load32_to(b, error, debug_sync_addr, + offsetof(struct panvk_cs_sync32, error)); + cs_wait_slots(b, SB_ALL_MASK, false); + cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_SYSTEM, one, debug_sync_addr, + cs_now()); + + cs_match(b, error, cmp_scratch) { + cs_case(b, 0) { + /* Do nothing. 
*/ + } + + cs_default(b) { + /* Overwrite the sync error with the first error we encountered. */ + cs_store32(b, error, debug_sync_addr, + offsetof(struct panvk_cs_sync32, error)); + cs_wait_slots(b, SB_ID(LS), false); + } + } + } + + cs_finish(&cmdbuf->state.cs[subqueue].builder); +} + +VKAPI_ATTR VkResult VKAPI_CALL +panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + + emit_tls(cmdbuf); + + for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) { + struct cs_builder *b = &cmdbuf->state.cs[i].builder; + + if (!cs_is_valid(b)) { + vk_command_buffer_set_error(&cmdbuf->vk, + VK_ERROR_OUT_OF_DEVICE_MEMORY); + } else { + finish_cs(cmdbuf, i); + } + } + + return vk_command_buffer_end(&cmdbuf->vk); +} + +static bool +src_stages_need_draw_flush(VkPipelineStageFlags2 stages) +{ + static const VkPipelineStageFlags2 draw_flush_stage_mask = + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT; + + return (stages & draw_flush_stage_mask) != 0; +} + +static bool +stages_cover_subqueue(enum panvk_subqueue_id subqueue, + VkPipelineStageFlags2 stages) +{ + static const VkPipelineStageFlags2 queue_coverage[PANVK_SUBQUEUE_COUNT] = { + [PANVK_SUBQUEUE_VERTEX_TILER] = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT, + [PANVK_SUBQUEUE_FRAGMENT] = + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT, + [PANVK_SUBQUEUE_COMPUTE] = + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_COPY_BIT, + }; + + return (stages & queue_coverage[subqueue]) != 0; +} + +static uint32_t +src_stages_to_subqueue_sb_mask(enum panvk_subqueue_id subqueue, + VkPipelineStageFlags2 stages) +{ + if (!stages_cover_subqueue(subqueue, stages)) + return 0; + + /* Indirect draw buffers are read from the command stream, and load/store + * operations are synchronized with the LS scoreboad immediately after the + * read, so no need to wait in that case. + */ + if (subqueue == PANVK_SUBQUEUE_VERTEX_TILER && + stages == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT) + return 0; + + /* We need to wait for all previously submitted jobs, and given the + * iterator scoreboard is a moving target, we just wait for the + * whole dynamic scoreboard range. 
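+    * With PANVK_SB_ITER_START = 3 and PANVK_SB_ITER_COUNT = 5 this is
+    * BITFIELD_RANGE(3, 5), i.e. scoreboard slots 3-7 (mask 0xf8).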
*/ + return BITFIELD_RANGE(PANVK_SB_ITER_START, PANVK_SB_ITER_COUNT); +} + +static void +collect_cache_flush_info(enum panvk_subqueue_id subqueue, + struct panvk_cache_flush_info *cache_flush, + VkPipelineStageFlags2 src_stages, + VkPipelineStageFlags2 dst_stages, + VkAccessFlags2 src_access, VkAccessFlags2 dst_access) +{ + static const VkAccessFlags2 dev_writes[PANVK_SUBQUEUE_COUNT] = { + [PANVK_SUBQUEUE_VERTEX_TILER] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | + VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_TRANSFER_WRITE_BIT, + [PANVK_SUBQUEUE_FRAGMENT] = + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_2_TRANSFER_WRITE_BIT, + [PANVK_SUBQUEUE_COMPUTE] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | + VK_ACCESS_2_SHADER_WRITE_BIT | + VK_ACCESS_2_TRANSFER_WRITE_BIT, + }; + static const VkAccessFlags2 dev_reads[PANVK_SUBQUEUE_COUNT] = { + [PANVK_SUBQUEUE_VERTEX_TILER] = + VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_INDEX_READ_BIT | + VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_UNIFORM_READ_BIT | + VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_TRANSFER_READ_BIT | + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + [PANVK_SUBQUEUE_FRAGMENT] = + VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT | + VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + [PANVK_SUBQUEUE_COMPUTE] = + VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT | + VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + }; + + /* Note on the cache organization: + * - L2 cache is unified, so all changes to this cache are automatically + * visible to all GPU sub-components (shader cores, tiler, ...). This + * means we only need to flush when the host (AKA CPU) is involved. + * - LS caches (which are basically just read-write L1 caches) are coherent + * with each other and with the L2 cache, so again, we only need to flush + * when the host is involved. + * - Other read-only L1 caches (like the ones in front of the texture unit) + * are not coherent with the LS or L2 caches, and thus need to be + * invalidated any time a write happens. + */ + +#define ACCESS_HITS_RO_L1_CACHE \ + (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | \ + VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT | \ + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | \ + VK_ACCESS_2_TRANSFER_READ_BIT) + + if ((dev_writes[subqueue] & src_access) && + (dev_reads[subqueue] & ACCESS_HITS_RO_L1_CACHE & dst_access)) + cache_flush->others |= true; + + /* If the host wrote something, we need to clean/invalidate everything. */ + if ((src_stages & VK_PIPELINE_STAGE_2_HOST_BIT) && + (src_access & VK_ACCESS_2_HOST_WRITE_BIT) && + ((dev_reads[subqueue] | dev_writes[subqueue]) & dst_access)) { + cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE; + cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE; + cache_flush->others |= true; + } + + /* If the host needs to read something we wrote, we need to clean + * everything. 
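+    * A clean (without invalidate) is enough: the data only needs to
+    * reach main memory for the host to see it, and the GPU copies are
+    * not stale.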
*/ + if ((dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT) && + (dst_access & VK_ACCESS_2_HOST_READ_BIT) && + (dev_writes[subqueue] & src_access)) { + cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN; + cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN; + } +} + +static void +collect_cs_deps(struct panvk_cmd_buffer *cmdbuf, + VkPipelineStageFlags2 src_stages, + VkPipelineStageFlags2 dst_stages, VkAccessFlags src_access, + VkAccessFlags dst_access, struct panvk_cs_deps *deps) +{ + if (src_stages_need_draw_flush(src_stages) && cmdbuf->state.gfx.render.tiler) + deps->needs_draw_flush = true; + + uint32_t wait_subqueue_mask = 0; + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + uint32_t sb_mask = src_stages_to_subqueue_sb_mask(i, src_stages); + assert((sb_mask != 0) == stages_cover_subqueue(i, src_stages)); + if (!sb_mask) + continue; + + deps->src[i].wait_sb_mask |= sb_mask; + collect_cache_flush_info(i, &deps->src[i].cache_flush, src_stages, + dst_stages, src_access, dst_access); + wait_subqueue_mask |= BITFIELD_BIT(i); + } + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + if (!stages_cover_subqueue(i, dst_stages)) + continue; + + deps->dst[i].wait_subqueue_mask |= wait_subqueue_mask & ~BITFIELD_BIT(i); + } +} + +void +panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf, + const VkDependencyInfo *in, + struct panvk_cs_deps *out) +{ + memset(out, 0, sizeof(*out)); + + for (uint32_t i = 0; i < in->memoryBarrierCount; i++) { + const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i]; + VkPipelineStageFlags2 src_stages = + vk_expand_pipeline_stage_flags2(barrier->srcStageMask); + VkPipelineStageFlags2 dst_stages = + vk_expand_pipeline_stage_flags2(barrier->dstStageMask); + VkAccessFlags2 src_access = + vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask); + VkAccessFlags2 dst_access = + vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask); + + collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access, + out); + } + + for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) { + const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i]; + VkPipelineStageFlags2 src_stages = + vk_expand_pipeline_stage_flags2(barrier->srcStageMask); + VkPipelineStageFlags2 dst_stages = + vk_expand_pipeline_stage_flags2(barrier->dstStageMask); + VkAccessFlags2 src_access = + vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask); + VkAccessFlags2 dst_access = + vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask); + + collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access, + out); + } + + for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) { + const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i]; + VkPipelineStageFlags2 src_stages = + vk_expand_pipeline_stage_flags2(barrier->srcStageMask); + VkPipelineStageFlags2 dst_stages = + vk_expand_pipeline_stage_flags2(barrier->dstStageMask); + VkAccessFlags2 src_access = + vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask); + VkAccessFlags2 dst_access = + vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask); + + collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access, + out); + } + + /* The draw flush will add a vertex -> fragment dependency, so we can skip + * the one described in the deps. 
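+    * (panvk_per_arch(cmd_flush_draws) already orders the flushed draws
+    * between the vertex/tiler and fragment subqueues.)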
*/ + if (out->needs_draw_flush) + out->dst[PANVK_SUBQUEUE_FRAGMENT].wait_subqueue_mask &= + ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER); +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + struct panvk_cs_deps deps; + + panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps); + + if (deps.needs_draw_flush) + panvk_per_arch(cmd_flush_draws)(cmdbuf); + + uint32_t wait_subqueue_mask = 0; + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) + wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask; + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + if (!deps.src[i].wait_sb_mask) + continue; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i); + struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i]; + + cs_wait_slots(b, deps.src[i].wait_sb_mask, false); + + struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush; + if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE || + cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) { + struct cs_index flush_id = cs_scratch_reg32(b, 0); + + cs_move32_to(b, flush_id, 0); + cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others, + flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); + cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + } + + /* If no one waits on us, there's no point signaling the sync object. */ + if (wait_subqueue_mask & BITFIELD_BIT(i)) { + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index add_val = cs_scratch_reg64(b, 2); + + cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i); + cs_move64_to(b, add_val, 1); + cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, + cs_now()); + ++cs_state->relative_sync_point; + } + } + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + if (!deps.dst[i].wait_subqueue_mask) + continue; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i); + for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) { + if (!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(j))) + continue; + + struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j]; + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index wait_val = cs_scratch_reg64(b, 2); + + cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j); + + cs_add64(b, wait_val, cs_progress_seqno_reg(b, j), + cs_state->relative_sync_point); + cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val, + sync_addr); + } + } +} + +void +panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf, + enum panvk_subqueue_id subqueue) +{ + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue); + struct cs_index iter_sb = cs_scratch_reg32(b, 0); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 1); + + cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, iter_sb)); + cs_wait_slot(b, SB_ID(LS), false); + + cs_match(b, iter_sb, cmp_scratch) { +#define CASE(x) \ + cs_case(b, x) { \ + cs_wait_slot(b, SB_ITER(x), false); \ + cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS)); \ + } + + CASE(0) + CASE(1) + CASE(2) + CASE(3) + CASE(4) +#undef CASE + } +} + +static 
struct cs_buffer +alloc_cs_buffer(void *cookie) +{ + struct panvk_cmd_buffer *cmdbuf = cookie; + const unsigned capacity = 64 * 1024 / sizeof(uint64_t); + + struct panfrost_ptr ptr = + panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64); + + return (struct cs_buffer){ + .cpu = ptr.cpu, + .gpu = ptr.gpu, + .capacity = capacity, + }; +} + +static enum cs_reg_perm +cs_reg_perm(struct cs_builder *b, unsigned reg) +{ + struct panvk_cs_state *cs_state = + container_of(b, struct panvk_cs_state, builder); + struct panvk_cs_reg_upd_context *upd_ctx; + + for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx; + upd_ctx = upd_ctx->next) { + if (upd_ctx->reg_perm(b, reg) == CS_REG_RW) + return CS_REG_RW; + } + + return cs_state->reg_access.base_perm(b, reg); +} + +static void +init_cs_builders(struct panvk_cmd_buffer *cmdbuf) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_instance *instance = + to_panvk_instance(dev->vk.physical->instance); + const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = { + [PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm, + [PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm, + [PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm, + }; + + for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) { + /* Lazy allocation of the root CS. */ + struct cs_buffer root_cs = {0}; + + struct cs_builder_conf conf = { + .nr_registers = 96, + .nr_kernel_registers = 4, + .alloc_buffer = alloc_cs_buffer, + .cookie = cmdbuf, + }; + + if (instance->debug_flags & PANVK_DEBUG_CS) { + cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){ + .sb_slot = SB_ID(LS), + }; + + conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker; + + cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL; + cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i]; + conf.reg_perm = cs_reg_perm; + } + + cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs); + } +} + +static void +panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf, + VkCommandBufferResetFlags flags) +{ + struct panvk_cmd_buffer *cmdbuf = + container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk); + struct panvk_cmd_pool *pool = + container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk); + + vk_command_buffer_reset(&cmdbuf->vk); + + panvk_pool_reset(&cmdbuf->cs_pool); + panvk_pool_reset(&cmdbuf->desc_pool); + panvk_pool_reset(&cmdbuf->tls_pool); + list_splicetail(&cmdbuf->push_sets, &pool->push_sets); + list_inithead(&cmdbuf->push_sets); + + memset(&cmdbuf->state, 0, sizeof(cmdbuf->state)); + init_cs_builders(cmdbuf); +} + +static void +panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf) +{ + struct panvk_cmd_buffer *cmdbuf = + container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk); + struct panvk_cmd_pool *pool = + container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk); + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + + panvk_pool_cleanup(&cmdbuf->cs_pool); + panvk_pool_cleanup(&cmdbuf->desc_pool); + panvk_pool_cleanup(&cmdbuf->tls_pool); + list_splicetail(&cmdbuf->push_sets, &pool->push_sets); + vk_command_buffer_finish(&cmdbuf->vk); + vk_free(&dev->vk.alloc, cmdbuf); +} + +static VkResult +panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level, + struct vk_command_buffer **cmdbuf_out) +{ + struct panvk_device *device = + container_of(vk_pool->base.device, struct panvk_device, vk); + struct panvk_cmd_pool *pool = + container_of(vk_pool, struct panvk_cmd_pool, vk); + struct panvk_cmd_buffer *cmdbuf; + + cmdbuf = 
vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!cmdbuf) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = vk_command_buffer_init( + &pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level); + if (result != VK_SUCCESS) { + vk_free(&device->vk.alloc, cmdbuf); + return result; + } + + list_inithead(&cmdbuf->push_sets); + cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi; + cmdbuf->vk.dynamic_graphics_state.ms.sample_locations = + &cmdbuf->state.gfx.dynamic.sl; + + struct panvk_pool_properties cs_pool_props = { + .create_flags = 0, + .slab_size = 64 * 1024, + .label = "Command buffer CS pool", + .prealloc = false, + .owns_bos = true, + .needs_locking = false, + }; + panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props); + + struct panvk_pool_properties desc_pool_props = { + .create_flags = 0, + .slab_size = 64 * 1024, + .label = "Command buffer descriptor pool", + .prealloc = false, + .owns_bos = true, + .needs_locking = false, + }; + panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool, + &desc_pool_props); + + struct panvk_pool_properties tls_pool_props = { + .create_flags = + panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP), + .slab_size = 64 * 1024, + .label = "TLS pool", + .prealloc = false, + .owns_bos = true, + .needs_locking = false, + }; + panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool, + &tls_pool_props); + + init_cs_builders(cmdbuf); + *cmdbuf_out = &cmdbuf->vk; + return VK_SUCCESS; +} + +const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = { + .create = panvk_create_cmdbuf, + .reset = panvk_reset_cmdbuf, + .destroy = panvk_destroy_cmdbuf, +}; + +VKAPI_ATTR VkResult VKAPI_CALL +panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_instance *instance = + to_panvk_instance(dev->vk.physical->instance); + + vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo); + cmdbuf->flags = pBeginInfo->flags; + + /* The descriptor ringbuf trips out pandecode because we always point to the + * next tiler/framebuffer descriptor after CS execution, which means we're + * decoding an uninitialized or stale descriptor. + * FIXME: find a way to trace the simultaneous path that doesn't crash. One + * option would be to disable CS intepretation and dump the RUN_xxx context + * on the side at execution time. + */ + if (instance->debug_flags & PANVK_DEBUG_TRACE) + cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + + return VK_SUCCESS; +} diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c new file mode 100644 index 00000000000..6a93ae4dc5f --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -0,0 +1,285 @@ +/* + * Copyright © 2024 Collabora Ltd. + * + * Derived from tu_cmd_buffer.c which is: + * Copyright © 2016 Red Hat. 
+ * Copyright © 2016 Bas Nieuwenhuizen + * Copyright © 2015 Intel Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "genxml/gen_macros.h" + +#include "panvk_cmd_alloc.h" +#include "panvk_cmd_buffer.h" +#include "panvk_cmd_desc_state.h" +#include "panvk_cmd_meta.h" +#include "panvk_cmd_push_constant.h" +#include "panvk_device.h" +#include "panvk_entrypoints.h" +#include "panvk_meta.h" +#include "panvk_physical_device.h" + +#include "pan_desc.h" +#include "pan_encoder.h" +#include "pan_props.h" + +#include + +static VkResult +prepare_driver_set(struct panvk_cmd_buffer *cmdbuf) +{ + struct panvk_shader_desc_state *cs_desc_state = + &cmdbuf->state.compute.cs.desc; + + if (cs_desc_state->driver_set.dev_addr) + return VK_SUCCESS; + + const struct panvk_descriptor_state *desc_state = + &cmdbuf->state.compute.desc_state; + const struct panvk_shader *cs = cmdbuf->state.compute.shader; + uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1; + struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE); + struct panvk_opaque_desc *descs = driver_set.cpu; + + if (!driver_set.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + /* Dummy sampler always comes first. */ + pan_pack(&descs[0], SAMPLER, _) { + } + + panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs, + (struct mali_buffer_packed *)(&descs[1])); + + cs_desc_state->driver_set.dev_addr = driver_set.gpu; + cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE; + return VK_SUCCESS; +} + +static VkResult +prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf) +{ + cmdbuf->state.compute.push_uniforms = panvk_per_arch( + cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.compute.sysvals, + sizeof(cmdbuf->state.compute.sysvals)); + return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS + : VK_ERROR_OUT_OF_DEVICE_MEMORY; +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer, + uint32_t baseGroupX, uint32_t baseGroupY, + uint32_t baseGroupZ, uint32_t groupCountX, + uint32_t groupCountY, uint32_t groupCountZ) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + const struct panvk_shader *shader = cmdbuf->state.compute.shader; + VkResult result; + + /* If there's no compute shader, we can skip the dispatch. */ + if (!panvk_priv_mem_dev_addr(shader->spd)) + return; + + struct panvk_physical_device *phys_dev = + to_panvk_physical_device(cmdbuf->vk.base.device->physical); + struct panvk_descriptor_state *desc_state = + &cmdbuf->state.compute.desc_state; + struct panvk_shader_desc_state *cs_desc_state = + &cmdbuf->state.compute.cs.desc; + + struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE); + if (!tsd.gpu) + return; + + struct pan_tls_info tlsinfo = { + .tls.size = shader->info.tls_size, + .wls.size = shader->info.wls_size, + }; + unsigned core_id_range; + + panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range); + + if (tlsinfo.wls.size) { + /* TODO: Clamp WLS instance to some maximum WLS budget. 
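+       * Without a clamp, the allocation below scales with
+       * pan_wls_instances(&dim) * core_id_range, which can get large
+       * for big dispatches.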
*/ + struct pan_compute_dim dim = {groupCountX, groupCountY, groupCountZ}; + + tlsinfo.wls.instances = pan_wls_instances(&dim); + + unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) * + tlsinfo.wls.instances * core_id_range; + + tlsinfo.wls.ptr = + panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu; + if (!tlsinfo.wls.ptr) + return; + } + + cmdbuf->state.tls.info.tls.size = + MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size); + + if (!cmdbuf->state.tls.desc.gpu) { + cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE); + if (!cmdbuf->state.tls.desc.gpu) + return; + } + + GENX(pan_emit_tls)(&tlsinfo, tsd.cpu); + + result = panvk_per_arch(cmd_prepare_push_descs)( + cmdbuf, desc_state, shader->desc_info.used_set_mask); + if (result != VK_SUCCESS) + return; + + struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals; + sysvals->num_work_groups.x = groupCountX; + sysvals->num_work_groups.y = groupCountY; + sysvals->num_work_groups.z = groupCountZ; + sysvals->local_group_size.x = shader->local_size.x; + sysvals->local_group_size.y = shader->local_size.y; + sysvals->local_group_size.z = shader->local_size.z; + + result = prepare_driver_set(cmdbuf); + if (result != VK_SUCCESS) + return; + + cmdbuf->state.compute.push_uniforms = 0; + result = prepare_push_uniforms(cmdbuf); + if (result != VK_SUCCESS) + return; + + result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state, + shader, cs_desc_state); + if (result != VK_SUCCESS) + return; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + unsigned task_axis = MALI_TASK_AXIS_X; + unsigned task_increment = 0; + + /* Copy the global TLS pointer to the per-job TSD. */ + cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu); + cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); + cs_wait_slot(b, SB_ID(LS), false); + cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu); + cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); + cs_wait_slot(b, SB_ID(LS), false); + + cs_update_compute_ctx(b) { + cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table); + uint32_t push_size = 256 + sizeof(struct panvk_compute_sysvals); + uint64_t fau_count = DIV_ROUND_UP(push_size, 8); + mali_ptr fau_ptr = + cmdbuf->state.compute.push_uniforms | (fau_count << 56); + cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr); + cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd)); + cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu); + + /* Global attribute offset */ + cs_move32_to(b, cs_sr_reg32(b, 32), 0); + + struct mali_compute_size_workgroup_packed wg_size; + pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) { + cfg.workgroup_size_x = shader->local_size.x; + cfg.workgroup_size_y = shader->local_size.y; + cfg.workgroup_size_z = shader->local_size.z; + cfg.allow_merging_workgroups = false; + } + cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]); + cs_move32_to(b, cs_sr_reg32(b, 34), baseGroupX * shader->local_size.x); + cs_move32_to(b, cs_sr_reg32(b, 35), baseGroupY * shader->local_size.y); + cs_move32_to(b, cs_sr_reg32(b, 36), baseGroupZ * shader->local_size.z); + cs_move32_to(b, cs_sr_reg32(b, 37), groupCountX); + cs_move32_to(b, cs_sr_reg32(b, 38), groupCountY); + cs_move32_to(b, cs_sr_reg32(b, 39), groupCountZ); + + /* Pick the task_axis and task_increment to maximize thread utilization. 
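+       * For example (numbers picked for illustration only): with an
+       * 8x8x1 workgroup (64 threads) and a 256-thread core budget, the
+       * first loop iteration already exceeds the budget (64 * 8 >= 256),
+       * so the task axis stays on X with an increment of
+       * 256 / 64 = 4 workgroups per task.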
*/ + unsigned threads_per_wg = + shader->local_size.x * shader->local_size.y * shader->local_size.z; + unsigned max_thread_cnt = panfrost_compute_max_thread_count( + &phys_dev->kmod.props, shader->info.work_reg_count); + unsigned threads_per_task = threads_per_wg; + unsigned local_size[3] = { + shader->local_size.x, + shader->local_size.y, + shader->local_size.z, + }; + + for (unsigned i = 0; i < 3; i++) { + if (threads_per_task * local_size[i] >= max_thread_cnt) { + /* We reached out thread limit, stop at the current axis and + * calculate the increment so it doesn't exceed the per-core + * thread capacity. + */ + task_increment = max_thread_cnt / threads_per_task; + break; + } else if (task_axis == MALI_TASK_AXIS_Z) { + /* We reached the Z axis, and there's still room to stuff more + * threads. Pick the current axis grid size as our increment + * as there's no point using something bigger. + */ + task_increment = local_size[i]; + break; + } + + threads_per_task *= local_size[i]; + task_axis++; + } + } + + assert(task_axis <= MALI_TASK_AXIS_Z); + assert(task_increment > 0); + + panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + + cs_req_res(b, CS_COMPUTE_RES); + cs_run_compute(b, task_increment, task_axis, false, + cs_shader_res_sel(0, 0, 0, 0)); + cs_req_res(b, 0); + + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index iter_sb = cs_scratch_reg32(b, 2); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); + struct cs_index add_val = cs_scratch_reg64(b, 4); + + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); + cs_move64_to(b, add_val, 1); + + cs_match(b, iter_sb, cmp_scratch) { +#define CASE(x) \ + cs_case(b, x) { \ + cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \ + add_val, sync_addr, \ + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \ + cs_move32_to(b, iter_sb, next_iter_sb(x)); \ + } + + CASE(0) + CASE(1) + CASE(2) + CASE(3) + CASE(4) +#undef CASE + } + + cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, iter_sb)); + cs_wait_slot(b, SB_ID(LS), false); + + ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point; +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer, + VkBuffer _buffer, VkDeviceSize offset) +{ + panvk_stub(); +} diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c new file mode 100644 index 00000000000..759a1786ed6 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -0,0 +1,2121 @@ +/* + * Copyright © 2024 Collabora Ltd. + * + * Derived from tu_cmd_buffer.c which is: + * Copyright © 2016 Red Hat. 
+ * Copyright © 2016 Bas Nieuwenhuizen + * Copyright © 2015 Intel Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "genxml/gen_macros.h" + +#include "panvk_buffer.h" +#include "panvk_cmd_alloc.h" +#include "panvk_cmd_buffer.h" +#include "panvk_cmd_desc_state.h" +#include "panvk_cmd_meta.h" +#include "panvk_device.h" +#include "panvk_entrypoints.h" +#include "panvk_image.h" +#include "panvk_image_view.h" +#include "panvk_instance.h" +#include "panvk_priv_bo.h" +#include "panvk_shader.h" + +#include "pan_desc.h" +#include "pan_earlyzs.h" +#include "pan_encoder.h" +#include "pan_format.h" +#include "pan_jc.h" +#include "pan_props.h" +#include "pan_samples.h" +#include "pan_shader.h" + +#include "vk_format.h" +#include "vk_meta.h" +#include "vk_pipeline_layout.h" + +struct panvk_draw_info { + struct { + uint32_t size; + uint32_t offset; + int32_t vertex_offset; + } index; + + struct { + uint32_t base; + uint32_t count; + } vertex; + + struct { + uint32_t base; + uint32_t count; + } instance; +}; + +#define is_dirty(__cmdbuf, __name) \ + BITSET_TEST((__cmdbuf)->vk.dynamic_graphics_state.dirty, \ + MESA_VK_DYNAMIC_##__name) + +static void +emit_vs_attrib(const struct panvk_draw_info *draw, + const struct vk_vertex_attribute_state *attrib_info, + const struct vk_vertex_binding_state *buf_info, + const struct panvk_attrib_buf *buf, uint32_t vb_desc_offset, + struct mali_attribute_packed *desc) +{ + bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE; + enum pipe_format f = vk_format_to_pipe_format(attrib_info->format); + unsigned buf_idx = vb_desc_offset + attrib_info->binding; + unsigned divisor = draw->vertex.count * buf_info->divisor; + + pan_pack(desc, ATTRIBUTE, cfg) { + cfg.offset = attrib_info->offset; + cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw; + cfg.table = 0; + cfg.buffer_index = buf_idx; + cfg.stride = buf_info->stride; + if (!per_instance) { + /* Per-vertex */ + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX; + cfg.offset_enable = true; + } else if (util_is_power_of_two_or_zero(divisor)) { + /* Per-instance, POT divisor */ + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE; + cfg.divisor_r = __builtin_ctz(divisor); + } else { + /* Per-instance, NPOT divisor */ + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE; + cfg.divisor_d = panfrost_compute_magic_divisor(divisor, &cfg.divisor_r, + &cfg.divisor_e); + } + } +} + +static VkResult +prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf, + struct panvk_draw_info *draw) +{ + struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; + bool dirty = is_dirty(cmdbuf, VI) || is_dirty(cmdbuf, VI_BINDINGS_VALID) || + is_dirty(cmdbuf, VI_BINDING_STRIDES) || + cmdbuf->state.gfx.vb.dirty || + !vs_desc_state->driver_set.dev_addr; + + if (!dirty) + return VK_SUCCESS; + + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + const struct vk_vertex_input_state *vi = + cmdbuf->vk.dynamic_graphics_state.vi; + unsigned num_vs_attribs = util_last_bit(vi->attributes_valid); + uint32_t vb_count = 0; + + for (unsigned i = 0; i < num_vs_attribs; i++) { + if (vi->attributes_valid & BITFIELD_BIT(i)) + vb_count = MAX2(vi->attributes[i].binding + 1, vb_count); + } + + uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1; + uint32_t desc_count = vb_offset + vb_count; + const struct 
panvk_descriptor_state *desc_state = + &cmdbuf->state.gfx.desc_state; + struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE); + struct panvk_opaque_desc *descs = driver_set.cpu; + + if (!driver_set.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) { + if (vi->attributes_valid & BITFIELD_BIT(i)) { + unsigned binding = vi->attributes[i].binding; + + emit_vs_attrib(draw, &vi->attributes[i], &vi->bindings[binding], + &cmdbuf->state.gfx.vb.bufs[binding], vb_offset, + (struct mali_attribute_packed *)(&descs[i])); + } else { + memset(&descs[i], 0, sizeof(descs[0])); + } + } + + /* Dummy sampler always comes right after the vertex attribs. */ + pan_pack(&descs[MAX_VS_ATTRIBS], SAMPLER, _) { + } + + panvk_per_arch(cmd_fill_dyn_bufs)( + desc_state, vs, + (struct mali_buffer_packed *)(&descs[MAX_VS_ATTRIBS + 1])); + + for (uint32_t i = 0; i < vb_count; i++) { + const struct panvk_attrib_buf *vb = &cmdbuf->state.gfx.vb.bufs[i]; + + pan_pack(&descs[vb_offset + i], BUFFER, cfg) { + if (vi->bindings_valid & BITFIELD_BIT(i)) { + cfg.address = vb->address; + cfg.size = vb->size; + } else { + cfg.address = 0; + cfg.size = 0; + } + } + } + + vs_desc_state->driver_set.dev_addr = driver_set.gpu; + vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE; + return VK_SUCCESS; +} + +static VkResult +prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf) +{ + struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; + + if (fs_desc_state->driver_set.dev_addr) + return VK_SUCCESS; + + const struct panvk_descriptor_state *desc_state = + &cmdbuf->state.gfx.desc_state; + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1; + struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE); + struct panvk_opaque_desc *descs = driver_set.cpu; + + if (desc_count && !driver_set.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + /* Dummy sampler always comes first. */ + pan_pack(&descs[0], SAMPLER, _) { + } + + panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs, + (struct mali_buffer_packed *)(&descs[1])); + + fs_desc_state->driver_set.dev_addr = driver_set.gpu; + fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE; + return VK_SUCCESS; +} + +static void +prepare_sysvals(struct panvk_cmd_buffer *cmdbuf) +{ + struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals; + struct vk_color_blend_state *cb = &cmdbuf->vk.dynamic_graphics_state.cb; + + if (is_dirty(cmdbuf, CB_BLEND_CONSTANTS)) { + for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++) + sysvals->blend.constants[i] = + CLAMP(cb->blend_constants[i], 0.0f, 1.0f); + cmdbuf->state.gfx.push_uniforms = 0; + } + + if (is_dirty(cmdbuf, VP_VIEWPORTS)) { + VkViewport *viewport = &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0]; + + /* Upload the viewport scale. Defined as (px/2, py/2, pz) at the start of + * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the + * end of the section, the spec defines: + * + * px = width + * py = height + * pz = maxDepth - minDepth + */ + sysvals->viewport.scale.x = 0.5f * viewport->width; + sysvals->viewport.scale.y = 0.5f * viewport->height; + sysvals->viewport.scale.z = (viewport->maxDepth - viewport->minDepth); + + /* Upload the viewport offset. 
Defined as (ox, oy, oz) at the start of + * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the + * end of the section, the spec defines: + * + * ox = x + width/2 + * oy = y + height/2 + * oz = minDepth + */ + sysvals->viewport.offset.x = (0.5f * viewport->width) + viewport->x; + sysvals->viewport.offset.y = (0.5f * viewport->height) + viewport->y; + sysvals->viewport.offset.z = viewport->minDepth; + cmdbuf->state.gfx.push_uniforms = 0; + } +} + +static bool +has_depth_att(struct panvk_cmd_buffer *cmdbuf) +{ + return (cmdbuf->state.gfx.render.bound_attachments & + MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0; +} + +static bool +has_stencil_att(struct panvk_cmd_buffer *cmdbuf) +{ + return (cmdbuf->state.gfx.render.bound_attachments & + MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0; +} + +static bool +writes_depth(struct panvk_cmd_buffer *cmdbuf) +{ + const struct vk_depth_stencil_state *ds = + &cmdbuf->vk.dynamic_graphics_state.ds; + + return has_depth_att(cmdbuf) && ds->depth.test_enable && + ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER; +} + +static bool +writes_stencil(struct panvk_cmd_buffer *cmdbuf) +{ + const struct vk_depth_stencil_state *ds = + &cmdbuf->vk.dynamic_graphics_state.ds; + + return has_stencil_att(cmdbuf) && ds->stencil.test_enable && + ((ds->stencil.front.write_mask && + (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP || + ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP || + ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) || + (ds->stencil.back.write_mask && + (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP || + ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP || + ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP))); +} + +static bool +ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf) +{ + const struct vk_depth_stencil_state *ds = + &cmdbuf->vk.dynamic_graphics_state.ds; + + if (!has_depth_att(cmdbuf)) + return true; + + if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS) + return false; + + if (ds->stencil.test_enable && + (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS || + ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) + return false; + + return true; +} + +static inline enum mali_func +translate_compare_func(VkCompareOp comp) +{ + STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER); + STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS); + STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL); + STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL); + STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER); + STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL); + STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL == + (VkCompareOp)MALI_FUNC_GEQUAL); + STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS); + + return (enum mali_func)comp; +} + +static enum mali_stencil_op +translate_stencil_op(VkStencilOp in) +{ + switch (in) { + case VK_STENCIL_OP_KEEP: + return MALI_STENCIL_OP_KEEP; + case VK_STENCIL_OP_ZERO: + return MALI_STENCIL_OP_ZERO; + case VK_STENCIL_OP_REPLACE: + return MALI_STENCIL_OP_REPLACE; + case VK_STENCIL_OP_INCREMENT_AND_CLAMP: + return MALI_STENCIL_OP_INCR_SAT; + case VK_STENCIL_OP_DECREMENT_AND_CLAMP: + return MALI_STENCIL_OP_DECR_SAT; + case VK_STENCIL_OP_INCREMENT_AND_WRAP: + return MALI_STENCIL_OP_INCR_WRAP; + case VK_STENCIL_OP_DECREMENT_AND_WRAP: + return MALI_STENCIL_OP_DECR_WRAP; + case VK_STENCIL_OP_INVERT: + return MALI_STENCIL_OP_INVERT; + default: + 
unreachable("Invalid stencil op"); + } +} + +static bool +fs_required(struct panvk_cmd_buffer *cmdbuf) +{ + const struct pan_shader_info *fs_info = + cmdbuf->state.gfx.fs.shader ? &cmdbuf->state.gfx.fs.shader->info : NULL; + const struct vk_dynamic_graphics_state *dyns = + &cmdbuf->vk.dynamic_graphics_state; + const struct vk_color_blend_state *cb = &dyns->cb; + + if (!fs_info) + return false; + + /* If we generally have side effects */ + if (fs_info->fs.sidefx) + return true; + + /* If colour is written we need to execute */ + for (unsigned i = 0; i < cb->attachment_count; ++i) { + if ((cb->color_write_enables & BITFIELD_BIT(i)) && + cb->attachments[i].write_mask) + return true; + } + + /* If alpha-to-coverage is enabled, we need to run the fragment shader even + * if we don't have a color attachment, so depth/stencil updates can be + * discarded if alpha, and thus coverage, is 0. */ + if (dyns->ms.alpha_to_coverage_enable) + return true; + + /* If depth is written and not implied we need to execute. + * TODO: Predicate on Z/S writes being enabled */ + return (fs_info->fs.writes_depth || fs_info->fs.writes_stencil); +} + +static enum mali_draw_mode +translate_prim_topology(VkPrimitiveTopology in) +{ + /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not + * part of the VkPrimitiveTopology enum. + */ + if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA) + return MALI_DRAW_MODE_TRIANGLES; + + switch (in) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return MALI_DRAW_MODE_POINTS; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + return MALI_DRAW_MODE_LINES; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return MALI_DRAW_MODE_LINE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + return MALI_DRAW_MODE_TRIANGLES; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return MALI_DRAW_MODE_TRIANGLE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return MALI_DRAW_MODE_TRIANGLE_FAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + default: + unreachable("Invalid primitive type"); + } +} + +static void +force_fb_preload(struct panvk_cmd_buffer *cmdbuf) +{ + for (unsigned i = 0; i < cmdbuf->state.gfx.render.fb.info.rt_count; i++) { + if (cmdbuf->state.gfx.render.fb.info.rts[i].view) { + cmdbuf->state.gfx.render.fb.info.rts[i].clear = false; + cmdbuf->state.gfx.render.fb.info.rts[i].preload = true; + } + } + + if (cmdbuf->state.gfx.render.fb.info.zs.view.zs) { + cmdbuf->state.gfx.render.fb.info.zs.clear.z = false; + cmdbuf->state.gfx.render.fb.info.zs.preload.z = true; + } + + if (cmdbuf->state.gfx.render.fb.info.zs.view.s || + (cmdbuf->state.gfx.render.fb.info.zs.view.zs && + util_format_is_depth_and_stencil( + cmdbuf->state.gfx.render.fb.info.zs.view.zs->format))) { + cmdbuf->state.gfx.render.fb.info.zs.clear.s = false; + cmdbuf->state.gfx.render.fb.info.zs.preload.s = true; + } +} + +static VkResult +update_tls(struct panvk_cmd_buffer *cmdbuf) +{ + struct panvk_tls_state *state = &cmdbuf->state.tls; + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + + if (!state->desc.gpu) { + state->desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE); + if (!state->desc.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + 
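/* A single local-storage descriptor serves the whole command buffer: its
       * address is written once to sr 24 below, and the required TLS size is
       * grown to the maximum across the shaders seen so far (see the MAX3()
       * at the end of this function). */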
cs_update_vt_ctx(b) + cs_move64_to(b, cs_sr_reg64(b, 24), state->desc.gpu); + } + + state->info.tls.size = + MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, state->info.tls.size); + return VK_SUCCESS; +} + +static enum mali_index_type +index_size_to_index_type(uint32_t size) +{ + switch (size) { + case 0: + return MALI_INDEX_TYPE_NONE; + case 1: + return MALI_INDEX_TYPE_UINT8; + case 2: + return MALI_INDEX_TYPE_UINT16; + case 4: + return MALI_INDEX_TYPE_UINT32; + default: + assert(!"Invalid index size"); + return MALI_INDEX_TYPE_NONE; + } +} + +static VkResult +prepare_blend(struct panvk_cmd_buffer *cmdbuf) +{ + bool dirty = + is_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) || is_dirty(cmdbuf, CB_LOGIC_OP) || + is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) || + is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) || + is_dirty(cmdbuf, CB_BLEND_ENABLES) || + is_dirty(cmdbuf, CB_BLEND_EQUATIONS) || + is_dirty(cmdbuf, CB_WRITE_MASKS) || is_dirty(cmdbuf, CB_BLEND_CONSTANTS); + + if (!dirty) + return VK_SUCCESS; + + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + const struct vk_dynamic_graphics_state *dyns = + &cmdbuf->vk.dynamic_graphics_state; + const struct vk_color_blend_state *cb = &dyns->cb; + unsigned bd_count = MAX2(cb->attachment_count, 1); + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + const struct pan_shader_info *fs_info = fs ? &fs->info : NULL; + mali_ptr fs_code = panvk_shader_get_dev_addr(fs); + struct panfrost_ptr ptr = + panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND); + struct mali_blend_packed *bds = ptr.cpu; + + if (bd_count && !ptr.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + panvk_per_arch(blend_emit_descs)( + dev, cb, cmdbuf->state.gfx.render.color_attachments.fmts, + cmdbuf->state.gfx.render.color_attachments.samples, fs_info, fs_code, bds, + &cmdbuf->state.gfx.cb.info); + + cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count); + return VK_SUCCESS; +} + +static void +prepare_vp(struct panvk_cmd_buffer *cmdbuf) +{ + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + const VkViewport *viewport = + &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0]; + const VkRect2D *scissor = &cmdbuf->vk.dynamic_graphics_state.vp.scissors[0]; + + if (is_dirty(cmdbuf, VP_VIEWPORTS) || is_dirty(cmdbuf, VP_SCISSORS)) { + uint64_t scissor_box; + pan_pack(&scissor_box, SCISSOR, cfg) { + + /* The spec says "width must be greater than 0.0" */ + assert(viewport->x >= 0); + int minx = (int)viewport->x; + int maxx = (int)(viewport->x + viewport->width); + + /* Viewport height can be negative */ + int miny = + MIN2((int)viewport->y, (int)(viewport->y + viewport->height)); + int maxy = + MAX2((int)viewport->y, (int)(viewport->y + viewport->height)); + + assert(scissor->offset.x >= 0 && scissor->offset.y >= 0); + miny = MAX2(scissor->offset.x, minx); + miny = MAX2(scissor->offset.y, miny); + maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx); + maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy); + + /* Make sure we don't end up with a max < min when width/height is 0 */ + maxx = maxx > minx ? maxx - 1 : maxx; + maxy = maxy > miny ? 
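/* the packed scissor bounds are inclusive, so a non-empty range stores
          * max - 1 while a zero-area range keeps max == min; e.g. (hypothetical
          * numbers) offset.x = 10, extent.width = 20 gives an inclusive box of
          * 10..29 */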
maxy - 1 : maxy; + + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } + + cs_move64_to(b, cs_sr_reg64(b, 42), scissor_box); + } + + if (is_dirty(cmdbuf, VP_VIEWPORTS)) { + cs_move32_to(b, cs_sr_reg32(b, 44), + fui(MIN2(viewport->minDepth, viewport->maxDepth))); + cs_move32_to(b, cs_sr_reg32(b, 45), + fui(MAX2(viewport->minDepth, viewport->maxDepth))); + } +} + +static uint32_t +calc_fbd_size(struct panvk_cmd_buffer *cmdbuf) +{ + const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info; + bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s; + uint32_t fbd_size = pan_size(FRAMEBUFFER); + + if (has_zs_ext) + fbd_size += pan_size(ZS_CRC_EXTENSION); + + fbd_size += pan_size(RENDER_TARGET) * MAX2(fb->rt_count, 1); + return fbd_size; +} + +static uint32_t +calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf) +{ + return (calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count) + + pan_size(TILER_CONTEXT); +} + +static void +cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size) +{ + /* Make sure we don't allocate more than the ringbuf size. */ + assert(size <= RENDER_DESC_RINGBUF_SIZE); + + /* Make sure the allocation is 64-byte aligned. */ + assert(ALIGN_POT(size, 64) == size); + + struct cs_index ringbuf_sync = cs_scratch_reg64(b, 0); + struct cs_index sz_reg = cs_scratch_reg32(b, 2); + + cs_load64_to( + b, ringbuf_sync, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj)); + cs_wait_slot(b, SB_ID(LS), false); + + /* Wait for the other end to release memory. */ + cs_move32_to(b, sz_reg, size - 1); + cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, sz_reg, ringbuf_sync); + + /* Decrement the syncobj to reflect the fact we're reserving memory. */ + cs_move32_to(b, sz_reg, -size); + cs_sync32_add(b, false, MALI_CS_SYNC_SCOPE_CSG, sz_reg, ringbuf_sync, + cs_now()); +} + +static void +cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size) +{ + struct cs_index scratch_reg = cs_scratch_reg32(b, 0); + struct cs_index ptr_lo = cs_scratch_reg32(b, 2); + struct cs_index pos = cs_scratch_reg32(b, 4); + + cs_load_to( + b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr)); + cs_wait_slot(b, SB_ID(LS), false); + + /* Update the relative position and absolute address. */ + cs_add32(b, ptr_lo, ptr_lo, size); + cs_add32(b, pos, pos, size); + cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE); + + /* Wrap-around. 
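 * The cs_while() below with an unconditional break acts as a plain
    * "if (pos >= RENDER_DESC_RINGBUF_SIZE)". Because the ring buffer is mapped
    * twice back-to-back (see init_render_desc_ringbuf()), rebasing only the
    * low 32 bits of the pointer is enough: descriptors that straddle the
    * boundary stay contiguous through the second mapping.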
*/ + cs_while(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) { + cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE); + cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE); + cs_loop_break(b, MALI_CS_CONDITION_ALWAYS, cs_undef()); + } + + cs_store( + b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr)); + cs_wait_slot(b, SB_ID(LS), false); +} + +static VkResult +get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) +{ + if (cmdbuf->state.gfx.render.tiler) + return VK_SUCCESS; + + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + struct panvk_physical_device *phys_dev = + to_panvk_physical_device(cmdbuf->vk.base.device->physical); + struct panfrost_tiler_features tiler_features = + panfrost_query_tiler_features(&phys_dev->kmod.props); + bool simul_use = + cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + struct panfrost_ptr tiler_desc = {0}; + struct mali_tiler_context_packed tiler_tmpl; + + if (!simul_use) { + tiler_desc = panvk_cmd_alloc_desc(cmdbuf, TILER_CONTEXT); + if (!tiler_desc.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } else { + /* If the tiler descriptor is allocated from the ring buffer, we set a + * dumb non-zero address to allow the is-tiler-acquired test to pass. */ + tiler_desc.cpu = &tiler_tmpl; + tiler_desc.gpu = 0xdeadbeefdeadbeefull; + } + + pan_pack(tiler_desc.cpu, TILER_CONTEXT, cfg) { + unsigned max_levels = tiler_features.max_levels; + assert(max_levels >= 2); + + /* TODO: Select hierarchy mask more effectively */ + cfg.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28; + + /* For large framebuffers, disable the smallest bin size to + * avoid pathological tiler memory usage. + */ + cfg.fb_width = cmdbuf->state.gfx.render.fb.info.width; + cfg.fb_height = cmdbuf->state.gfx.render.fb.info.height; + if (MAX2(cfg.fb_width, cfg.fb_height) >= 4096) + cfg.hierarchy_mask &= ~1; + + cfg.sample_pattern = + pan_sample_pattern(cmdbuf->state.gfx.render.fb.info.nr_samples); + + /* TODO: revisit for VK_EXT_provoking_vertex. */ + cfg.first_provoking_vertex = true; + + cfg.layer_count = cmdbuf->state.gfx.render.layer_count; + cfg.layer_offset = 0; + } + + cmdbuf->state.gfx.render.tiler = tiler_desc.gpu; + + struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40); + + if (simul_use) { + uint32_t descs_sz = calc_render_descs_size(cmdbuf); + + cs_render_desc_ringbuf_reserve(b, descs_sz); + + /* Reserve ringbuf mem. */ + cs_update_vt_ctx(b) { + cs_load64_to(b, tiler_ctx_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, + render.desc_ringbuf.ptr)); + } + + cs_render_desc_ringbuf_move_ptr(b, descs_sz); + + /* Lay out words 2:5, so they can be stored along the other updates. */ + cs_move64_to(b, cs_scratch_reg64(b, 2), + tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32); + cs_move64_to(b, cs_scratch_reg64(b, 4), + tiler_tmpl.opaque[4] | (uint64_t)tiler_tmpl.opaque[5] << 32); + } else { + cs_update_vt_ctx(b) { + cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu); + } + } + + /* Reset the polygon list. */ + cs_move64_to(b, cs_scratch_reg64(b, 0), 0); + + /* Load the tiler_heap and geom_buf from the context. */ + cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(4), + offsetof(struct panvk_cs_subqueue_context, render.tiler_heap)); + + /* Reset the completed chain. 
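 * (scratch regs r10:13). Together with the zeroed polygon list in r0:1, the
    * template words staged in r2:5 on the ring-buffer path and the
    * tiler_heap/geom_buf pair loaded into r6:9, this lets the head of the
    * tiler context be written with the one or two cs_store()s that follow.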
*/ + cs_move64_to(b, cs_scratch_reg64(b, 10), 0); + cs_move64_to(b, cs_scratch_reg64(b, 12), 0); + + cs_wait_slot(b, SB_ID(LS), false); + + /* Update the first half of the tiler desc. */ + if (simul_use) { + cs_store(b, cs_scratch_reg_tuple(b, 0, 14), tiler_ctx_addr, + BITFIELD_MASK(14), 0); + } else { + cs_store(b, cs_scratch_reg_tuple(b, 0, 2), tiler_ctx_addr, + BITFIELD_MASK(2), 0); + cs_store(b, cs_scratch_reg_tuple(b, 6, 8), tiler_ctx_addr, + BITFIELD_MASK(8), 24); + } + + cs_wait_slot(b, SB_ID(LS), false); + + /* r10:13 are already zero, fill r8:9 and r14:15 with zeros so we can reset + * the private state in one store. */ + cs_move64_to(b, cs_scratch_reg64(b, 8), 0); + cs_move64_to(b, cs_scratch_reg64(b, 14), 0); + + /* Update the second half of the tiler descriptor. */ + cs_store(b, cs_scratch_reg_tuple(b, 8, 8), tiler_ctx_addr, BITFIELD_MASK(8), + 96); + cs_wait_slot(b, SB_ID(LS), false); + + /* Then we change the scoreboard slot used for iterators. */ + panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + + cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now()); + return VK_SUCCESS; +} + +static VkResult +get_fb_descs(struct panvk_cmd_buffer *cmdbuf) +{ + if (cmdbuf->state.gfx.render.fbds.gpu || + !cmdbuf->state.gfx.render.layer_count) + return VK_SUCCESS; + + uint32_t fbds_sz = + calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count; + + memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0, + sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds)); + + cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); + if (!cmdbuf->state.gfx.render.fbds.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + return VK_SUCCESS; +} + +static VkResult +prepare_vs(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) +{ + struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state; + struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + const struct vk_input_assembly_state *ia = + &cmdbuf->vk.dynamic_graphics_state.ia; + mali_ptr pos_spd = ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST + ? 
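/* a point-list topology selects the point variant of the IDVS position
            * shader, which is presumably the variant that also exports point size
            * (cf. writes_point_size in panvk_cmd_draw()); every other topology
            * uses the triangle variant */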
panvk_priv_mem_dev_addr(vs->spds.pos_points) + : panvk_priv_mem_dev_addr(vs->spds.pos_triangles); + mali_ptr var_spd = panvk_priv_mem_dev_addr(vs->spds.var); + bool upd_res_table = false; + + if (!vs_desc_state->res_table) { + VkResult result = prepare_vs_driver_set(cmdbuf, draw); + if (result != VK_SUCCESS) + return result; + + result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state, + vs, vs_desc_state); + if (result != VK_SUCCESS) + return result; + + upd_res_table = true; + } + + cs_update_vt_ctx(b) { + if (upd_res_table) + cs_move64_to(b, cs_sr_reg64(b, 0), vs_desc_state->res_table); + + if (pos_spd != cmdbuf->state.gfx.vs.spds.pos) + cs_move64_to(b, cs_sr_reg64(b, 16), pos_spd); + + if (var_spd != cmdbuf->state.gfx.vs.spds.var) + cs_move64_to(b, cs_sr_reg64(b, 18), var_spd); + } + + return VK_SUCCESS; +} + +static VkResult +prepare_fs(struct panvk_cmd_buffer *cmdbuf) +{ + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; + struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state; + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + mali_ptr frag_spd = panvk_priv_mem_dev_addr(fs->spd); + bool upd_res_table = false; + + if (!fs_desc_state->res_table) { + VkResult result = prepare_fs_driver_set(cmdbuf); + if (result != VK_SUCCESS) + return result; + + result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state, + fs, fs_desc_state); + if (result != VK_SUCCESS) + return result; + + upd_res_table = true; + } + + cs_update_vt_ctx(b) { + if (upd_res_table) + cs_move64_to(b, cs_sr_reg64(b, 4), fs_desc_state->res_table); + + if (cmdbuf->state.gfx.fs.spd != frag_spd) + cs_move64_to(b, cs_sr_reg64(b, 20), frag_spd); + } + + return VK_SUCCESS; +} + +static VkResult +prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf) +{ + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + + if (!cmdbuf->state.gfx.push_uniforms) { + cmdbuf->state.gfx.push_uniforms = panvk_per_arch( + cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.gfx.sysvals, + sizeof(cmdbuf->state.gfx.sysvals)); + if (!cmdbuf->state.gfx.push_uniforms) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + uint32_t push_size = 256 + sizeof(struct panvk_graphics_sysvals); + uint64_t fau_count = DIV_ROUND_UP(push_size, 8); + mali_ptr fau_ptr = cmdbuf->state.gfx.push_uniforms | (fau_count << 56); + + cs_update_vt_ctx(b) { + cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr); + cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr); + } + } + + return VK_SUCCESS; +} + +static VkResult +prepare_ds(struct panvk_cmd_buffer *cmdbuf) +{ + bool dirty = is_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) || + is_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) || + is_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) || + is_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) || + is_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) || + is_dirty(cmdbuf, DS_STENCIL_OP) || + is_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) || + is_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) || + is_dirty(cmdbuf, DS_STENCIL_REFERENCE) || + is_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) || + is_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) || + is_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) || + /* fs_required() uses ms.alpha_to_coverage_enable + * and vk_color_blend_state + */ + is_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) || + is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) || + is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) || + is_dirty(cmdbuf, CB_WRITE_MASKS); + + if (!dirty) + return VK_SUCCESS; + + struct 
cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + const struct vk_dynamic_graphics_state *dyns = + &cmdbuf->vk.dynamic_graphics_state; + const struct vk_depth_stencil_state *ds = &dyns->ds; + const struct vk_rasterization_state *rs = &dyns->rs; + bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable; + bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable; + bool needs_fs = fs_required(cmdbuf); + + struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL); + if (!zsd.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) { + cfg.stencil_test_enable = test_s; + if (test_s) { + cfg.front_compare_function = + translate_compare_func(ds->stencil.front.op.compare); + cfg.front_stencil_fail = + translate_stencil_op(ds->stencil.front.op.fail); + cfg.front_depth_fail = + translate_stencil_op(ds->stencil.front.op.depth_fail); + cfg.front_depth_pass = translate_stencil_op(ds->stencil.front.op.pass); + cfg.back_compare_function = + translate_compare_func(ds->stencil.back.op.compare); + cfg.back_stencil_fail = translate_stencil_op(ds->stencil.back.op.fail); + cfg.back_depth_fail = + translate_stencil_op(ds->stencil.back.op.depth_fail); + cfg.back_depth_pass = translate_stencil_op(ds->stencil.back.op.pass); + } + + cfg.stencil_from_shader = needs_fs ? fs->info.fs.writes_stencil : 0; + cfg.front_write_mask = ds->stencil.front.write_mask; + cfg.back_write_mask = ds->stencil.back.write_mask; + cfg.front_value_mask = ds->stencil.front.compare_mask; + cfg.back_value_mask = ds->stencil.back.compare_mask; + cfg.front_reference_value = ds->stencil.front.reference; + cfg.back_reference_value = ds->stencil.back.reference; + + if (rs->depth_clamp_enable) + cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS; + + cfg.depth_source = pan_depth_source(&fs->info); + cfg.depth_write_enable = ds->depth.write_enable; + cfg.depth_bias_enable = rs->depth_bias.enable; + cfg.depth_function = test_z ? 
translate_compare_func(ds->depth.compare_op) + : MALI_FUNC_ALWAYS; + cfg.depth_units = rs->depth_bias.constant * 2.0f; + cfg.depth_factor = rs->depth_bias.slope; + cfg.depth_bias_clamp = rs->depth_bias.clamp; + } + + cs_update_vt_ctx(b) + cs_move64_to(b, cs_sr_reg64(b, 52), zsd.gpu); + + return VK_SUCCESS; +} + +static void +prepare_dcd(struct panvk_cmd_buffer *cmdbuf) +{ + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + bool fs_is_dirty = + cmdbuf->state.gfx.fs.spd != panvk_priv_mem_dev_addr(fs->spd); + bool dcd0_dirty = is_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) || + is_dirty(cmdbuf, RS_CULL_MODE) || + is_dirty(cmdbuf, RS_FRONT_FACE) || + is_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) || + is_dirty(cmdbuf, MS_SAMPLE_MASK) || + is_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) || + is_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) || + /* writes_depth() uses vk_depth_stencil_state */ + is_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) || + is_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) || + is_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) || + /* writes_stencil() uses vk_depth_stencil_state */ + is_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) || + is_dirty(cmdbuf, DS_STENCIL_OP) || + is_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) || + /* fs_required() uses vk_color_blend_state */ + is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) || + is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) || + is_dirty(cmdbuf, CB_WRITE_MASKS) || fs_is_dirty || + cmdbuf->state.gfx.render.dirty; + bool dcd1_dirty = is_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) || + is_dirty(cmdbuf, MS_SAMPLE_MASK) || + /* fs_required() uses ms.alpha_to_coverage_enable + * and vk_color_blend_state + */ + is_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) || + is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) || + is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) || + is_dirty(cmdbuf, CB_WRITE_MASKS) || fs_is_dirty || + cmdbuf->state.gfx.render.dirty; + + bool needs_fs = fs_required(cmdbuf); + + const struct vk_dynamic_graphics_state *dyns = + &cmdbuf->vk.dynamic_graphics_state; + const struct vk_rasterization_state *rs = + &cmdbuf->vk.dynamic_graphics_state.rs; + bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable; + bool writes_z = writes_depth(cmdbuf); + bool writes_s = writes_stencil(cmdbuf); + + if (dcd0_dirty) { + struct mali_dcd_flags_0_packed dcd0; + pan_pack(&dcd0, DCD_FLAGS_0, cfg) { + if (needs_fs) { + uint8_t rt_written = fs->info.outputs_written >> FRAG_RESULT_DATA0; + uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments & + MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS; + + cfg.allow_forward_pixel_to_kill = + fs->info.fs.can_fpk && !(rt_mask & ~rt_written) && + !alpha_to_coverage && !cmdbuf->state.gfx.cb.info.any_dest_read; + + bool writes_zs = writes_z || writes_s; + bool zs_always_passes = ds_test_always_passes(cmdbuf); + bool oq = false; /* TODO: Occlusion queries */ + + struct pan_earlyzs_state earlyzs = + pan_earlyzs_get(pan_earlyzs_analyze(&fs->info), writes_zs || oq, + alpha_to_coverage, zs_always_passes); + + cfg.pixel_kill_operation = earlyzs.kill; + cfg.zs_update_operation = earlyzs.update; + } else { + cfg.allow_forward_pixel_to_kill = true; + cfg.allow_forward_pixel_to_be_killed = true; + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.overdraw_alpha0 = true; + cfg.overdraw_alpha1 = true; + } + + cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE; + cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) 
!= 0; + cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0; + + cfg.multisample_enable = dyns->ms.rasterization_samples > 1; + } + + cs_update_vt_ctx(b) + cs_move32_to(b, cs_sr_reg32(b, 57), dcd0.opaque[0]); + } + + if (dcd1_dirty) { + struct mali_dcd_flags_1_packed dcd1; + pan_pack(&dcd1, DCD_FLAGS_1, cfg) { + cfg.sample_mask = dyns->ms.rasterization_samples > 1 + ? dyns->ms.sample_mask + : UINT16_MAX; + + if (needs_fs) { + cfg.render_target_mask = + (fs->info.outputs_written >> FRAG_RESULT_DATA0) & + cmdbuf->state.gfx.render.bound_attachments; + } + } + + cs_update_vt_ctx(b) + cs_move32_to(b, cs_sr_reg32(b, 58), dcd1.opaque[0]); + } +} + +static void +clear_dirty(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) +{ + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + + if (vs) { + const struct vk_input_assembly_state *ia = + &cmdbuf->vk.dynamic_graphics_state.ia; + + cmdbuf->state.gfx.vs.spds.pos = + ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST + ? panvk_priv_mem_dev_addr(vs->spds.pos_points) + : panvk_priv_mem_dev_addr(vs->spds.pos_triangles); + cmdbuf->state.gfx.vs.spds.var = panvk_priv_mem_dev_addr(vs->spds.var); + } + + if (fs) + cmdbuf->state.gfx.fs.spd = panvk_priv_mem_dev_addr(fs->spd); + + if (draw->index.size) + cmdbuf->state.gfx.ib.dirty = false; + + cmdbuf->state.gfx.render.dirty = false; + vk_dynamic_graphics_state_clear_dirty(&cmdbuf->vk.dynamic_graphics_state); +} + +static void +panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) +{ + const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader; + const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; + struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state; + const struct vk_rasterization_state *rs = + &cmdbuf->vk.dynamic_graphics_state.rs; + const struct vk_input_assembly_state *ia = + &cmdbuf->vk.dynamic_graphics_state.ia; + bool idvs = vs->info.vs.idvs; + VkResult result; + + /* If there's no vertex shader, we can skip the draw. */ + if (!panvk_priv_mem_dev_addr(vs->spds.pos_points)) + return; + + /* FIXME: support non-IDVS. */ + assert(idvs); + + if (!cmdbuf->state.gfx.linked) { + result = panvk_per_arch(link_shaders)(&cmdbuf->desc_pool, vs, fs, + &cmdbuf->state.gfx.link); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmdbuf->vk, result); + return; + } + cmdbuf->state.gfx.linked = true; + } + + result = update_tls(cmdbuf); + if (result != VK_SUCCESS) + return; + + bool needs_tiling = !rs->rasterizer_discard_enable; + + if (needs_tiling) { + result = get_tiler_desc(cmdbuf); + if (result != VK_SUCCESS) + return; + + get_fb_descs(cmdbuf); + } + + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + + uint32_t used_set_mask = + vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0); + + result = + panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state, used_set_mask); + if (result != VK_SUCCESS) + return; + + prepare_sysvals(cmdbuf); + + result = prepare_push_uniforms(cmdbuf); + if (result != VK_SUCCESS) + return; + + result = prepare_vs(cmdbuf, draw); + if (result != VK_SUCCESS) + return; + + /* No need to setup the FS desc tables if the FS is not executed. 
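 * That covers both rasterizer discard (needs_tiling == false) and the case
    * where fs_required() returned false because the fragment shader has no
    * observable effect: no side effects, no enabled color writes, no
    * alpha-to-coverage and no shader depth/stencil writes.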
*/ + if (needs_tiling && fs_required(cmdbuf)) { + result = prepare_fs(cmdbuf); + if (result != VK_SUCCESS) + return; + } + + struct mali_primitive_flags_packed tiler_idvs_flags; + bool writes_point_size = + vs->info.vs.writes_point_size && + ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + + pan_pack(&tiler_idvs_flags, PRIMITIVE_FLAGS, cfg) { + cfg.draw_mode = translate_prim_topology(ia->primitive_topology); + cfg.index_type = index_size_to_index_type(draw->index.size); + + if (writes_point_size) { + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; + cfg.position_fifo_format = MALI_FIFO_FORMAT_EXTENDED; + } else { + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_NONE; + cfg.position_fifo_format = MALI_FIFO_FORMAT_BASIC; + } + + if (vs->info.outputs_written & VARYING_BIT_LAYER) { + cfg.layer_index_enable = true; + cfg.position_fifo_format = MALI_FIFO_FORMAT_EXTENDED; + } + + cfg.secondary_shader = + vs->info.vs.secondary_enable && fs_required(cmdbuf); + cfg.primitive_restart = ia->primitive_restart_enable; + } + + uint32_t varying_size = 0; + + if (vs && fs) { + unsigned vs_vars = vs->info.varyings.output_count; + unsigned fs_vars = fs->info.varyings.input_count; + unsigned var_slots = MAX2(vs_vars, fs_vars); + + /* Assumes 16 byte slots. We could do better. */ + varying_size = var_slots * 16; + } + + cs_update_vt_ctx(b) { + cs_move32_to(b, cs_sr_reg32(b, 32), draw->vertex.base); + cs_move32_to(b, cs_sr_reg32(b, 33), draw->vertex.count); + cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count); + cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset); + cs_move32_to(b, cs_sr_reg32(b, 36), draw->index.vertex_offset); + + /* Instance ID is assumed to be zero-based for now. See if we can + * extend nir_lower_system_values() and the lower options to make + * instance-ID non-zero based, or if it's fine to always return + * zero for the instance base. */ + cs_move32_to(b, cs_sr_reg32(b, 37), 0); + + /* We don't use the resource dep system yet. */ + cs_move32_to(b, cs_sr_reg32(b, 38), 0); + + cs_move32_to( + b, cs_sr_reg32(b, 39), + (draw->index.offset + draw->vertex.count) * draw->index.size); + + if (draw->index.size && cmdbuf->state.gfx.ib.dirty) { + cs_move64_to(b, cs_sr_reg64(b, 54), + panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer, + cmdbuf->state.gfx.ib.offset)); + } + + /* TODO: Revisit to avoid passing everything through the override flags + * (likely needed for state preservation in secondary command buffers). 
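 * For now sr 56 is simply cleared below and the actual primitive flags are
    * passed through the RUN_IDVS immediate instead.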
*/ + cs_move32_to(b, cs_sr_reg32(b, 56), 0); + + cs_move32_to(b, cs_sr_reg32(b, 48), varying_size); + + result = prepare_blend(cmdbuf); + if (result != VK_SUCCESS) + return; + + result = prepare_ds(cmdbuf); + if (result != VK_SUCCESS) + return; + + prepare_dcd(cmdbuf); + prepare_vp(cmdbuf); + } + + clear_dirty(cmdbuf, draw); + + cs_req_res(b, CS_IDVS_RES); + cs_run_idvs(b, tiler_idvs_flags.opaque[0], false, true, + cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), + cs_undef()); + cs_req_res(b, 0); +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount, + uint32_t instanceCount, uint32_t firstVertex, + uint32_t firstInstance) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + + if (instanceCount == 0 || vertexCount == 0) + return; + + struct panvk_draw_info draw = { + .vertex.base = firstVertex, + .vertex.count = vertexCount, + .instance.base = firstInstance, + .instance.count = instanceCount, + }; + + panvk_cmd_draw(cmdbuf, &draw); +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer, + uint32_t indexCount, uint32_t instanceCount, + uint32_t firstIndex, int32_t vertexOffset, + uint32_t firstInstance) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + + if (instanceCount == 0 || indexCount == 0) + return; + + struct panvk_draw_info draw = { + .index.size = cmdbuf->state.gfx.ib.index_size, + .index.offset = firstIndex, + .index.vertex_offset = vertexOffset, + .vertex.count = indexCount, + .instance.count = instanceCount, + .instance.base = firstInstance, + }; + + panvk_cmd_draw(cmdbuf, &draw); +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, + uint32_t stride) +{ + panvk_stub(); +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer, + VkBuffer _buffer, VkDeviceSize offset, + uint32_t drawCount, uint32_t stride) +{ + panvk_stub(); +} + +static void +panvk_cmd_begin_rendering_init_state(struct panvk_cmd_buffer *cmdbuf, + const VkRenderingInfo *pRenderingInfo) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_physical_device *phys_dev = + to_panvk_physical_device(dev->vk.physical); + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + uint32_t att_width = 0, att_height = 0; + + cmdbuf->state.gfx.render.flags = pRenderingInfo->flags; + + /* Resuming from a suspended pass, the state should be unchanged. 
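 * In other words, whatever render state is already in cmdbuf->state.gfx.render
    * is kept as-is and the (re)initialization below is skipped.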
*/ + if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT) + return; + + cmdbuf->state.gfx.render.dirty = true; + memset(cmdbuf->state.gfx.render.fb.crc_valid, 0, + sizeof(cmdbuf->state.gfx.render.fb.crc_valid)); + memset(&cmdbuf->state.gfx.render.color_attachments, 0, + sizeof(cmdbuf->state.gfx.render.color_attachments)); + memset(&cmdbuf->state.gfx.render.z_attachment, 0, + sizeof(cmdbuf->state.gfx.render.z_attachment)); + memset(&cmdbuf->state.gfx.render.s_attachment, 0, + sizeof(cmdbuf->state.gfx.render.s_attachment)); + cmdbuf->state.gfx.render.bound_attachments = 0; + + cmdbuf->state.gfx.render.layer_count = pRenderingInfo->layerCount; + *fbinfo = (struct pan_fb_info){ + .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model), + .nr_samples = 1, + .rt_count = pRenderingInfo->colorAttachmentCount, + }; + + assert(pRenderingInfo->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts)); + + for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) { + const VkRenderingAttachmentInfo *att = + &pRenderingInfo->pColorAttachments[i]; + VK_FROM_HANDLE(panvk_image_view, iview, att->imageView); + + if (!iview) + continue; + + struct panvk_image *img = + container_of(iview->vk.image, struct panvk_image, vk); + const VkExtent3D iview_size = + vk_image_mip_level_extent(&img->vk, iview->vk.base_mip_level); + + cmdbuf->state.gfx.render.bound_attachments |= + MESA_VK_RP_ATTACHMENT_COLOR_BIT(i); + cmdbuf->state.gfx.render.color_attachments.fmts[i] = iview->vk.format; + cmdbuf->state.gfx.render.color_attachments.samples[i] = img->vk.samples; + att_width = MAX2(iview_size.width, att_width); + att_height = MAX2(iview_size.height, att_height); + + fbinfo->rts[i].view = &iview->pview; + fbinfo->rts[i].crc_valid = &cmdbuf->state.gfx.render.fb.crc_valid[i]; + fbinfo->nr_samples = + MAX2(fbinfo->nr_samples, pan_image_view_get_nr_samples(&iview->pview)); + + if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + enum pipe_format fmt = vk_format_to_pipe_format(iview->vk.format); + union pipe_color_union *col = + (union pipe_color_union *)&att->clearValue.color; + + fbinfo->rts[i].clear = true; + pan_pack_color(phys_dev->formats.blendable, fbinfo->rts[i].clear_value, + col, fmt, false); + } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { + fbinfo->rts[i].preload = true; + } + + if (att->resolveMode != VK_RESOLVE_MODE_NONE) { + struct panvk_resolve_attachment *resolve_info = + &cmdbuf->state.gfx.render.color_attachments.resolve[i]; + VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView); + + resolve_info->mode = att->resolveMode; + resolve_info->src_iview = iview; + resolve_info->dst_iview = resolve_iview; + } + } + + if (pRenderingInfo->pDepthAttachment && + pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) { + const VkRenderingAttachmentInfo *att = pRenderingInfo->pDepthAttachment; + VK_FROM_HANDLE(panvk_image_view, iview, att->imageView); + struct panvk_image *img = + container_of(iview->vk.image, struct panvk_image, vk); + const VkExtent3D iview_size = + vk_image_mip_level_extent(&img->vk, iview->vk.base_mip_level); + + if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + cmdbuf->state.gfx.render.bound_attachments |= + MESA_VK_RP_ATTACHMENT_DEPTH_BIT; + att_width = MAX2(iview_size.width, att_width); + att_height = MAX2(iview_size.height, att_height); + + fbinfo->zs.view.zs = &iview->pview; + fbinfo->nr_samples = MAX2( + fbinfo->nr_samples, pan_image_view_get_nr_samples(&iview->pview)); + + if (vk_format_has_stencil(img->vk.format)) + 
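/* only the depth aspect is bound here, so if the image also carries
          * stencil, preload it to keep the untouched stencil data intact */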
fbinfo->zs.preload.s = true; + + if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + fbinfo->zs.clear.z = true; + fbinfo->zs.clear_value.depth = att->clearValue.depthStencil.depth; + } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { + fbinfo->zs.preload.z = true; + } + + if (att->resolveMode != VK_RESOLVE_MODE_NONE) { + struct panvk_resolve_attachment *resolve_info = + &cmdbuf->state.gfx.render.z_attachment.resolve; + VK_FROM_HANDLE(panvk_image_view, resolve_iview, + att->resolveImageView); + + resolve_info->mode = att->resolveMode; + resolve_info->src_iview = iview; + resolve_info->dst_iview = resolve_iview; + } + } + } + + if (pRenderingInfo->pStencilAttachment && + pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE) { + const VkRenderingAttachmentInfo *att = pRenderingInfo->pStencilAttachment; + VK_FROM_HANDLE(panvk_image_view, iview, att->imageView); + struct panvk_image *img = + container_of(iview->vk.image, struct panvk_image, vk); + const VkExtent3D iview_size = + vk_image_mip_level_extent(&img->vk, iview->vk.base_mip_level); + + if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + cmdbuf->state.gfx.render.bound_attachments |= + MESA_VK_RP_ATTACHMENT_STENCIL_BIT; + att_width = MAX2(iview_size.width, att_width); + att_height = MAX2(iview_size.height, att_height); + + if (drm_is_afbc(img->pimage.layout.modifier)) { + assert(fbinfo->zs.view.zs == &iview->pview || !fbinfo->zs.view.zs); + fbinfo->zs.view.zs = &iview->pview; + } else { + fbinfo->zs.view.s = + &iview->pview != fbinfo->zs.view.zs ? &iview->pview : NULL; + } + + fbinfo->zs.view.s = + &iview->pview != fbinfo->zs.view.zs ? &iview->pview : NULL; + fbinfo->nr_samples = MAX2( + fbinfo->nr_samples, pan_image_view_get_nr_samples(&iview->pview)); + + if (vk_format_has_depth(img->vk.format)) { + assert(fbinfo->zs.view.zs == NULL || + &iview->pview == fbinfo->zs.view.zs); + fbinfo->zs.view.zs = &iview->pview; + + fbinfo->zs.preload.s = false; + fbinfo->zs.clear.s = false; + if (!fbinfo->zs.clear.z) + fbinfo->zs.preload.z = true; + } + + if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + fbinfo->zs.clear.s = true; + fbinfo->zs.clear_value.stencil = + att->clearValue.depthStencil.stencil; + } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) { + fbinfo->zs.preload.s = true; + } + + if (att->resolveMode != VK_RESOLVE_MODE_NONE) { + struct panvk_resolve_attachment *resolve_info = + &cmdbuf->state.gfx.render.s_attachment.resolve; + VK_FROM_HANDLE(panvk_image_view, resolve_iview, + att->resolveImageView); + + resolve_info->mode = att->resolveMode; + resolve_info->src_iview = iview; + resolve_info->dst_iview = resolve_iview; + } + } + } + + if (fbinfo->zs.view.zs) { + const struct util_format_description *fdesc = + util_format_description(fbinfo->zs.view.zs->format); + bool needs_depth = fbinfo->zs.clear.z | fbinfo->zs.preload.z | + util_format_has_depth(fdesc); + bool needs_stencil = fbinfo->zs.clear.s | fbinfo->zs.preload.s | + util_format_has_stencil(fdesc); + enum pipe_format new_fmt = + util_format_get_blocksize(fbinfo->zs.view.zs->format) == 4 + ? 
PIPE_FORMAT_Z24_UNORM_S8_UINT + : PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; + + if (needs_depth && needs_stencil && + fbinfo->zs.view.zs->format != new_fmt) { + cmdbuf->state.gfx.render.zs_pview = *fbinfo->zs.view.zs; + cmdbuf->state.gfx.render.zs_pview.format = new_fmt; + fbinfo->zs.view.zs = &cmdbuf->state.gfx.render.zs_pview; + } + } + + fbinfo->extent.minx = pRenderingInfo->renderArea.offset.x; + fbinfo->extent.maxx = pRenderingInfo->renderArea.offset.x + + pRenderingInfo->renderArea.extent.width - 1; + fbinfo->extent.miny = pRenderingInfo->renderArea.offset.y; + fbinfo->extent.maxy = pRenderingInfo->renderArea.offset.y + + pRenderingInfo->renderArea.extent.height - 1; + + if (cmdbuf->state.gfx.render.bound_attachments) { + fbinfo->width = att_width; + fbinfo->height = att_height; + } else { + fbinfo->width = fbinfo->extent.maxx + 1; + fbinfo->height = fbinfo->extent.maxy + 1; + } + + assert(fbinfo->width && fbinfo->height); +} + +static void +preload_render_area_border(struct panvk_cmd_buffer *cmdbuf, + const VkRenderingInfo *render_info) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + bool render_area_is_32x32_aligned = + ((fbinfo->extent.minx | fbinfo->extent.miny) % 32) == 0 && + (fbinfo->extent.maxx + 1 == fbinfo->width || + (fbinfo->extent.maxx % 32) == 31) && + (fbinfo->extent.maxy + 1 == fbinfo->height || + (fbinfo->extent.maxy % 32) == 31); + + /* If the render area is aligned on a 32x32 section, we're good. */ + if (render_area_is_32x32_aligned) + return; + + /* We force preloading for all active attachments to preserve content falling + * outside the render area, but we need to compensate with attachment clears + * for attachments that were initially cleared. + */ + uint32_t bound_atts = cmdbuf->state.gfx.render.bound_attachments; + VkClearAttachment clear_atts[MAX_RTS + 2]; + uint32_t clear_att_count = 0; + + for (uint32_t i = 0; i < render_info->colorAttachmentCount; i++) { + if (bound_atts & MESA_VK_RP_ATTACHMENT_COLOR_BIT(i)) { + if (fbinfo->rts[i].clear) { + const VkRenderingAttachmentInfo *att = + &render_info->pColorAttachments[i]; + + clear_atts[clear_att_count++] = (VkClearAttachment){ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = i, + .clearValue = att->clearValue, + }; + } + + fbinfo->rts[i].preload = true; + fbinfo->rts[i].clear = false; + } + } + + if (bound_atts & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) { + if (fbinfo->zs.clear.z) { + const VkRenderingAttachmentInfo *att = render_info->pDepthAttachment; + + clear_atts[clear_att_count++] = (VkClearAttachment){ + .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, + .clearValue = att->clearValue, + }; + } + + fbinfo->zs.preload.z = true; + fbinfo->zs.clear.z = false; + } + + if (bound_atts & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) { + if (fbinfo->zs.clear.s) { + const VkRenderingAttachmentInfo *att = render_info->pStencilAttachment; + + clear_atts[clear_att_count++] = (VkClearAttachment){ + .aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT, + .clearValue = att->clearValue, + }; + } + + fbinfo->zs.preload.s = true; + fbinfo->zs.clear.s = false; + } + + if (clear_att_count) { + VkClearRect clear_rect = { + .rect = render_info->renderArea, + .baseArrayLayer = 0, + .layerCount = render_info->layerCount, + }; + + panvk_per_arch(CmdClearAttachments)(panvk_cmd_buffer_to_handle(cmdbuf), + clear_att_count, clear_atts, 1, + &clear_rect); + } +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer, + const VkRenderingInfo *pRenderingInfo) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, 
cmdbuf, commandBuffer); + struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx; + + panvk_cmd_begin_rendering_init_state(cmdbuf, pRenderingInfo); + + bool resuming = state->render.flags & VK_RENDERING_RESUMING_BIT; + + /* If we're not resuming, the FBD should be NULL. */ + assert(!state->render.fbds.gpu || resuming); + + if (!resuming) + preload_render_area_border(cmdbuf, pRenderingInfo); +} + +static void +resolve_attachments(struct panvk_cmd_buffer *cmdbuf) +{ + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + bool needs_resolve = false; + + unsigned bound_atts = cmdbuf->state.gfx.render.bound_attachments; + unsigned color_att_count = + util_last_bit(bound_atts & MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS); + VkRenderingAttachmentInfo color_atts[MAX_RTS]; + for (uint32_t i = 0; i < color_att_count; i++) { + const struct panvk_resolve_attachment *resolve_info = + &cmdbuf->state.gfx.render.color_attachments.resolve[i]; + + color_atts[i] = (VkRenderingAttachmentInfo){ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = panvk_image_view_to_handle(resolve_info->src_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = resolve_info->mode, + .resolveImageView = + panvk_image_view_to_handle(resolve_info->dst_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + if (resolve_info->mode != VK_RESOLVE_MODE_NONE) + needs_resolve = true; + } + + const struct panvk_resolve_attachment *resolve_info = + &cmdbuf->state.gfx.render.z_attachment.resolve; + VkRenderingAttachmentInfo z_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = panvk_image_view_to_handle(resolve_info->src_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = resolve_info->mode, + .resolveImageView = panvk_image_view_to_handle(resolve_info->dst_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + if (resolve_info->mode != VK_RESOLVE_MODE_NONE) + needs_resolve = true; + + resolve_info = &cmdbuf->state.gfx.render.s_attachment.resolve; + + VkRenderingAttachmentInfo s_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = panvk_image_view_to_handle(resolve_info->src_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = resolve_info->mode, + .resolveImageView = panvk_image_view_to_handle(resolve_info->dst_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + if (resolve_info->mode != VK_RESOLVE_MODE_NONE) + needs_resolve = true; + + if (!needs_resolve) + return; + + const VkRenderingInfo render_info = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = + { + .offset.x = fbinfo->extent.minx, + .offset.y = fbinfo->extent.miny, + .extent.width = fbinfo->extent.maxx - fbinfo->extent.minx + 1, + .extent.height = fbinfo->extent.maxy - fbinfo->extent.miny + 1, + }, + .layerCount = cmdbuf->state.gfx.render.layer_count, + .viewMask = 0, + .colorAttachmentCount = color_att_count, + .pColorAttachments = color_atts, + .pDepthAttachment = &z_att, + .pStencilAttachment = &s_att, + }; + + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_cmd_meta_graphics_save_ctx save = {0}; + + panvk_per_arch(cmd_meta_gfx_start)(cmdbuf, &save); + vk_meta_resolve_rendering(&cmdbuf->vk, &dev->meta, &render_info); + panvk_per_arch(cmd_meta_gfx_end)(cmdbuf, &save); +} + +static uint8_t +prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd) +{ + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + bool simul_use = + !(cmdbuf->flags 
& VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT); + + if (cmdbuf->state.tls.desc.gpu) { + ASSERTED unsigned num_preload_jobs = + GENX(pan_preload_fb)(&dev->blitter.cache, &cmdbuf->desc_pool.base, + &cmdbuf->state.gfx.render.fb.info, layer, + cmdbuf->state.tls.desc.gpu, NULL); + + /* Valhall GPUs use pre frame DCDs to preload the FB content. We + * thus expect num_preload_jobs to be zero. + */ + assert(!num_preload_jobs); + } + + struct pan_tiler_context tiler_ctx = { + .valhall.desc = !simul_use ? cmdbuf->state.gfx.render.tiler : 0, + }; + + return GENX(pan_emit_fbd)(&cmdbuf->state.gfx.render.fb.info, layer, NULL, + &tiler_ctx, fbd); +} + +static void +flush_tiling(struct panvk_cmd_buffer *cmdbuf) +{ + if (!cmdbuf->state.gfx.render.fbds.gpu) + return; + + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + + struct cs_index render_ctx = cs_scratch_reg64(b, 2); + + if (cmdbuf->state.gfx.render.tiler) { + /* Flush the tiling operations and signal the internal sync object. */ + cs_req_res(b, CS_TILER_RES); + cs_finish_tiling(b, false); + cs_req_res(b, 0); + + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index iter_sb = cs_scratch_reg32(b, 2); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); + struct cs_index add_val = cs_scratch_reg64(b, 4); + + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + + /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to + * skip an ADD operation on the syncobjs pointer. */ + STATIC_ASSERT(PANVK_SUBQUEUE_VERTEX_TILER == 0); + + cs_move64_to(b, add_val, 1); + + cs_match(b, iter_sb, cmp_scratch) { +#define CASE(x) \ + cs_case(b, x) { \ + cs_heap_operation(b, \ + MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, \ + cs_defer(SB_WAIT_ITER(x), \ + SB_ID(DEFERRED_SYNC))); \ + cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \ + add_val, sync_addr, \ + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \ + cs_move32_to(b, iter_sb, next_iter_sb(x)); \ + } + + CASE(0) + CASE(1) + CASE(2) + CASE(3) + CASE(4) +#undef CASE + } + + cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, iter_sb)); + cs_wait_slot(b, SB_ID(LS), false); + + /* Update the vertex seqno. 
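 * relative_sync_point counts how many times this command buffer signals the
    * vertex/tiler syncobj; it gets rebased on the queue's absolute progress
    * seqno at wait time (see wait_finish_tiling(), which adds it to
    * cs_progress_seqno_reg() before issuing the cs_sync64_wait()).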
*/ + ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point; + } else { + cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, render)); + cs_wait_slot(b, SB_ID(LS), false); + } +} + +static void +wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf) +{ + if (!cmdbuf->state.gfx.render.tiler) + return; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0); + struct cs_index vt_sync_point = cs_scratch_reg64(b, 2); + uint64_t rel_vt_sync_point = + cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point; + + cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + + cs_add64(b, vt_sync_point, + cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER), + rel_vt_sync_point); + cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, vt_sync_point, + vt_sync_addr); +} + +static void +issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) +{ + if (!cmdbuf->state.gfx.render.fbds.gpu) + return; + + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + + /* Wait for the tiling to be done before submitting the fragment job. */ + wait_finish_tiling(cmdbuf); + + /* Reserve a scoreboard for the fragment job. */ + panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + + /* Now initialize the fragment bits. */ + cs_update_frag_ctx(b) { + cs_move32_to(b, cs_sr_reg32(b, 42), + (fbinfo->extent.miny << 16) | fbinfo->extent.minx); + cs_move32_to(b, cs_sr_reg32(b, 43), + (fbinfo->extent.maxy << 16) | fbinfo->extent.maxx); + } + + fbinfo->sample_positions = + dev->sample_positions->addr.dev + + panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples)); + + bool simul_use = + cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + + /* The only bit we patch in FBDs is the tiler pointer. If tiler is not + * involved (clear job) or if the update can happen in place (not + * simultaneous use of the command buffer), we can avoid the + * copy. */ + bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler; + uint32_t fbd_sz = calc_fbd_size(cmdbuf); + struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds; + uint8_t fbd_flags = 0; + + /* We prepare all FB descriptors upfront. */ + for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) { + uint32_t new_fbd_flags = + prepare_fb_desc(cmdbuf, i, fbds.cpu + (fbd_sz * i)); + + /* Make sure all FBDs have the same flags. 
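 * The flags returned by pan_emit_fbd() are encoded in the low bits of the
    * FBD pointer handed to the fragment job (see the cs_add64(..., fbd_ptr,
    * fbd_flags) further down), and the per-layer loop reuses a single value,
    * so every layer must have produced identical flags.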
*/ + assert(i == 0 || new_fbd_flags == fbd_flags); + fbd_flags = new_fbd_flags; + } + + struct cs_index layer_count = cs_sr_reg32(b, 47); + struct cs_index fbd_ptr = cs_sr_reg64(b, 48); + struct cs_index tiler_ptr = cs_sr_reg64(b, 50); + struct cs_index src_fbd_ptr = cs_undef(); + + if (copy_fbds) { + src_fbd_ptr = cs_sr_reg64(b, 52); + + cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count); + cs_load64_to( + b, tiler_ptr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr)); + cs_wait_slot(b, SB_ID(LS), false); + + cs_add64(b, fbd_ptr, tiler_ptr, pan_size(TILER_CONTEXT)); + cs_move64_to(b, src_fbd_ptr, fbds.gpu); + } else if (cmdbuf->state.gfx.render.tiler) { + cs_move64_to(b, fbd_ptr, fbds.gpu); + cs_move64_to(b, tiler_ptr, cmdbuf->state.gfx.render.tiler); + } + + cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count); + cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) { + if (copy_fbds) { + for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) { + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr, + BITFIELD_MASK(16), fbd_off); + cs_wait_slot(b, SB_ID(LS), false); + cs_store(b, cs_scratch_reg_tuple(b, 0, 16), fbd_ptr, + BITFIELD_MASK(16), fbd_off); + cs_wait_slot(b, SB_ID(LS), false); + } + + cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz); + } + + if (cmdbuf->state.gfx.render.tiler) { + cs_store64(b, tiler_ptr, fbd_ptr, 56); + cs_wait_slot(b, SB_ID(LS), false); + } + + cs_update_frag_ctx(b) + cs_add64(b, cs_sr_reg64(b, 40), fbd_ptr, fbd_flags); + + cs_req_res(b, CS_FRAG_RES); + cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); + cs_req_res(b, 0); + cs_add64(b, fbd_ptr, fbd_ptr, fbd_sz); + cs_add32(b, layer_count, layer_count, -1); + } + + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index iter_sb = cs_scratch_reg32(b, 2); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); + struct cs_index add_val = cs_scratch_reg64(b, 4); + struct cs_index release_sz = cs_scratch_reg32(b, 5); + struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6); + struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4); + struct cs_index completed_top = cs_scratch_reg64(b, 10); + struct cs_index completed_bottom = cs_scratch_reg64(b, 12); + + cs_move64_to(b, add_val, 1); + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + + if (copy_fbds) { + cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf)); + cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, + render.desc_ringbuf.syncobj)); + } + + if (cmdbuf->state.gfx.render.tiler) + cs_load_to(b, completed, tiler_ptr, BITFIELD_MASK(4), 40); + + cs_wait_slot(b, SB_ID(LS), false); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); + + cs_match(b, iter_sb, cmp_scratch) { +#define CASE(x) \ + cs_case(b, x) { \ + if (cmdbuf->state.gfx.render.tiler) { \ + cs_finish_fragment(b, true, completed_top, completed_bottom, \ + cs_defer(SB_WAIT_ITER(x), \ + SB_ID(DEFERRED_SYNC))); \ + } \ + if (copy_fbds) { \ + cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \ + release_sz, ringbuf_sync_addr, \ + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \ + } \ + cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \ + add_val, sync_addr, \ + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \ + cs_move32_to(b, iter_sb, next_iter_sb(x)); \ + } + + CASE(0) + CASE(1) + 
CASE(2) + CASE(3) + CASE(4) +#undef CASE + } + + cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, iter_sb)); + cs_wait_slot(b, SB_ID(LS), false); + + /* Update the ring buffer position. */ + if (copy_fbds) + cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf)); + + /* Update the frag seqno. */ + ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point; + + memset(&cmdbuf->state.gfx.render.fbds, 0, + sizeof(cmdbuf->state.gfx.render.fbds)); + cmdbuf->state.gfx.render.tiler = 0; +} + +void +panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf) +{ + /* If there was no draw queued, we don't need to force a preload. */ + if (!cmdbuf->state.gfx.render.fbds.gpu) + return; + + flush_tiling(cmdbuf); + issue_fragment_jobs(cmdbuf); + force_fb_preload(cmdbuf); + memset(&cmdbuf->state.gfx.render.fbds, 0, + sizeof(cmdbuf->state.gfx.render.fbds)); + cmdbuf->state.gfx.render.tiler = 0; +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + + if (!(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) { + struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; + bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s; + for (unsigned i = 0; i < fbinfo->rt_count; i++) + clear |= fbinfo->rts[i].clear; + + if (clear) + get_fb_descs(cmdbuf); + + flush_tiling(cmdbuf); + issue_fragment_jobs(cmdbuf); + resolve_attachments(cmdbuf); + } +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdBindVertexBuffers)(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + + assert(firstBinding + bindingCount <= MAX_VBS); + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(panvk_buffer, buffer, pBuffers[i]); + + cmdbuf->state.gfx.vb.bufs[firstBinding + i].address = + panvk_buffer_gpu_ptr(buffer, pOffsets[i]); + cmdbuf->state.gfx.vb.bufs[firstBinding + i].size = + panvk_buffer_range(buffer, pOffsets[i], VK_WHOLE_SIZE); + } + + cmdbuf->state.gfx.vb.count = + MAX2(cmdbuf->state.gfx.vb.count, firstBinding + bindingCount); + memset(&cmdbuf->state.gfx.vs.desc.driver_set, 0, + sizeof(cmdbuf->state.gfx.vs.desc.driver_set)); +} + +VKAPI_ATTR void VKAPI_CALL +panvk_per_arch(CmdBindIndexBuffer)(VkCommandBuffer commandBuffer, + VkBuffer buffer, VkDeviceSize offset, + VkIndexType indexType) +{ + VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); + VK_FROM_HANDLE(panvk_buffer, buf, buffer); + + cmdbuf->state.gfx.ib.buffer = buf; + cmdbuf->state.gfx.ib.offset = offset; + cmdbuf->state.gfx.ib.index_size = vk_index_type_to_bytes(indexType); + cmdbuf->state.gfx.ib.dirty = true; +} diff --git a/src/panfrost/vulkan/csf/panvk_vX_queue.c b/src/panfrost/vulkan/csf/panvk_vX_queue.c new file mode 100644 index 00000000000..7e6f587b5e0 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_vX_queue.c @@ -0,0 +1,745 @@ +/* + * Copyright © 2024 Collabora Ltd. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include "drm-uapi/panthor_drm.h" + +#include "genxml/cs_builder.h" +#include "genxml/decode.h" + +#include "panvk_cmd_buffer.h" +#include "panvk_macros.h" +#include "panvk_queue.h" + +#include "vk_drm_syncobj.h" +#include "vk_log.h" + +static void +finish_render_desc_ringbuf(struct panvk_queue *queue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf; + + panvk_pool_free_mem(&dev->mempools.rw, ringbuf->syncobj); + + if (dev->debug.decode_ctx && ringbuf->addr.dev) { + pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev, + RENDER_DESC_RINGBUF_SIZE); + pandecode_inject_free(dev->debug.decode_ctx, + ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE, + RENDER_DESC_RINGBUF_SIZE); + } + + if (ringbuf->addr.dev) { + struct pan_kmod_vm_op op = { + .type = PAN_KMOD_VM_OP_TYPE_UNMAP, + .va = { + .start = ringbuf->addr.dev, + .size = RENDER_DESC_RINGBUF_SIZE * 2, + }, + }; + + ASSERTED int ret = + pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1); + assert(!ret); + } + + if (ringbuf->addr.host) { + ASSERTED int ret = + os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE); + assert(!ret); + } + + pan_kmod_bo_put(ringbuf->bo); +} + +static VkResult +init_render_desc_ringbuf(struct panvk_queue *queue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(queue->vk.base.device->physical); + uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP); + struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf; + const size_t size = RENDER_DESC_RINGBUF_SIZE; + VkResult result; + int ret; + + ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags); + if (!ringbuf->bo) + return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to create a descriptor ring buffer context"); + + if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) { + ringbuf->addr.host = pan_kmod_bo_mmap( + ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); + if (ringbuf->addr.host == MAP_FAILED) { + result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to CPU map ringbuf BO"); + goto err_finish_ringbuf; + } + } + + /* We choose the alignment to guarantee that we won't ever cross a 4G + * boundary when accessing the mapping. This way we can encode the wraparound + * using 32-bit operations. 
*/ + uint64_t dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2); + + struct pan_kmod_vm_op vm_ops[] = { + { + .type = PAN_KMOD_VM_OP_TYPE_MAP, + .va = { + .start = dev_addr, + .size = RENDER_DESC_RINGBUF_SIZE, + }, + .map = { + .bo = ringbuf->bo, + .bo_offset = 0, + }, + }, + { + .type = PAN_KMOD_VM_OP_TYPE_MAP, + .va = { + .start = dev_addr + RENDER_DESC_RINGBUF_SIZE, + .size = RENDER_DESC_RINGBUF_SIZE, + }, + .map = { + .bo = ringbuf->bo, + .bo_offset = 0, + }, + }, + }; + + ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops, + ARRAY_SIZE(vm_ops)); + if (ret) { + result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to GPU map ringbuf BO"); + goto err_finish_ringbuf; + } + + ringbuf->addr.dev = dev_addr; + + if (dev->debug.decode_ctx) { + pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev, + ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL); + pandecode_inject_mmap(dev->debug.decode_ctx, + ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE, + ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL); + } + + struct panvk_pool_alloc_info alloc_info = { + .size = sizeof(struct panvk_cs_sync32), + .alignment = 64, + }; + + ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + + struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj); + + if (!syncobj) { + result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to create the render desc ringbuf context"); + goto err_finish_ringbuf; + } + + *syncobj = (struct panvk_cs_sync32){ + .seqno = RENDER_DESC_RINGBUF_SIZE, + }; + + return VK_SUCCESS; + +err_finish_ringbuf: + finish_render_desc_ringbuf(queue); + return result; +} + +static VkResult +init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + struct panvk_subqueue *subq = &queue->subqueues[subqueue]; + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(queue->vk.base.device->physical); + struct panvk_instance *instance = + to_panvk_instance(dev->vk.physical->instance); + unsigned debug = instance->debug_flags; + struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs); + + struct panvk_pool_alloc_info alloc_info = { + .size = sizeof(struct panvk_cs_subqueue_context), + .alignment = 64, + }; + + subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + if (!panvk_priv_mem_host_addr(subq->context)) + return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to create a queue context"); + + struct panvk_cs_subqueue_context *cs_ctx = + panvk_priv_mem_host_addr(subq->context); + + *cs_ctx = (struct panvk_cs_subqueue_context){ + .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs), + .debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs), + .iter_sb = 0, + }; + + /* We use the geometry buffer for our temporary CS buffer. */ + struct cs_buffer root_cs = { + .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096, + .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096, + .capacity = 64 * 1024 / sizeof(uint64_t), + }; + const struct cs_builder_conf conf = { + .nr_registers = 96, + .nr_kernel_registers = 4, + }; + struct cs_builder b; + + assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0); + + cs_builder_init(&b, &conf, root_cs); + /* Pass the context through r62.
*/ + cs_move64_to(&b, cs_subqueue_ctx_reg(&b), + panvk_priv_mem_dev_addr(subq->context)); + + /* Initialize scoreboard slots used for asynchronous operations. */ + cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS)); + + /* We do greater than test on sync objects, and given the reference seqno + * registers are all zero at init time, we need to initialize all syncobjs + * with a seqno of one. */ + syncobjs[subqueue].seqno = 1; + + if (subqueue != PANVK_SUBQUEUE_COMPUTE) { + cs_ctx->render.tiler_heap = + panvk_priv_mem_dev_addr(queue->tiler_heap.desc); + /* Our geometry buffer comes 4k after the tiler heap, and we encode the + * size in the lower 12 bits so the address can be copied directly + * to the tiler descriptors. */ + cs_ctx->render.geom_buf = + (cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12); + + /* Initialize the ringbuf */ + cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){ + .syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj), + .ptr = queue->render_desc_ringbuf.addr.dev, + .pos = 0, + }; + + struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0); + + /* Pre-set the heap context on the vertex-tiler/fragment queues. */ + cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr); + cs_heap_set(&b, heap_ctx_addr); + } + + cs_finish(&b); + + assert(cs_is_valid(&b)); + + struct drm_panthor_sync_op syncop = { + .flags = + DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL, + .handle = queue->syncobj_handle, + .timeline_value = 0, + }; + struct drm_panthor_queue_submit qsubmit = { + .queue_index = subqueue, + .stream_size = cs_root_chunk_size(&b), + .stream_addr = cs_root_chunk_gpu_addr(&b), + .latest_flush = 0, + .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop), + }; + struct drm_panthor_group_submit gsubmit = { + .group_handle = queue->group_handle, + .queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit), + }; + + int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit); + if (ret) + return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED, + "Failed to initialize subqueue: %m"); + + ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0, + NULL); + if (ret) + return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED, + "SyncobjWait failed: %m"); + + if (debug & PANVK_DEBUG_TRACE) { + uint32_t regs[256] = {0}; + + pandecode_cs(dev->debug.decode_ctx, qsubmit.stream_addr, + qsubmit.stream_size, phys_dev->kmod.props.gpu_prod_id, regs); + pandecode_next_frame(dev->debug.decode_ctx); + } + + return VK_SUCCESS; +} + +static void +cleanup_queue(struct panvk_queue *queue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) + panvk_pool_free_mem(&dev->mempools.rw, queue->subqueues[i].context); + + finish_render_desc_ringbuf(queue); + + panvk_pool_free_mem(&dev->mempools.rw, queue->debug_syncobjs); + panvk_pool_free_mem(&dev->mempools.rw, queue->syncobjs); +} + +static VkResult +init_queue(struct panvk_queue *queue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(queue->vk.base.device->physical); + struct panvk_instance *instance = + to_panvk_instance(dev->vk.physical->instance); + VkResult result; + + struct panvk_pool_alloc_info alloc_info = { + .size = + ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT, + .alignment = 64, + }; + + queue->syncobjs =
panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + if (!panvk_priv_mem_host_addr(queue->syncobjs)) + return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate subqueue sync objects"); + + if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) { + alloc_info.size = + ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT, + queue->debug_syncobjs = + panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info); + if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) { + result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate subqueue sync objects"); + goto err_cleanup_queue; + } + } + + result = init_render_desc_ringbuf(queue); + if (result != VK_SUCCESS) + goto err_cleanup_queue; + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + result = init_subqueue(queue, i); + if (result != VK_SUCCESS) + goto err_cleanup_queue; + } + + return VK_SUCCESS; + +err_cleanup_queue: + cleanup_queue(queue); + return result; +} + +static VkResult +create_group(struct panvk_queue *queue) +{ + const struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(queue->vk.base.device->physical); + + struct drm_panthor_queue_create qc[] = { + [PANVK_SUBQUEUE_VERTEX_TILER] = + { + .priority = 1, + .ringbuf_size = 64 * 1024, + }, + [PANVK_SUBQUEUE_FRAGMENT] = + { + .priority = 1, + .ringbuf_size = 64 * 1024, + }, + [PANVK_SUBQUEUE_COMPUTE] = + { + .priority = 1, + .ringbuf_size = 64 * 1024, + }, + }; + + struct drm_panthor_group_create gc = { + .compute_core_mask = phys_dev->kmod.props.shader_present, + .fragment_core_mask = phys_dev->kmod.props.shader_present, + .tiler_core_mask = 1, + .max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present), + .max_fragment_cores = + util_bitcount64(phys_dev->kmod.props.shader_present), + .max_tiler_cores = 1, + .priority = PANTHOR_GROUP_PRIORITY_MEDIUM, + .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc), + .vm_id = pan_kmod_vm_handle(dev->kmod.vm), + }; + + int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc); + if (ret) + return vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to create a scheduling group"); + + queue->group_handle = gc.group_handle; + return VK_SUCCESS; +} + +static void +destroy_group(struct panvk_queue *queue) +{ + const struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + struct drm_panthor_group_destroy gd = { + .group_handle = queue->group_handle, + }; + + ASSERTED int ret = + drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd); + assert(!ret); +} + +static VkResult +init_tiler(struct panvk_queue *queue) +{ + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(queue->vk.base.device->physical); + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap; + VkResult result; + + /* We allocate the tiler heap descriptor and geometry buffer in one go, + * so we can pass it through a single 64-bit register to the VERTEX_TILER + * command streams. 
*/ + struct panvk_pool_alloc_info alloc_info = { + .size = (64 * 1024) + 4096, + .alignment = 4096, + }; + + tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + if (!panvk_priv_mem_host_addr(tiler_heap->desc)) { + result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to create a tiler heap context"); + goto err_free_desc; + } + + tiler_heap->chunk_size = 2 * 1024 * 1024; + + struct drm_panthor_tiler_heap_create thc = { + .vm_id = pan_kmod_vm_handle(dev->kmod.vm), + .chunk_size = tiler_heap->chunk_size, + .initial_chunk_count = 5, + .max_chunks = 64, + .target_in_flight = 65535, + }; + + int ret = + drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc); + if (ret) { + result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to create a tiler heap context"); + goto err_free_desc; + } + + tiler_heap->context.handle = thc.handle; + tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va; + + pan_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP, cfg) { + cfg.size = tiler_heap->chunk_size; + cfg.base = thc.first_heap_chunk_gpu_va; + cfg.bottom = cfg.base + 64; + cfg.top = cfg.base + cfg.size; + } + + return VK_SUCCESS; + +err_free_desc: + panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc); + return result; +} + +static void +cleanup_tiler(struct panvk_queue *queue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap; + struct drm_panthor_tiler_heap_destroy thd = { + .handle = tiler_heap->context.handle, + }; + ASSERTED int ret = + drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd); + assert(!ret); + + panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc); +} + +static VkResult +panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit) +{ + struct panvk_queue *queue = container_of(vk_queue, struct panvk_queue, vk); + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(queue->vk.base.device->physical); + VkResult result = VK_SUCCESS; + int ret; + + if (vk_queue_is_lost(&queue->vk)) + return VK_ERROR_DEVICE_LOST; + + struct panvk_instance *instance = + to_panvk_instance(dev->vk.physical->instance); + unsigned debug = instance->debug_flags; + bool force_sync = debug & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC); + uint32_t qsubmit_count = 0; + uint32_t used_queue_mask = 0; + for (uint32_t i = 0; i < submit->command_buffer_count; i++) { + struct panvk_cmd_buffer *cmdbuf = + container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk); + + for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) { + assert(cs_is_valid(&cmdbuf->state.cs[j].builder)); + if (!cs_is_empty(&cmdbuf->state.cs[j].builder)) { + used_queue_mask |= BITFIELD_BIT(j); + qsubmit_count++; + } + } + } + + /* Synchronize all subqueues if we have no command buffer submitted. */ + if (!qsubmit_count) + used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT); + + /* We add sync-only queue submits to place our wait/signal operations. 
*/ + if (submit->wait_count > 0) + qsubmit_count += util_bitcount(used_queue_mask); + + if (submit->signal_count > 0) + qsubmit_count += util_bitcount(used_queue_mask); + + uint32_t syncop_count = submit->wait_count + util_bitcount(used_queue_mask); + + STACK_ARRAY(struct drm_panthor_queue_submit, qsubmits, qsubmit_count); + STACK_ARRAY(struct drm_panthor_sync_op, syncops, syncop_count); + struct drm_panthor_sync_op *wait_ops = syncops; + struct drm_panthor_sync_op *signal_ops = syncops + submit->wait_count; + + qsubmit_count = 0; + if (submit->wait_count) { + for (uint32_t i = 0; i < submit->wait_count; i++) { + assert(vk_sync_type_is_drm_syncobj(submit->waits[i].sync->type)); + struct vk_drm_syncobj *syncobj = + vk_sync_as_drm_syncobj(submit->waits[i].sync); + + wait_ops[i] = (struct drm_panthor_sync_op){ + .flags = (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE + ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ + : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) | + DRM_PANTHOR_SYNC_OP_WAIT, + .handle = syncobj->syncobj, + .timeline_value = submit->waits[i].wait_value, + }; + } + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + if (used_queue_mask & BITFIELD_BIT(i)) { + qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){ + .queue_index = i, + .syncs = DRM_PANTHOR_OBJ_ARRAY(submit->wait_count, wait_ops), + }; + } + } + } + + for (uint32_t i = 0; i < submit->command_buffer_count; i++) { + struct panvk_cmd_buffer *cmdbuf = + container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk); + + for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) { + if (cs_is_empty(&cmdbuf->state.cs[j].builder)) + continue; + + qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){ + .queue_index = j, + .stream_size = cs_root_chunk_size(&cmdbuf->state.cs[j].builder), + .stream_addr = cs_root_chunk_gpu_addr(&cmdbuf->state.cs[j].builder), + .latest_flush = 0, + }; + } + } + + if (submit->signal_count || force_sync) { + uint32_t signal_op = 0; + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + if (used_queue_mask & BITFIELD_BIT(i)) { + signal_ops[signal_op] = (struct drm_panthor_sync_op){ + .flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ | + DRM_PANTHOR_SYNC_OP_SIGNAL, + .handle = queue->syncobj_handle, + .timeline_value = signal_op + 1, + }; + + qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){ + .queue_index = i, + .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &signal_ops[signal_op++]), + }; + } + } + } + + if (force_sync) { + struct panvk_cs_sync32 *debug_syncs = + panvk_priv_mem_host_addr(queue->debug_syncobjs); + + assert(debug_syncs); + memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT); + } + + struct drm_panthor_group_submit gsubmit = { + .group_handle = queue->group_handle, + .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits), + }; + + ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit); + if (ret) { + result = vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m"); + goto out; + } + + if (submit->signal_count || force_sync) { + if (force_sync) { + uint64_t point = util_bitcount(used_queue_mask); + ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle, + &point, 1, INT64_MAX, + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL); + assert(!ret); + } + + for (uint32_t i = 0; i < submit->signal_count; i++) { + assert(vk_sync_type_is_drm_syncobj(submit->signals[i].sync->type)); + struct vk_drm_syncobj *syncobj = + vk_sync_as_drm_syncobj(submit->signals[i].sync); + + drmSyncobjTransfer(dev->vk.drm_fd, 
syncobj->syncobj, + submit->signals[i].signal_value, + queue->syncobj_handle, 0, 0); + } + + drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1); + } + + if (debug & PANVK_DEBUG_TRACE) { + for (uint32_t i = 0; i < qsubmit_count; i++) { + if (!qsubmits[i].stream_size) + continue; + + uint32_t subqueue = qsubmits[i].queue_index; + uint32_t regs[256] = {0}; + uint64_t ctx = + panvk_priv_mem_dev_addr(queue->subqueues[subqueue].context); + + regs[PANVK_CS_REG_SUBQUEUE_CTX_START] = ctx; + regs[PANVK_CS_REG_SUBQUEUE_CTX_START + 1] = ctx >> 32; + + simple_mtx_lock(&dev->debug.decode_ctx->lock); + pandecode_dump_file_open(dev->debug.decode_ctx); + pandecode_log(dev->debug.decode_ctx, "CS%d\n", + qsubmits[i].queue_index); + simple_mtx_unlock(&dev->debug.decode_ctx->lock); + pandecode_cs(dev->debug.decode_ctx, qsubmits[i].stream_addr, + qsubmits[i].stream_size, phys_dev->kmod.props.gpu_prod_id, + regs); + } + } + + if (debug & PANVK_DEBUG_DUMP) + pandecode_dump_mappings(dev->debug.decode_ctx); + + if (force_sync) { + struct panvk_cs_sync32 *debug_syncs = + panvk_priv_mem_host_addr(queue->debug_syncobjs); + uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0}; + + for (uint32_t i = 0; i < qsubmit_count; i++) { + if (qsubmits[i].stream_size) + debug_sync_points[qsubmits[i].queue_index]++; + } + + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) { + if (debug_syncs[i].seqno != debug_sync_points[i] || + debug_syncs[i].error != 0) + assert(!"Incomplete job or timeout\n"); + } + } + + if (debug & PANVK_DEBUG_TRACE) + pandecode_next_frame(dev->debug.decode_ctx); + +out: + STACK_ARRAY_FINISH(syncops); + STACK_ARRAY_FINISH(qsubmits); + return result; +} + +VkResult +panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue, + int idx, const VkDeviceQueueCreateInfo *create_info) +{ + const struct panvk_physical_device *phys_dev = + to_panvk_physical_device(dev->vk.physical); + + VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx); + if (result != VK_SUCCESS) + return result; + + int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle); + if (ret) { + result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to create our internal sync object"); + goto err_finish_queue; + } + + result = init_tiler(queue); + if (result != VK_SUCCESS) + goto err_destroy_syncobj; + + result = create_group(queue); + if (result != VK_SUCCESS) + goto err_cleanup_tiler; + + result = init_queue(queue); + if (result != VK_SUCCESS) + goto err_destroy_group; + + queue->vk.driver_submit = panvk_queue_submit; + return VK_SUCCESS; + +err_destroy_group: + destroy_group(queue); + +err_cleanup_tiler: + cleanup_tiler(queue); + +err_destroy_syncobj: + drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle); + +err_finish_queue: + vk_queue_finish(&queue->vk); + return result; +} + +void +panvk_per_arch(queue_finish)(struct panvk_queue *queue) +{ + struct panvk_device *dev = to_panvk_device(queue->vk.base.device); + + destroy_group(queue); + cleanup_tiler(queue); + drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle); + vk_queue_finish(&queue->vk); +} diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build index da7d318395b..d296e3d8a96 100644 --- a/src/panfrost/vulkan/meson.build +++ b/src/panfrost/vulkan/meson.build @@ -57,6 +57,15 @@ jm_files = [ 'jm/panvk_vX_queue.c', ] +csf_archs = [10] +csf_inc_dir = ['csf'] +csf_files = [ + 'csf/panvk_vX_cmd_buffer.c', + 'csf/panvk_vX_cmd_dispatch.c', + 'csf/panvk_vX_cmd_draw.c', + 
'csf/panvk_vX_queue.c', +] + common_per_arch_files = [ panvk_entrypoints[0], 'panvk_vX_blend.c', @@ -74,21 +83,7 @@ common_per_arch_files = [ ] foreach arch : [6, 7, 10] - if arch in valhall_archs - # We're lacking key components to compile common source files for v9/v10. - # Just add the v10 entrypoints for now. - per_arch_files = [ - panvk_entrypoints[0], - 'panvk_vX_blend.c', - 'panvk_vX_descriptor_set.c', - 'panvk_vX_descriptor_set_layout.c', - 'panvk_vX_nir_lower_descriptors.c', - 'panvk_vX_shader.c', - ] - else - per_arch_files = common_per_arch_files - endif - + per_arch_files = common_per_arch_files inc_panvk_per_arch = [] if arch in bifrost_archs @@ -102,6 +97,9 @@ foreach arch : [6, 7, 10] if arch in jm_archs inc_panvk_per_arch += jm_inc_dir per_arch_files += jm_files + elif arch in csf_archs + inc_panvk_per_arch += csf_inc_dir + per_arch_files += csf_files endif panvk_per_arch_libs += static_library( diff --git a/src/panfrost/vulkan/panvk_instance.c b/src/panfrost/vulkan/panvk_instance.c index d9c53f0480f..05132fbbeee 100644 --- a/src/panfrost/vulkan/panvk_instance.c +++ b/src/panfrost/vulkan/panvk_instance.c @@ -36,6 +36,7 @@ static const struct debug_control panvk_debug_options[] = { {"linear", PANVK_DEBUG_LINEAR}, {"dump", PANVK_DEBUG_DUMP}, {"no_known_warn", PANVK_DEBUG_NO_KNOWN_WARN}, + {"cs", PANVK_DEBUG_CS}, {NULL, 0}}; VKAPI_ATTR VkResult VKAPI_CALL diff --git a/src/panfrost/vulkan/panvk_instance.h b/src/panfrost/vulkan/panvk_instance.h index 55576b8f436..e386deedfd3 100644 --- a/src/panfrost/vulkan/panvk_instance.h +++ b/src/panfrost/vulkan/panvk_instance.h @@ -21,6 +21,7 @@ enum panvk_debug_flags { PANVK_DEBUG_LINEAR = 1 << 5, PANVK_DEBUG_DUMP = 1 << 6, PANVK_DEBUG_NO_KNOWN_WARN = 1 << 7, + PANVK_DEBUG_CS = 1 << 8, }; #if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index e3e85a9c588..27b87f5aa8b 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -29,11 +29,7 @@ #include "genxml/gen_macros.h" -/* FIXME: make the include statement unconditional when the CSF command buffer - * logic is implemented. */ -#if PAN_ARCH <= 7 #include "panvk_cmd_buffer.h" -#endif #include "panvk_device.h" #include "panvk_instance.h" #include "panvk_mempool.h" @@ -1333,9 +1329,6 @@ static const struct vk_shader_ops panvk_shader_ops = { panvk_shader_get_executable_internal_representations, }; -/* FIXME: make this unconditional when the CSF command buffer logic is - * implemented. */ -#if PAN_ARCH <= 7 static void panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage, struct panvk_shader *shader) @@ -1380,7 +1373,6 @@ panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, panvk_cmd_bind_shader(cmd, stages[i], shader); } } -#endif const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = { .get_nir_options = panvk_get_nir_options, @@ -1390,10 +1382,5 @@ const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = { .compile = panvk_compile_shaders, .deserialize = panvk_deserialize_shader, .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, - -/* FIXME: make the assignment unconditional when the CSF command buffer logic is - * implemented. */ -#if PAN_ARCH <= 7 .cmd_bind_shaders = panvk_cmd_bind_shaders, -#endif };
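
Note on the per-layer FBD copy in issue_fragment_jobs(): the command stream copies the framebuffer descriptors 64 bytes at a time through 16 scratch registers (16 x 32-bit = one 64-byte chunk per load/store pair). The CPU-side equivalent is just a chunked memcpy; the sketch below is only an analogy for readers unfamiliar with the CS builder, and copy_fbd_model is a made-up name, not a driver function.

#include <stdint.h>
#include <string.h>

/* CPU-side picture of the CS copy loop: sixteen 32-bit scratch registers
 * hold one 64-byte chunk per load/store round trip, and the loop walks
 * the framebuffer descriptor in 64-byte steps. */
static void
copy_fbd_model(void *dst, const void *src, uint32_t fbd_sz)
{
   for (uint32_t off = 0; off < fbd_sz; off += 64) {
      uint32_t regs[16];                          /* cs_scratch_reg_tuple(b, 0, 16) */
      memcpy(regs, (const char *)src + off, 64);  /* cs_load_to() + LS wait */
      memcpy((char *)dst + off, regs, 64);        /* cs_store()   + LS wait */
   }
}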
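
Note on the descriptor ring buffer set up by init_render_desc_ringbuf(): the same BO is GPU-mapped twice at consecutive VAs, so a descriptor write that starts near the end of the ring can run past the nominal size and land back at offset 0 without any split-copy handling. The sketch below shows the same trick on the CPU side with POSIX mmap; it is an illustration only, ringbuf_map_twice is a hypothetical helper, and the fd is assumed to already be ftruncate()d to RINGBUF_SIZE.

#include <assert.h>
#include <sys/mman.h>

#define RINGBUF_SIZE (512 * 1024) /* RENDER_DESC_RINGBUF_SIZE */

/* Reserve twice the size, then map the same buffer into both halves so a
 * copy crossing the end of the ring spills into the second mapping. */
static void *
ringbuf_map_twice(int memfd)
{
   void *base = mmap(NULL, RINGBUF_SIZE * 2, PROT_NONE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   assert(base != MAP_FAILED);

   void *lo = mmap(base, RINGBUF_SIZE, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_FIXED, memfd, 0);
   void *hi = mmap((char *)base + RINGBUF_SIZE, RINGBUF_SIZE,
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, memfd, 0);
   assert(lo == base && hi == (char *)base + RINGBUF_SIZE);

   return base;
}

With such a layout the producer only ever advances its position modulo RINGBUF_SIZE and never splits a copy across the wrap point, which mirrors the two PAN_KMOD_VM_OP_TYPE_MAP operations issued on the GPU VA space in the patch.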
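
Note on the geom_buf encoding in init_subqueue(): the geometry buffer address and its size are packed into one 64-bit value, with the size carried in the low 12 bits as a multiple of 4 KiB (possible because the buffer is 4 KiB aligned). A small sketch of the encode/decode follows; encode_geom_buf/decode_geom_buf are hypothetical helpers written to make the bit layout explicit, not driver API.

#include <assert.h>
#include <stdint.h>

/* The buffer is 4 KiB aligned, so the low 12 bits of the address are free
 * to carry the size expressed in 4 KiB units. */
static inline uint64_t
encode_geom_buf(uint64_t addr, uint64_t size)
{
   assert((addr & 0xfff) == 0 && (size & 0xfff) == 0);
   return addr | (size >> 12);
}

static inline void
decode_geom_buf(uint64_t packed, uint64_t *addr, uint64_t *size)
{
   *addr = packed & ~0xfffull;
   *size = (packed & 0xfff) << 12;
}

/* encode_geom_buf(tiler_heap + 4096, 64 * 1024) produces the same bit
 * pattern as the expression used in init_subqueue():
 *   (tiler_heap + 4096) | ((64 * 1024) >> 12)
 */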