panvk: Add a CSF backend for panvk_queue/cmd_buffer

With those two components implemented, we can now compile all common
per-arch source files.

Co-developed-by: Rebecca Mckeever <rebecca.mckeever@collabora.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Rebecca Mckeever <rebecca.mckeever@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30969>
Boris Brezillon authored 2024-08-23 17:44:59 +02:00, committed by Marge Bot
parent 7049d31676
commit 5544d39f44
11 changed files with 4406 additions and 28 deletions


@@ -10,5 +10,10 @@ ForEachMacros: [
'cs_case',
'cs_default',
'cs_match',
'cs_update_compute_ctx',
'cs_update_frag_ctx',
'cs_update_progress_seqno',
'cs_update_vt_ctx',
'cs_while',
'panvk_cs_reg_upd_ctx',
]


@@ -0,0 +1,460 @@
/*
* Copyright © 2024 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_CMD_BUFFER_H
#define PANVK_CMD_BUFFER_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include <stdint.h>
#include "genxml/cs_builder.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_queue.h"
#include "vk_command_buffer.h"
#include "util/list.h"
#define MAX_VBS 16
#define MAX_RTS 8
struct panvk_cs_sync32 {
uint32_t seqno;
uint32_t error;
};
struct panvk_cs_sync64 {
uint64_t seqno;
uint32_t error;
uint32_t pad;
};
struct panvk_cs_desc_ringbuf {
uint64_t syncobj;
uint64_t ptr;
uint32_t pos;
uint32_t pad;
};
/* 512k of render descriptors that can be used when
* VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT is set on the command buffer. */
#define RENDER_DESC_RINGBUF_SIZE (512 * 1024)
struct panvk_cs_subqueue_context {
uint64_t syncobjs;
uint32_t iter_sb;
uint32_t pad;
struct {
struct panvk_cs_desc_ringbuf desc_ringbuf;
uint64_t tiler_heap;
uint64_t geom_buf;
} render;
uint64_t debug_syncobjs;
} __attribute__((aligned(64)));
struct panvk_cache_flush_info {
enum mali_cs_flush_mode l2;
enum mali_cs_flush_mode lsc;
bool others;
};
struct panvk_cs_deps {
bool needs_draw_flush;
struct {
uint32_t wait_sb_mask;
struct panvk_cache_flush_info cache_flush;
} src[PANVK_SUBQUEUE_COUNT];
struct {
uint32_t wait_subqueue_mask;
} dst[PANVK_SUBQUEUE_COUNT];
};
enum panvk_sb_ids {
PANVK_SB_LS = 0,
PANVK_SB_IMM_FLUSH = 0,
PANVK_SB_DEFERRED_SYNC = 1,
PANVK_SB_DEFERRED_FLUSH = 2,
PANVK_SB_ITER_START = 3,
PANVK_SB_ITER_COUNT = 5,
};
#define SB_IMM_MASK 0
#define SB_MASK(nm) BITFIELD_BIT(PANVK_SB_##nm)
#define SB_ID(nm) PANVK_SB_##nm
#define SB_ITER(x) (PANVK_SB_ITER_START + (x))
#define SB_WAIT_ITER(x) BITFIELD_BIT(PANVK_SB_ITER_START + (x))
#define SB_ALL_MASK BITFIELD_MASK(8)
static inline uint32_t
next_iter_sb(uint32_t sb)
{
return sb + 1 < PANVK_SB_ITER_COUNT ? sb + 1 : 0;
}
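/* Illustration (derived from the definitions above, not part of the patch):
 * the iterator index wraps within [0, PANVK_SB_ITER_COUNT), so successive
 * calls walk scoreboard slots SB_ITER(0)..SB_ITER(4) (absolute slots 3..7)
 * and then start over:
 *
 *   uint32_t sb = 0;
 *   for (unsigned i = 0; i < 6; i++)
 *      sb = next_iter_sb(sb);   // 1, 2, 3, 4, 0, 1
 *
 * Slots 0..2 stay reserved for load/store, immediate flushes and deferred
 * sync/flush operations, which is why SB_ALL_MASK covers 8 slots.
 */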
enum panvk_cs_regs {
/* RUN_IDVS staging regs. */
PANVK_CS_REG_RUN_IDVS_SR_START = 0,
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
/* RUN_FRAGMENT staging regs. */
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 40,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
/* RUN_COMPUTE staging regs. */
PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
PANVK_CS_REG_RUN_COMPUTE_SR_END = 39,
/* Range of registers that can be used to store temporary data on
* all queues. Note that some queues have extra space they can use
* as scratch space. */
PANVK_CS_REG_SCRATCH_START = 66,
PANVK_CS_REG_SCRATCH_END = 83,
/* Driver context. */
PANVK_CS_REG_PROGRESS_SEQNO_START = 84,
PANVK_CS_REG_PROGRESS_SEQNO_END = 89,
PANVK_CS_REG_SUBQUEUE_CTX_START = 90,
PANVK_CS_REG_SUBQUEUE_CTX_END = 91,
};
static inline struct cs_index
cs_scratch_reg_tuple(struct cs_builder *b, unsigned start, unsigned count)
{
assert(PANVK_CS_REG_SCRATCH_START + start + count - 1 <=
PANVK_CS_REG_SCRATCH_END);
return cs_reg_tuple(b, PANVK_CS_REG_SCRATCH_START + start, count);
}
static inline struct cs_index
cs_scratch_reg32(struct cs_builder *b, unsigned reg)
{
return cs_scratch_reg_tuple(b, reg, 1);
}
static inline struct cs_index
cs_scratch_reg64(struct cs_builder *b, unsigned reg)
{
assert(reg % 2 == 0);
return cs_scratch_reg_tuple(b, reg, 2);
}
static inline struct cs_index
cs_sr_reg_tuple(struct cs_builder *b, unsigned start, unsigned count)
{
assert(start + count - 1 < PANVK_CS_REG_SCRATCH_START);
return cs_reg_tuple(b, start, count);
}
static inline struct cs_index
cs_sr_reg32(struct cs_builder *b, unsigned reg)
{
return cs_sr_reg_tuple(b, reg, 1);
}
static inline struct cs_index
cs_sr_reg64(struct cs_builder *b, unsigned reg)
{
assert(reg % 2 == 0);
return cs_sr_reg_tuple(b, reg, 2);
}
static inline struct cs_index
cs_subqueue_ctx_reg(struct cs_builder *b)
{
return cs_reg64(b, PANVK_CS_REG_SUBQUEUE_CTX_START);
}
static inline struct cs_index
cs_progress_seqno_reg(struct cs_builder *b, enum panvk_subqueue_id subqueue)
{
assert(PANVK_CS_REG_PROGRESS_SEQNO_START + (subqueue * 2) <
PANVK_CS_REG_PROGRESS_SEQNO_END);
return cs_reg64(b, PANVK_CS_REG_PROGRESS_SEQNO_START + (subqueue * 2));
}
struct panvk_cs_reg_upd_context {
reg_perm_cb_t reg_perm;
struct panvk_cs_reg_upd_context *next;
};
struct panvk_cs_state {
struct cs_builder builder;
struct cs_load_store_tracker ls_tracker;
/* Used to debug register writes in invalid contexts. */
struct {
struct panvk_cs_reg_upd_context *upd_ctx_stack;
reg_perm_cb_t base_perm;
} reg_access;
/* Sync point relative to the beginning of the command buffer.
* Needs to be offset by the subqueue sync point. */
int32_t relative_sync_point;
};
static inline struct panvk_cs_reg_upd_context *
panvk_cs_reg_ctx_push(struct cs_builder *b,
struct panvk_cs_reg_upd_context *ctx,
reg_perm_cb_t reg_perm)
{
struct panvk_cs_state *cs_state =
container_of(b, struct panvk_cs_state, builder);
ctx->reg_perm = reg_perm;
ctx->next = cs_state->reg_access.upd_ctx_stack;
cs_state->reg_access.upd_ctx_stack = ctx;
return ctx;
}
static inline void
panvk_cs_reg_ctx_pop(struct cs_builder *b, struct panvk_cs_reg_upd_context *ctx)
{
struct panvk_cs_state *cs_state =
container_of(b, struct panvk_cs_state, builder);
assert(cs_state->reg_access.upd_ctx_stack == ctx);
cs_state->reg_access.upd_ctx_stack = ctx->next;
}
struct panvk_cs_reg_range {
unsigned start;
unsigned end;
};
#define PANVK_CS_REG_RANGE(__name) \
{ \
.start = PANVK_CS_REG_##__name##_START, \
.end = PANVK_CS_REG_##__name##_END, \
}
#define panvk_cs_reg_blacklist(__name, ...) \
static inline enum cs_reg_perm panvk_cs_##__name##_reg_perm( \
struct cs_builder *b, unsigned reg) \
{ \
const struct panvk_cs_reg_range ranges[] = { \
__VA_ARGS__, \
}; \
\
for (unsigned i = 0; i < ARRAY_SIZE(ranges); i++) { \
if (reg >= ranges[i].start && reg <= ranges[i].end) \
return CS_REG_RD; \
} \
\
return CS_REG_RW; \
}
panvk_cs_reg_blacklist(vt, PANVK_CS_REG_RANGE(RUN_IDVS_SR),
PANVK_CS_REG_RANGE(PROGRESS_SEQNO),
PANVK_CS_REG_RANGE(SUBQUEUE_CTX));
panvk_cs_reg_blacklist(frag, PANVK_CS_REG_RANGE(RUN_FRAGMENT_SR),
PANVK_CS_REG_RANGE(PROGRESS_SEQNO),
PANVK_CS_REG_RANGE(SUBQUEUE_CTX));
panvk_cs_reg_blacklist(compute, PANVK_CS_REG_RANGE(RUN_COMPUTE_SR),
PANVK_CS_REG_RANGE(PROGRESS_SEQNO),
PANVK_CS_REG_RANGE(SUBQUEUE_CTX));
#define panvk_cs_reg_whitelist(__name, ...) \
static inline enum cs_reg_perm panvk_cs_##__name##_reg_perm( \
struct cs_builder *b, unsigned reg) \
{ \
const struct panvk_cs_reg_range ranges[] = { \
__VA_ARGS__, \
}; \
\
for (unsigned i = 0; i < ARRAY_SIZE(ranges); i++) { \
if (reg >= ranges[i].start && reg <= ranges[i].end) \
return CS_REG_RW; \
} \
\
return CS_REG_RD; \
}
#define panvk_cs_reg_upd_ctx(__b, __name) \
for (struct panvk_cs_reg_upd_context __reg_upd_ctx, \
*reg_upd_ctxp = panvk_cs_reg_ctx_push(__b, &__reg_upd_ctx, \
panvk_cs_##__name##_reg_perm); \
reg_upd_ctxp; \
panvk_cs_reg_ctx_pop(__b, &__reg_upd_ctx), reg_upd_ctxp = NULL)
panvk_cs_reg_whitelist(progress_seqno, PANVK_CS_REG_RANGE(PROGRESS_SEQNO));
#define cs_update_progress_seqno(__b) panvk_cs_reg_upd_ctx(__b, progress_seqno)
panvk_cs_reg_whitelist(compute_ctx, PANVK_CS_REG_RANGE(RUN_COMPUTE_SR));
#define cs_update_compute_ctx(__b) panvk_cs_reg_upd_ctx(__b, compute_ctx)
panvk_cs_reg_whitelist(frag_ctx, PANVK_CS_REG_RANGE(RUN_FRAGMENT_SR));
#define cs_update_frag_ctx(__b) panvk_cs_reg_upd_ctx(__b, frag_ctx)
panvk_cs_reg_whitelist(vt_ctx, PANVK_CS_REG_RANGE(RUN_IDVS_SR));
#define cs_update_vt_ctx(__b) panvk_cs_reg_upd_ctx(__b, vt_ctx)
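/* Usage sketch (illustrative, not part of the patch): writes to a
 * blacklisted register range are only accepted inside the matching update
 * context, e.g.
 *
 *   cs_update_vt_ctx(b) {
 *      cs_move64_to(b, cs_sr_reg64(b, 0), res_table);
 *   }
 *
 * Outside such a context, the per-subqueue base permission callback reports
 * those registers as read-only, which the debug register-access checker in
 * cs_builder can then flag.
 */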
struct panvk_tls_state {
struct panfrost_ptr desc;
struct pan_tls_info info;
unsigned max_wg_count;
};
struct panvk_cmd_compute_state {
struct panvk_descriptor_state desc_state;
const struct panvk_shader *shader;
struct panvk_compute_sysvals sysvals;
mali_ptr push_uniforms;
struct {
struct panvk_shader_desc_state desc;
} cs;
};
struct panvk_attrib_buf {
mali_ptr address;
unsigned size;
};
struct panvk_resolve_attachment {
VkResolveModeFlagBits mode;
struct panvk_image_view *src_iview;
struct panvk_image_view *dst_iview;
};
struct panvk_cmd_graphics_state {
struct panvk_descriptor_state desc_state;
struct {
struct vk_vertex_input_state vi;
struct vk_sample_locations_state sl;
} dynamic;
struct panvk_graphics_sysvals sysvals;
struct panvk_shader_link link;
bool linked;
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
mali_ptr spd;
} fs;
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
struct {
mali_ptr pos, var;
} spds;
} vs;
struct {
struct panvk_attrib_buf bufs[MAX_VBS];
unsigned count;
bool dirty;
} vb;
/* Index buffer */
struct {
struct panvk_buffer *buffer;
uint64_t offset;
uint8_t index_size;
uint32_t first_vertex, base_vertex, base_instance;
bool dirty;
} ib;
struct {
struct panvk_blend_info info;
} cb;
struct {
VkRenderingFlags flags;
uint32_t layer_count;
enum vk_rp_attachment_flags bound_attachments;
struct {
VkFormat fmts[MAX_RTS];
uint8_t samples[MAX_RTS];
struct panvk_resolve_attachment resolve[MAX_RTS];
} color_attachments;
struct pan_image_view zs_pview;
struct {
struct panvk_resolve_attachment resolve;
} z_attachment, s_attachment;
struct {
struct pan_fb_info info;
bool crc_valid[MAX_RTS];
} fb;
struct panfrost_ptr fbds;
mali_ptr tiler;
bool dirty;
} render;
mali_ptr push_uniforms;
};
struct panvk_cmd_buffer {
struct vk_command_buffer vk;
VkCommandBufferUsageFlags flags;
struct panvk_pool cs_pool;
struct panvk_pool desc_pool;
struct panvk_pool tls_pool;
struct list_head push_sets;
uint32_t flush_id;
struct {
struct panvk_cmd_graphics_state gfx;
struct panvk_cmd_compute_state compute;
struct panvk_push_constant_state push_constants;
struct panvk_cs_state cs[PANVK_SUBQUEUE_COUNT];
struct panvk_tls_state tls;
} state;
};
VK_DEFINE_HANDLE_CASTS(panvk_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
static inline struct cs_builder *
panvk_get_cs_builder(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
return &cmdbuf->state.cs[subqueue].builder;
}
static inline struct panvk_descriptor_state *
panvk_cmd_get_desc_state(struct panvk_cmd_buffer *cmdbuf,
VkPipelineBindPoint bindpoint)
{
switch (bindpoint) {
case VK_PIPELINE_BIND_POINT_GRAPHICS:
return &cmdbuf->state.gfx.desc_state;
case VK_PIPELINE_BIND_POINT_COMPUTE:
return &cmdbuf->state.compute.desc_state;
default:
assert(!"Unsupported bind point");
return NULL;
}
}
extern const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops);
void panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf);
void panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
enum panvk_subqueue_id subqueue);
void panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
const VkDependencyInfo *in,
struct panvk_cs_deps *out);
#endif /* PANVK_CMD_BUFFER_H */


@@ -0,0 +1,72 @@
/*
* Copyright © 2021 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_QUEUE_H
#define PANVK_QUEUE_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include "genxml/gen_macros.h"
#include <stdint.h>
#include "panvk_device.h"
#include "vk_queue.h"
enum panvk_subqueue_id {
PANVK_SUBQUEUE_VERTEX_TILER = 0,
PANVK_SUBQUEUE_FRAGMENT,
PANVK_SUBQUEUE_COMPUTE,
PANVK_SUBQUEUE_COUNT,
};
struct panvk_tiler_heap {
uint32_t chunk_size;
struct panvk_priv_mem desc;
struct {
uint32_t handle;
mali_ptr dev_addr;
} context;
};
struct panvk_subqueue {
struct panvk_priv_mem context;
};
struct panvk_desc_ringbuf {
struct panvk_priv_mem syncobj;
struct pan_kmod_bo *bo;
struct {
uint64_t dev;
void *host;
} addr;
};
struct panvk_queue {
struct vk_queue vk;
uint32_t group_handle;
uint32_t syncobj_handle;
struct panvk_tiler_heap tiler_heap;
struct panvk_desc_ringbuf render_desc_ringbuf;
struct panvk_priv_mem syncobjs;
struct panvk_priv_mem debug_syncobjs;
struct panvk_subqueue subqueues[PANVK_SUBQUEUE_COUNT];
};
VK_DEFINE_HANDLE_CASTS(panvk_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
void panvk_per_arch(queue_finish)(struct panvk_queue *queue);
VkResult panvk_per_arch(queue_init)(struct panvk_device *device,
struct panvk_queue *queue, int idx,
const VkDeviceQueueCreateInfo *create_info);
#endif


@@ -0,0 +1,703 @@
/*
* Copyright © 2021 Collabora Ltd.
*
* Derived from tu_cmd_buffer.c which is:
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "genxml/gen_macros.h"
#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "pan_blitter.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"
#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"
static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
unsigned core_id_range;
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
if (cmdbuf->state.tls.info.tls.size) {
unsigned thread_tls_alloc =
panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
unsigned size = panfrost_get_total_stack_size(
cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);
cmdbuf->state.tls.info.tls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
}
assert(!cmdbuf->state.tls.info.wls.size);
if (cmdbuf->state.tls.desc.cpu) {
GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
}
}
static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
cs_update_progress_seqno(b) {
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
uint32_t rel_sync_point = cmdbuf->state.cs[i].relative_sync_point;
if (!rel_sync_point)
continue;
cs_add64(b, cs_progress_seqno_reg(b, i), cs_progress_seqno_reg(b, i),
rel_sync_point);
}
}
/* If we're decoding the CS or dumping memory mappings, we need a flush
* to make sure all data have been pushed to memory. */
if (instance->debug_flags & (PANVK_DEBUG_DUMP | PANVK_DEBUG_TRACE)) {
struct cs_index flush_id = cs_scratch_reg32(b, 0);
cs_move32_to(b, flush_id, 0);
cs_wait_slots(b, SB_ALL_MASK, false);
cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
}
/* If we're in sync/trace mode, we signal the debug object. */
if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
struct cs_index one = cs_scratch_reg32(b, 2);
struct cs_index error = cs_scratch_reg32(b, 3);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);
cs_move32_to(b, one, 1);
cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, debug_syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, debug_sync_addr, debug_sync_addr,
sizeof(struct panvk_cs_sync32) * subqueue);
cs_load32_to(b, error, debug_sync_addr,
offsetof(struct panvk_cs_sync32, error));
cs_wait_slots(b, SB_ALL_MASK, false);
cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_SYSTEM, one, debug_sync_addr,
cs_now());
cs_match(b, error, cmp_scratch) {
cs_case(b, 0) {
/* Do nothing. */
}
cs_default(b) {
/* Overwrite the sync error with the first error we encountered. */
cs_store32(b, error, debug_sync_addr,
offsetof(struct panvk_cs_sync32, error));
cs_wait_slot(b, SB_ID(LS), false);
}
}
}
cs_finish(&cmdbuf->state.cs[subqueue].builder);
}
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
emit_tls(cmdbuf);
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
struct cs_builder *b = &cmdbuf->state.cs[i].builder;
if (!cs_is_valid(b)) {
vk_command_buffer_set_error(&cmdbuf->vk,
VK_ERROR_OUT_OF_DEVICE_MEMORY);
} else {
finish_cs(cmdbuf, i);
}
}
return vk_command_buffer_end(&cmdbuf->vk);
}
static bool
src_stages_need_draw_flush(VkPipelineStageFlags2 stages)
{
static const VkPipelineStageFlags2 draw_flush_stage_mask =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;
return (stages & draw_flush_stage_mask) != 0;
}
static bool
stages_cover_subqueue(enum panvk_subqueue_id subqueue,
VkPipelineStageFlags2 stages)
{
static const VkPipelineStageFlags2 queue_coverage[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
[PANVK_SUBQUEUE_FRAGMENT] =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT,
[PANVK_SUBQUEUE_COMPUTE] =
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_COPY_BIT,
};
return (stages & queue_coverage[subqueue]) != 0;
}
static uint32_t
src_stages_to_subqueue_sb_mask(enum panvk_subqueue_id subqueue,
VkPipelineStageFlags2 stages)
{
if (!stages_cover_subqueue(subqueue, stages))
return 0;
/* Indirect draw buffers are read from the command stream, and load/store
* operations are synchronized with the LS scoreboard immediately after the
* read, so no need to wait in that case.
*/
if (subqueue == PANVK_SUBQUEUE_VERTEX_TILER &&
stages == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
return 0;
/* We need to wait for all previously submitted jobs, and given the
* iterator scoreboard is a moving target, we just wait for the
* whole dynamic scoreboard range. */
return BITFIELD_RANGE(PANVK_SB_ITER_START, PANVK_SB_ITER_COUNT);
}
static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
struct panvk_cache_flush_info *cache_flush,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
static const VkAccessFlags2 dev_writes[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
VK_ACCESS_2_SHADER_WRITE_BIT |
VK_ACCESS_2_TRANSFER_WRITE_BIT,
[PANVK_SUBQUEUE_FRAGMENT] =
VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_SHADER_WRITE_BIT |
VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_2_TRANSFER_WRITE_BIT,
[PANVK_SUBQUEUE_COMPUTE] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
VK_ACCESS_2_SHADER_WRITE_BIT |
VK_ACCESS_2_TRANSFER_WRITE_BIT,
};
static const VkAccessFlags2 dev_reads[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] =
VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_INDEX_READ_BIT |
VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_UNIFORM_READ_BIT |
VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_TRANSFER_READ_BIT |
VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
[PANVK_SUBQUEUE_FRAGMENT] =
VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
[PANVK_SUBQUEUE_COMPUTE] =
VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
};
/* Note on the cache organization:
* - L2 cache is unified, so all changes to this cache are automatically
* visible to all GPU sub-components (shader cores, tiler, ...). This
* means we only need to flush when the host (AKA CPU) is involved.
* - LS caches (which are basically just read-write L1 caches) are coherent
* with each other and with the L2 cache, so again, we only need to flush
* when the host is involved.
* - Other read-only L1 caches (like the ones in front of the texture unit)
* are not coherent with the LS or L2 caches, and thus need to be
* invalidated any time a write happens.
*/
#define ACCESS_HITS_RO_L1_CACHE \
(VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | \
VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT | \
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | \
VK_ACCESS_2_TRANSFER_READ_BIT)
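/* Worked example (illustrative): a shader storage write whose result is
 * later read through the texture unit only needs cache_flush->others set
 * (invalidate the read-only L1 caches); l2/lsc can stay at
 * MALI_CS_FLUSH_MODE_NONE since those caches are coherent. Only host
 * writes/reads force the clean/invalidate paths below.
 */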
if ((dev_writes[subqueue] & src_access) &&
(dev_reads[subqueue] & ACCESS_HITS_RO_L1_CACHE & dst_access))
cache_flush->others |= true;
/* If the host wrote something, we need to clean/invalidate everything. */
if ((src_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
(src_access & VK_ACCESS_2_HOST_WRITE_BIT) &&
((dev_reads[subqueue] | dev_writes[subqueue]) & dst_access)) {
cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
cache_flush->others |= true;
}
/* If the host needs to read something we wrote, we need to clean
* everything. */
if ((dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
(dst_access & VK_ACCESS_2_HOST_READ_BIT) &&
(dev_writes[subqueue] & src_access)) {
cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
}
}
static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages, VkAccessFlags src_access,
VkAccessFlags dst_access, struct panvk_cs_deps *deps)
{
if (src_stages_need_draw_flush(src_stages) && cmdbuf->state.gfx.render.tiler)
deps->needs_draw_flush = true;
uint32_t wait_subqueue_mask = 0;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
uint32_t sb_mask = src_stages_to_subqueue_sb_mask(i, src_stages);
assert((sb_mask != 0) == stages_cover_subqueue(i, src_stages));
if (!sb_mask)
continue;
deps->src[i].wait_sb_mask |= sb_mask;
collect_cache_flush_info(i, &deps->src[i].cache_flush, src_stages,
dst_stages, src_access, dst_access);
wait_subqueue_mask |= BITFIELD_BIT(i);
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!stages_cover_subqueue(i, dst_stages))
continue;
deps->dst[i].wait_subqueue_mask |= wait_subqueue_mask & ~BITFIELD_BIT(i);
}
}
void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
const VkDependencyInfo *in,
struct panvk_cs_deps *out)
{
memset(out, 0, sizeof(*out));
for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
VkPipelineStageFlags2 src_stages =
vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
VkPipelineStageFlags2 dst_stages =
vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
VkAccessFlags2 src_access =
vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
VkAccessFlags2 dst_access =
vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);
collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
out);
}
for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
VkPipelineStageFlags2 src_stages =
vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
VkPipelineStageFlags2 dst_stages =
vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
VkAccessFlags2 src_access =
vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
VkAccessFlags2 dst_access =
vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);
collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
out);
}
for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
VkPipelineStageFlags2 src_stages =
vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
VkPipelineStageFlags2 dst_stages =
vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
VkAccessFlags2 src_access =
vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
VkAccessFlags2 dst_access =
vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);
collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
out);
}
/* The draw flush will add a vertex -> fragment dependency, so we can skip
* the one described in the deps. */
if (out->needs_draw_flush)
out->dst[PANVK_SUBQUEUE_FRAGMENT].wait_subqueue_mask &=
~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
const VkDependencyInfo *pDependencyInfo)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_cs_deps deps;
panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps);
if (deps.needs_draw_flush)
panvk_per_arch(cmd_flush_draws)(cmdbuf);
uint32_t wait_subqueue_mask = 0;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!deps.src[i].wait_sb_mask)
continue;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];
cs_wait_slots(b, deps.src[i].wait_sb_mask, false);
struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
struct cs_index flush_id = cs_scratch_reg32(b, 0);
cs_move32_to(b, flush_id, 0);
cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others,
flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
}
/* If no one waits on us, there's no point signaling the sync object. */
if (wait_subqueue_mask & BITFIELD_BIT(i)) {
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index add_val = cs_scratch_reg64(b, 2);
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
cs_move64_to(b, add_val, 1);
cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
cs_now());
++cs_state->relative_sync_point;
}
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!deps.dst[i].wait_subqueue_mask)
continue;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
if (!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(j)))
continue;
struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index wait_val = cs_scratch_reg64(b, 2);
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);
cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
cs_state->relative_sync_point);
cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
sync_addr);
}
}
}
void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
enum panvk_subqueue_id subqueue)
{
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
struct cs_index iter_sb = cs_scratch_reg32(b, 0);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);
cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS), false);
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \
cs_case(b, x) { \
cs_wait_slot(b, SB_ITER(x), false); \
cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS)); \
}
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
#undef CASE
}
}
static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
struct panvk_cmd_buffer *cmdbuf = cookie;
const unsigned capacity = 64 * 1024 / sizeof(uint64_t);
struct panfrost_ptr ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);
return (struct cs_buffer){
.cpu = ptr.cpu,
.gpu = ptr.gpu,
.capacity = capacity,
};
}
static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
struct panvk_cs_state *cs_state =
container_of(b, struct panvk_cs_state, builder);
struct panvk_cs_reg_upd_context *upd_ctx;
for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
upd_ctx = upd_ctx->next) {
if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
return CS_REG_RW;
}
return cs_state->reg_access.base_perm(b, reg);
}
static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
[PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
[PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
};
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
/* Lazy allocation of the root CS. */
struct cs_buffer root_cs = {0};
struct cs_builder_conf conf = {
.nr_registers = 96,
.nr_kernel_registers = 4,
.alloc_buffer = alloc_cs_buffer,
.cookie = cmdbuf,
};
if (instance->debug_flags & PANVK_DEBUG_CS) {
cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
.sb_slot = SB_ID(LS),
};
conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;
cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
conf.reg_perm = cs_reg_perm;
}
cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs);
}
}
static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
VkCommandBufferResetFlags flags)
{
struct panvk_cmd_buffer *cmdbuf =
container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
struct panvk_cmd_pool *pool =
container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
vk_command_buffer_reset(&cmdbuf->vk);
panvk_pool_reset(&cmdbuf->cs_pool);
panvk_pool_reset(&cmdbuf->desc_pool);
panvk_pool_reset(&cmdbuf->tls_pool);
list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
list_inithead(&cmdbuf->push_sets);
memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
init_cs_builders(cmdbuf);
}
static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
struct panvk_cmd_buffer *cmdbuf =
container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
struct panvk_cmd_pool *pool =
container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
panvk_pool_cleanup(&cmdbuf->cs_pool);
panvk_pool_cleanup(&cmdbuf->desc_pool);
panvk_pool_cleanup(&cmdbuf->tls_pool);
list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
vk_command_buffer_finish(&cmdbuf->vk);
vk_free(&dev->vk.alloc, cmdbuf);
}
static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
struct vk_command_buffer **cmdbuf_out)
{
struct panvk_device *device =
container_of(vk_pool->base.device, struct panvk_device, vk);
struct panvk_cmd_pool *pool =
container_of(vk_pool, struct panvk_cmd_pool, vk);
struct panvk_cmd_buffer *cmdbuf;
cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!cmdbuf)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
VkResult result = vk_command_buffer_init(
&pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
if (result != VK_SUCCESS) {
vk_free(&device->vk.alloc, cmdbuf);
return result;
}
list_inithead(&cmdbuf->push_sets);
cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
&cmdbuf->state.gfx.dynamic.sl;
struct panvk_pool_properties cs_pool_props = {
.create_flags = 0,
.slab_size = 64 * 1024,
.label = "Command buffer CS pool",
.prealloc = false,
.owns_bos = true,
.needs_locking = false,
};
panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props);
struct panvk_pool_properties desc_pool_props = {
.create_flags = 0,
.slab_size = 64 * 1024,
.label = "Command buffer descriptor pool",
.prealloc = false,
.owns_bos = true,
.needs_locking = false,
};
panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
&desc_pool_props);
struct panvk_pool_properties tls_pool_props = {
.create_flags =
panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
.slab_size = 64 * 1024,
.label = "TLS pool",
.prealloc = false,
.owns_bos = true,
.needs_locking = false,
};
panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
&tls_pool_props);
init_cs_builders(cmdbuf);
*cmdbuf_out = &cmdbuf->vk;
return VK_SUCCESS;
}
const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
.create = panvk_create_cmdbuf,
.reset = panvk_reset_cmdbuf,
.destroy = panvk_destroy_cmdbuf,
};
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
cmdbuf->flags = pBeginInfo->flags;
/* The descriptor ringbuf trips out pandecode because we always point to the
* next tiler/framebuffer descriptor after CS execution, which means we're
* decoding an uninitialized or stale descriptor.
* FIXME: find a way to trace the simultaneous path that doesn't crash. One
* option would be to disable CS interpretation and dump the RUN_xxx context
* on the side at execution time.
*/
if (instance->debug_flags & PANVK_DEBUG_TRACE)
cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
return VK_SUCCESS;
}


@@ -0,0 +1,285 @@
/*
* Copyright © 2024 Collabora Ltd.
*
* Derived from tu_cmd_buffer.c which is:
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* Copyright © 2015 Intel Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "genxml/gen_macros.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include <vulkan/vulkan_core.h>
static VkResult
prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
if (cs_desc_state->driver_set.dev_addr)
return VK_SUCCESS;
const struct panvk_descriptor_state *desc_state =
&cmdbuf->state.compute.desc_state;
const struct panvk_shader *cs = cmdbuf->state.compute.shader;
uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1;
struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
struct panvk_opaque_desc *descs = driver_set.cpu;
if (!driver_set.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
/* Dummy sampler always comes first. */
pan_pack(&descs[0], SAMPLER, _) {
}
panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs,
(struct mali_buffer_packed *)(&descs[1]));
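/* Resulting driver-set layout: descriptor 0 is the dummy sampler, and
 * descriptors 1..dyn_bufs.count hold the dynamic buffer descriptors, which
 * is why desc_count is dyn_bufs.count + 1. */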
cs_desc_state->driver_set.dev_addr = driver_set.gpu;
cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
return VK_SUCCESS;
}
static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
cmdbuf->state.compute.push_uniforms = panvk_per_arch(
cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.compute.sysvals,
sizeof(cmdbuf->state.compute.sysvals));
return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS
: VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
uint32_t baseGroupX, uint32_t baseGroupY,
uint32_t baseGroupZ, uint32_t groupCountX,
uint32_t groupCountY, uint32_t groupCountZ)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
const struct panvk_shader *shader = cmdbuf->state.compute.shader;
VkResult result;
/* If there's no compute shader, we can skip the dispatch. */
if (!panvk_priv_mem_dev_addr(shader->spd))
return;
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_descriptor_state *desc_state =
&cmdbuf->state.compute.desc_state;
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!tsd.gpu)
return;
struct pan_tls_info tlsinfo = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
};
unsigned core_id_range;
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
if (tlsinfo.wls.size) {
/* TODO: Clamp WLS instance to some maximum WLS budget. */
struct pan_compute_dim dim = {groupCountX, groupCountY, groupCountZ};
tlsinfo.wls.instances = pan_wls_instances(&dim);
unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
tlsinfo.wls.instances * core_id_range;
tlsinfo.wls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
if (!tlsinfo.wls.ptr)
return;
}
cmdbuf->state.tls.info.tls.size =
MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size);
if (!cmdbuf->state.tls.desc.gpu) {
cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!cmdbuf->state.tls.desc.gpu)
return;
}
GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);
result = panvk_per_arch(cmd_prepare_push_descs)(
cmdbuf, desc_state, shader->desc_info.used_set_mask);
if (result != VK_SUCCESS)
return;
struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals;
sysvals->num_work_groups.x = groupCountX;
sysvals->num_work_groups.y = groupCountY;
sysvals->num_work_groups.z = groupCountZ;
sysvals->local_group_size.x = shader->local_size.x;
sysvals->local_group_size.y = shader->local_size.y;
sysvals->local_group_size.z = shader->local_size.z;
result = prepare_driver_set(cmdbuf);
if (result != VK_SUCCESS)
return;
cmdbuf->state.compute.push_uniforms = 0;
result = prepare_push_uniforms(cmdbuf);
if (result != VK_SUCCESS)
return;
result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
shader, cs_desc_state);
if (result != VK_SUCCESS)
return;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
unsigned task_axis = MALI_TASK_AXIS_X;
unsigned task_increment = 0;
/* Copy the global TLS pointer to the per-job TSD. */
cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu);
cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS), false);
cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS), false);
cs_update_compute_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);
uint32_t push_size = 256 + sizeof(struct panvk_compute_sysvals);
uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
mali_ptr fau_ptr =
cmdbuf->state.compute.push_uniforms | (fau_count << 56);
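/* Note (interpretation of the packing above, not spelled out in the patch):
 * the FAU pointer carries the number of 64-bit fast-access-uniform words in
 * its top byte (fau_count << 56) and the push-uniform buffer address in the
 * lower bits, so a single 64-bit move describes both. */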
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd));
cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu);
/* Global attribute offset */
cs_move32_to(b, cs_sr_reg32(b, 32), 0);
struct mali_compute_size_workgroup_packed wg_size;
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
cfg.workgroup_size_x = shader->local_size.x;
cfg.workgroup_size_y = shader->local_size.y;
cfg.workgroup_size_z = shader->local_size.z;
cfg.allow_merging_workgroups = false;
}
cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, 34), baseGroupX * shader->local_size.x);
cs_move32_to(b, cs_sr_reg32(b, 35), baseGroupY * shader->local_size.y);
cs_move32_to(b, cs_sr_reg32(b, 36), baseGroupZ * shader->local_size.z);
cs_move32_to(b, cs_sr_reg32(b, 37), groupCountX);
cs_move32_to(b, cs_sr_reg32(b, 38), groupCountY);
cs_move32_to(b, cs_sr_reg32(b, 39), groupCountZ);
/* Pick the task_axis and task_increment to maximize thread utilization. */
unsigned threads_per_wg =
shader->local_size.x * shader->local_size.y * shader->local_size.z;
unsigned max_thread_cnt = panfrost_compute_max_thread_count(
&phys_dev->kmod.props, shader->info.work_reg_count);
unsigned threads_per_task = threads_per_wg;
unsigned local_size[3] = {
shader->local_size.x,
shader->local_size.y,
shader->local_size.z,
};
for (unsigned i = 0; i < 3; i++) {
if (threads_per_task * local_size[i] >= max_thread_cnt) {
/* We reached our thread limit, so stop at the current axis and
* calculate the increment so it doesn't exceed the per-core
* thread capacity.
*/
task_increment = max_thread_cnt / threads_per_task;
break;
} else if (task_axis == MALI_TASK_AXIS_Z) {
/* We reached the Z axis, and there's still room to stuff more
* threads. Pick the current axis grid size as our increment
* as there's no point using something bigger.
*/
task_increment = local_size[i];
break;
}
threads_per_task *= local_size[i];
task_axis++;
}
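/* Worked example (numbers are illustrative): with local_size = (8, 8, 1)
 * and max_thread_cnt = 256, the first iteration sees
 * threads_per_task * local_size[0] = 64 * 8 >= 256, so we stop on the X
 * axis with task_increment = 256 / 64 = 4 workgroups per task. */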
}
assert(task_axis <= MALI_TASK_AXIS_Z);
assert(task_increment > 0);
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
cs_req_res(b, CS_COMPUTE_RES);
cs_run_compute(b, task_increment, task_axis, false,
cs_shader_res_sel(0, 0, 0, 0));
cs_req_res(b, 0);
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index iter_sb = cs_scratch_reg32(b, 2);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
struct cs_index add_val = cs_scratch_reg64(b, 4);
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
cs_move64_to(b, add_val, 1);
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \
cs_case(b, x) { \
cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \
add_val, sync_addr, \
cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \
cs_move32_to(b, iter_sb, next_iter_sb(x)); \
}
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
#undef CASE
}
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS), false);
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
VkBuffer _buffer, VkDeviceSize offset)
{
panvk_stub();
}

File diff suppressed because it is too large.


@@ -0,0 +1,745 @@
/*
* Copyright © 2024 Collabora Ltd.
*
* SPDX-License-Identifier: MIT
*/
#include "drm-uapi/panthor_drm.h"
#include "genxml/cs_builder.h"
#include "genxml/decode.h"
#include "panvk_cmd_buffer.h"
#include "panvk_macros.h"
#include "panvk_queue.h"
#include "vk_drm_syncobj.h"
#include "vk_log.h"
static void
finish_render_desc_ringbuf(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
panvk_pool_free_mem(&dev->mempools.rw, ringbuf->syncobj);
if (dev->debug.decode_ctx && ringbuf->addr.dev) {
pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
RENDER_DESC_RINGBUF_SIZE);
pandecode_inject_free(dev->debug.decode_ctx,
ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
RENDER_DESC_RINGBUF_SIZE);
}
if (ringbuf->addr.dev) {
struct pan_kmod_vm_op op = {
.type = PAN_KMOD_VM_OP_TYPE_UNMAP,
.va = {
.start = ringbuf->addr.dev,
.size = RENDER_DESC_RINGBUF_SIZE * 2,
},
};
ASSERTED int ret =
pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
assert(!ret);
}
if (ringbuf->addr.host) {
ASSERTED int ret =
os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE);
assert(!ret);
}
pan_kmod_bo_put(ringbuf->bo);
}
static VkResult
init_render_desc_ringbuf(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
const size_t size = RENDER_DESC_RINGBUF_SIZE;
VkResult result;
int ret;
ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags);
if (!ringbuf->bo)
return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a descriptor ring buffer context");
if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
ringbuf->addr.host = pan_kmod_bo_mmap(
ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
if (ringbuf->addr.host == MAP_FAILED) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to CPU map ringbuf BO");
goto err_finish_ringbuf;
}
}
/* We choose the alignment to guarantee that we won't ever cross a 4G
* boundary when accessing the mapping. This way we can encode the wraparound
* using 32-bit operations. */
uint64_t dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2);
struct pan_kmod_vm_op vm_ops[] = {
{
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr,
.size = RENDER_DESC_RINGBUF_SIZE,
},
.map = {
.bo = ringbuf->bo,
.bo_offset = 0,
},
},
{
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr + RENDER_DESC_RINGBUF_SIZE,
.size = RENDER_DESC_RINGBUF_SIZE,
},
.map = {
.bo = ringbuf->bo,
.bo_offset = 0,
},
},
};
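/* Illustration (assumption about how the ring is consumed, not spelled out
 * here): mapping the BO twice back-to-back, with the pair aligned on
 * 2 * RENDER_DESC_RINGBUF_SIZE, means an allocation that starts near the
 * end of the ring can spill into the second mapping and still be virtually
 * contiguous, and the ring position can be advanced with plain 32-bit adds
 * without ever crossing a 4G boundary. */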
ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
ARRAY_SIZE(vm_ops));
if (ret) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to GPU map ringbuf BO");
goto err_finish_ringbuf;
}
ringbuf->addr.dev = dev_addr;
if (dev->debug.decode_ctx) {
pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
pandecode_inject_mmap(dev->debug.decode_ctx,
ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
}
struct panvk_pool_alloc_info alloc_info = {
.size = sizeof(struct panvk_cs_sync32),
.alignment = 64,
};
ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);
if (!syncobj) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create the render desc ringbuf context");
goto err_finish_ringbuf;
}
*syncobj = (struct panvk_cs_sync32){
.seqno = RENDER_DESC_RINGBUF_SIZE,
};
return VK_SUCCESS;
err_finish_ringbuf:
finish_render_desc_ringbuf(queue);
return result;
}
static VkResult
init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_subqueue *subq = &queue->subqueues[subqueue];
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
unsigned debug = instance->debug_flags;
struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
struct panvk_pool_alloc_info alloc_info = {
.size = sizeof(struct panvk_cs_subqueue_context),
.alignment = 64,
};
subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_host_addr(subq->context))
return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a queue context");
struct panvk_cs_subqueue_context *cs_ctx =
panvk_priv_mem_host_addr(subq->context);
*cs_ctx = (struct panvk_cs_subqueue_context){
.syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
.debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
.iter_sb = 0,
};
/* We use the geometry buffer for our temporary CS buffer. */
struct cs_buffer root_cs = {
.cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
.gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
.capacity = 64 * 1024 / sizeof(uint64_t),
};
const struct cs_builder_conf conf = {
.nr_registers = 96,
.nr_kernel_registers = 4,
};
struct cs_builder b;
assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);
cs_builder_init(&b, &conf, root_cs);
/* Pass the subqueue context through the dedicated context register. */
cs_move64_to(&b, cs_subqueue_ctx_reg(&b),
panvk_priv_mem_dev_addr(subq->context));
/* Initialize scoreboard slots used for asynchronous operations. */
cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS));
/* We do a greater-than test on sync objects, and since the reference seqno
* registers are all zero at init time, we need to initialize all syncobjs
* with a seqno of one. */
syncobjs[subqueue].seqno = 1;
if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
cs_ctx->render.tiler_heap =
panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
/* Our geometry buffer comes 4k after the tiler heap, and we encode the
* size in the lower 12 bits so the address can be copied directly
* to the tiler descriptors. */
cs_ctx->render.geom_buf =
(cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
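/* With a 64 KiB geometry buffer, (64 * 1024) >> 12 == 16: the low 12 bits
 * carry the size in 4 KiB units while the upper bits hold the 4 KiB-aligned
 * geometry buffer address. */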
/* Initialize the ringbuf */
cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
.syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
.ptr = queue->render_desc_ringbuf.addr.dev,
.pos = 0,
};
struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);
/* Pre-set the heap context on the vertex-tiler/fragment queues. */
cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
cs_heap_set(&b, heap_ctx_addr);
}
cs_finish(&b);
assert(cs_is_valid(&b));
struct drm_panthor_sync_op syncop = {
.flags =
DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
.handle = queue->syncobj_handle,
.timeline_value = 0,
};
struct drm_panthor_queue_submit qsubmit = {
.queue_index = subqueue,
.stream_size = cs_root_chunk_size(&b),
.stream_addr = cs_root_chunk_gpu_addr(&b),
.latest_flush = 0,
.syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop),
};
struct drm_panthor_group_submit gsubmit = {
.group_handle = queue->group_handle,
.queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit),
};
int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
if (ret)
return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
"Failed to initialized subqueue: %m");
ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0,
NULL);
if (ret)
return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
"SyncobjWait failed: %m");
if (debug & PANVK_DEBUG_TRACE) {
uint32_t regs[256] = {0};
pandecode_cs(dev->debug.decode_ctx, qsubmit.stream_addr,
qsubmit.stream_size, phys_dev->kmod.props.gpu_prod_id, regs);
pandecode_next_frame(dev->debug.decode_ctx);
}
return VK_SUCCESS;
}
static void
cleanup_queue(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
panvk_pool_free_mem(&dev->mempools.rw, queue->subqueues[i].context);
finish_render_desc_ringbuf(queue);
panvk_pool_free_mem(&dev->mempools.rw, queue->debug_syncobjs);
panvk_pool_free_mem(&dev->mempools.rw, queue->syncobjs);
}
static VkResult
init_queue(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
VkResult result;
struct panvk_pool_alloc_info alloc_info = {
.size =
ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT,
.alignment = 64,
};
queue->syncobjs = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_host_addr(queue->syncobjs))
return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to allocate subqueue sync objects");
if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
alloc_info.size =
ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT;
queue->debug_syncobjs =
panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info);
if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to allocate subqueue sync objects");
goto err_cleanup_queue;
}
}
result = init_render_desc_ringbuf(queue);
if (result != VK_SUCCESS)
goto err_cleanup_queue;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
result = init_subqueue(queue, i);
if (result != VK_SUCCESS)
goto err_cleanup_queue;
}
return VK_SUCCESS;
err_cleanup_queue:
cleanup_queue(queue);
return result;
}
static VkResult
create_group(struct panvk_queue *queue)
{
const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct drm_panthor_queue_create qc[] = {
[PANVK_SUBQUEUE_VERTEX_TILER] =
{
.priority = 1,
.ringbuf_size = 64 * 1024,
},
[PANVK_SUBQUEUE_FRAGMENT] =
{
.priority = 1,
.ringbuf_size = 64 * 1024,
},
[PANVK_SUBQUEUE_COMPUTE] =
{
.priority = 1,
.ringbuf_size = 64 * 1024,
},
};
struct drm_panthor_group_create gc = {
.compute_core_mask = phys_dev->kmod.props.shader_present,
.fragment_core_mask = phys_dev->kmod.props.shader_present,
.tiler_core_mask = 1,
.max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present),
.max_fragment_cores =
util_bitcount64(phys_dev->kmod.props.shader_present),
.max_tiler_cores = 1,
.priority = PANTHOR_GROUP_PRIORITY_MEDIUM,
.queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
.vm_id = pan_kmod_vm_handle(dev->kmod.vm),
};
int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
if (ret)
return vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to create a scheduling group");
queue->group_handle = gc.group_handle;
return VK_SUCCESS;
}
static void
destroy_group(struct panvk_queue *queue)
{
const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct drm_panthor_group_destroy gd = {
.group_handle = queue->group_handle,
};
ASSERTED int ret =
drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
assert(!ret);
}
static VkResult
init_tiler(struct panvk_queue *queue)
{
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
VkResult result;
/* We allocate the tiler heap descriptor and geometry buffer in one go,
* so we can pass it through a single 64-bit register to the VERTEX_TILER
* command streams. */
struct panvk_pool_alloc_info alloc_info = {
.size = (64 * 1024) + 4096,
.alignment = 4096,
};
tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_host_addr(tiler_heap->desc)) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a tiler heap context");
goto err_free_desc;
}
tiler_heap->chunk_size = 2 * 1024 * 1024;
struct drm_panthor_tiler_heap_create thc = {
.vm_id = pan_kmod_vm_handle(dev->kmod.vm),
.chunk_size = tiler_heap->chunk_size,
.initial_chunk_count = 5,
.max_chunks = 64,
.target_in_flight = 65535,
};
int ret =
drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc);
if (ret) {
result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to create a tiler heap context");
goto err_free_desc;
}
tiler_heap->context.handle = thc.handle;
tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;
pan_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP, cfg) {
cfg.size = tiler_heap->chunk_size;
cfg.base = thc.first_heap_chunk_gpu_va;
cfg.bottom = cfg.base + 64;
cfg.top = cfg.base + cfg.size;
}
return VK_SUCCESS;
err_free_desc:
panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
return result;
}
static void
cleanup_tiler(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
struct drm_panthor_tiler_heap_destroy thd = {
.handle = tiler_heap->context.handle,
};
ASSERTED int ret =
drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd);
assert(!ret);
panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
}
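
/* Submit entry point for the CSF backend. Each command buffer carries one
 * command stream per subqueue; non-empty streams are turned into Panthor
 * queue submits, extra sync-only submits are added to host the wait/signal
 * operations, and the whole batch goes through a single GROUP_SUBMIT
 * ioctl. */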
static VkResult
panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
{
struct panvk_queue *queue = container_of(vk_queue, struct panvk_queue, vk);
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
VkResult result = VK_SUCCESS;
int ret;
if (vk_queue_is_lost(&queue->vk))
return VK_ERROR_DEVICE_LOST;
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
unsigned debug = instance->debug_flags;
bool force_sync = debug & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC);
uint32_t qsubmit_count = 0;
uint32_t used_queue_mask = 0;
for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
struct panvk_cmd_buffer *cmdbuf =
container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);
for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
assert(cs_is_valid(&cmdbuf->state.cs[j].builder));
if (!cs_is_empty(&cmdbuf->state.cs[j].builder)) {
used_queue_mask |= BITFIELD_BIT(j);
qsubmit_count++;
}
}
}
   /* Synchronize all subqueues if no command buffers were submitted. */
if (!qsubmit_count)
used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT);
/* We add sync-only queue submits to place our wait/signal operations. */
if (submit->wait_count > 0)
qsubmit_count += util_bitcount(used_queue_mask);
if (submit->signal_count > 0)
qsubmit_count += util_bitcount(used_queue_mask);
uint32_t syncop_count = submit->wait_count + util_bitcount(used_queue_mask);
STACK_ARRAY(struct drm_panthor_queue_submit, qsubmits, qsubmit_count);
STACK_ARRAY(struct drm_panthor_sync_op, syncops, syncop_count);
struct drm_panthor_sync_op *wait_ops = syncops;
struct drm_panthor_sync_op *signal_ops = syncops + submit->wait_count;
qsubmit_count = 0;
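   /* Convert the Vulkan waits into Panthor sync operations and attach them to
    * one sync-only submit per used subqueue, so every subqueue blocks until
    * all waits have signaled. */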
if (submit->wait_count) {
for (uint32_t i = 0; i < submit->wait_count; i++) {
assert(vk_sync_type_is_drm_syncobj(submit->waits[i].sync->type));
struct vk_drm_syncobj *syncobj =
vk_sync_as_drm_syncobj(submit->waits[i].sync);
wait_ops[i] = (struct drm_panthor_sync_op){
.flags = (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE
? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
: DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) |
DRM_PANTHOR_SYNC_OP_WAIT,
.handle = syncobj->syncobj,
.timeline_value = submit->waits[i].wait_value,
};
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (used_queue_mask & BITFIELD_BIT(i)) {
qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
.queue_index = i,
.syncs = DRM_PANTHOR_OBJ_ARRAY(submit->wait_count, wait_ops),
};
}
}
}
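   /* Queue one submit per non-empty command stream, in command buffer order. */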
for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
struct panvk_cmd_buffer *cmdbuf =
container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);
for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
if (cs_is_empty(&cmdbuf->state.cs[j].builder))
continue;
qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
.queue_index = j,
.stream_size = cs_root_chunk_size(&cmdbuf->state.cs[j].builder),
.stream_addr = cs_root_chunk_gpu_addr(&cmdbuf->state.cs[j].builder),
.latest_flush = 0,
};
}
}
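   /* Add one sync-only submit per used subqueue to signal our internal
    * timeline syncobj. Point N is signaled by the Nth used subqueue, which
    * also lets us wait for full completion in force_sync mode. */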
if (submit->signal_count || force_sync) {
uint32_t signal_op = 0;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (used_queue_mask & BITFIELD_BIT(i)) {
signal_ops[signal_op] = (struct drm_panthor_sync_op){
.flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ |
DRM_PANTHOR_SYNC_OP_SIGNAL,
.handle = queue->syncobj_handle,
.timeline_value = signal_op + 1,
};
qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
.queue_index = i,
.syncs = DRM_PANTHOR_OBJ_ARRAY(1, &signal_ops[signal_op++]),
};
}
}
}
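   /* Clear the per-subqueue debug sync objects before submitting so we can
    * check the sequence numbers written by the command streams afterwards. */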
if (force_sync) {
struct panvk_cs_sync32 *debug_syncs =
panvk_priv_mem_host_addr(queue->debug_syncobjs);
assert(debug_syncs);
memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT);
}
struct drm_panthor_group_submit gsubmit = {
.group_handle = queue->group_handle,
.queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
};
ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
if (ret) {
result = vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m");
goto out;
}
if (submit->signal_count || force_sync) {
if (force_sync) {
uint64_t point = util_bitcount(used_queue_mask);
ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle,
&point, 1, INT64_MAX,
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
assert(!ret);
}
for (uint32_t i = 0; i < submit->signal_count; i++) {
assert(vk_sync_type_is_drm_syncobj(submit->signals[i].sync->type));
struct vk_drm_syncobj *syncobj =
vk_sync_as_drm_syncobj(submit->signals[i].sync);
drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj,
submit->signals[i].signal_value,
queue->syncobj_handle, 0, 0);
}
drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1);
}
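   /* With PANVK_DEBUG_TRACE, decode every submitted command stream, seeding
    * the register file with the subqueue context pointer the CS expects. */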
if (debug & PANVK_DEBUG_TRACE) {
for (uint32_t i = 0; i < qsubmit_count; i++) {
if (!qsubmits[i].stream_size)
continue;
uint32_t subqueue = qsubmits[i].queue_index;
uint32_t regs[256] = {0};
uint64_t ctx =
panvk_priv_mem_dev_addr(queue->subqueues[subqueue].context);
regs[PANVK_CS_REG_SUBQUEUE_CTX_START] = ctx;
regs[PANVK_CS_REG_SUBQUEUE_CTX_START + 1] = ctx >> 32;
simple_mtx_lock(&dev->debug.decode_ctx->lock);
pandecode_dump_file_open(dev->debug.decode_ctx);
pandecode_log(dev->debug.decode_ctx, "CS%d\n",
qsubmits[i].queue_index);
simple_mtx_unlock(&dev->debug.decode_ctx->lock);
pandecode_cs(dev->debug.decode_ctx, qsubmits[i].stream_addr,
qsubmits[i].stream_size, phys_dev->kmod.props.gpu_prod_id,
regs);
}
}
if (debug & PANVK_DEBUG_DUMP)
pandecode_dump_mappings(dev->debug.decode_ctx);
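   /* In force_sync mode, every submitted stream bumps its subqueue's debug
    * seqno, so a mismatch (or a non-zero error field) means a job didn't
    * complete. */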
if (force_sync) {
struct panvk_cs_sync32 *debug_syncs =
panvk_priv_mem_host_addr(queue->debug_syncobjs);
uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0};
for (uint32_t i = 0; i < qsubmit_count; i++) {
if (qsubmits[i].stream_size)
debug_sync_points[qsubmits[i].queue_index]++;
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (debug_syncs[i].seqno != debug_sync_points[i] ||
debug_syncs[i].error != 0)
assert(!"Incomplete job or timeout\n");
}
}
if (debug & PANVK_DEBUG_TRACE)
pandecode_next_frame(dev->debug.decode_ctx);
out:
STACK_ARRAY_FINISH(syncops);
STACK_ARRAY_FINISH(qsubmits);
return result;
}
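
/* Per-arch queue initialization: create the internal timeline syncobj used
 * for submission tracking, the tiler heap, the Panthor scheduling group and
 * the per-subqueue contexts, unwinding in reverse order on failure. */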
VkResult
panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue,
int idx, const VkDeviceQueueCreateInfo *create_info)
{
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx);
if (result != VK_SUCCESS)
return result;
int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle);
if (ret) {
result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to create our internal sync object");
goto err_finish_queue;
}
result = init_tiler(queue);
if (result != VK_SUCCESS)
goto err_destroy_syncobj;
result = create_group(queue);
if (result != VK_SUCCESS)
goto err_cleanup_tiler;
result = init_queue(queue);
if (result != VK_SUCCESS)
goto err_destroy_group;
queue->vk.driver_submit = panvk_queue_submit;
return VK_SUCCESS;
err_destroy_group:
destroy_group(queue);
err_cleanup_tiler:
cleanup_tiler(queue);
err_destroy_syncobj:
drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
err_finish_queue:
vk_queue_finish(&queue->vk);
return result;
}
void
panvk_per_arch(queue_finish)(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
destroy_group(queue);
cleanup_tiler(queue);
drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
vk_queue_finish(&queue->vk);
}

View file

@@ -57,6 +57,15 @@ jm_files = [
   'jm/panvk_vX_queue.c',
 ]
 
+csf_archs = [10]
+csf_inc_dir = ['csf']
+csf_files = [
+  'csf/panvk_vX_cmd_buffer.c',
+  'csf/panvk_vX_cmd_dispatch.c',
+  'csf/panvk_vX_cmd_draw.c',
+  'csf/panvk_vX_queue.c',
+]
+
 common_per_arch_files = [
   panvk_entrypoints[0],
   'panvk_vX_blend.c',
@@ -74,21 +83,7 @@ common_per_arch_files = [
 ]
 
 foreach arch : [6, 7, 10]
-  if arch in valhall_archs
-    # We're lacking key components to compile common source files for v9/v10.
-    # Just add the v10 entrypoints for now.
-    per_arch_files = [
-      panvk_entrypoints[0],
-      'panvk_vX_blend.c',
-      'panvk_vX_descriptor_set.c',
-      'panvk_vX_descriptor_set_layout.c',
-      'panvk_vX_nir_lower_descriptors.c',
-      'panvk_vX_shader.c',
-    ]
-  else
-    per_arch_files = common_per_arch_files
-  endif
+  per_arch_files = common_per_arch_files
 
   inc_panvk_per_arch = []
   if arch in bifrost_archs
@@ -102,6 +97,9 @@ foreach arch : [6, 7, 10]
   if arch in jm_archs
     inc_panvk_per_arch += jm_inc_dir
     per_arch_files += jm_files
+  elif arch in csf_archs
+    inc_panvk_per_arch += csf_inc_dir
+    per_arch_files += csf_files
   endif
 
   panvk_per_arch_libs += static_library(

View file

@@ -36,6 +36,7 @@ static const struct debug_control panvk_debug_options[] = {
    {"linear", PANVK_DEBUG_LINEAR},
    {"dump", PANVK_DEBUG_DUMP},
    {"no_known_warn", PANVK_DEBUG_NO_KNOWN_WARN},
+   {"cs", PANVK_DEBUG_CS},
    {NULL, 0}};
 
 VKAPI_ATTR VkResult VKAPI_CALL

View file

@@ -21,6 +21,7 @@ enum panvk_debug_flags {
    PANVK_DEBUG_LINEAR = 1 << 5,
    PANVK_DEBUG_DUMP = 1 << 6,
    PANVK_DEBUG_NO_KNOWN_WARN = 1 << 7,
+   PANVK_DEBUG_CS = 1 << 8,
 };
 
 #if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \

View file

@@ -29,11 +29,7 @@
 
 #include "genxml/gen_macros.h"
 
-/* FIXME: make the include statement unconditional when the CSF command buffer
- * logic is implemented. */
-#if PAN_ARCH <= 7
 #include "panvk_cmd_buffer.h"
-#endif
 #include "panvk_device.h"
 #include "panvk_instance.h"
 #include "panvk_mempool.h"
@@ -1333,9 +1329,6 @@ static const struct vk_shader_ops panvk_shader_ops = {
       panvk_shader_get_executable_internal_representations,
 };
 
-/* FIXME: make this unconditional when the CSF command buffer logic is
- * implemented. */
-#if PAN_ARCH <= 7
 static void
 panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage,
                       struct panvk_shader *shader)
@@ -1380,7 +1373,6 @@ panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
       panvk_cmd_bind_shader(cmd, stages[i], shader);
    }
 }
-#endif
 
 const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
    .get_nir_options = panvk_get_nir_options,
@@ -1390,10 +1382,5 @@ const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
    .compile = panvk_compile_shaders,
    .deserialize = panvk_deserialize_shader,
    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
-   /* FIXME: make the assignment unconditional when the CSF command buffer
-    * logic is implemented. */
-#if PAN_ARCH <= 7
    .cmd_bind_shaders = panvk_cmd_bind_shaders,
-#endif
 };