/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
*/
#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H
#include "tu_common.h"
#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_VS,
TU_DRAW_STATE_VS_BINNING,
TU_DRAW_STATE_HS,
TU_DRAW_STATE_DS,
TU_DRAW_STATE_GS,
TU_DRAW_STATE_GS_BINNING,
TU_DRAW_STATE_VPC,
TU_DRAW_STATE_FS,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_FS_PARAMS,
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
struct tu_descriptor_state
{
struct tu_descriptor_set *sets[MAX_SETS];
struct tu_descriptor_set push_set;
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
uint64_t set_iova[MAX_SETS];
uint32_t max_sets_bound;
uint32_t max_dynamic_offset_size;
};
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
TU_CMD_DIRTY_DESC_SETS = BIT(1),
TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
TU_CMD_DIRTY_LRZ = BIT(4),
TU_CMD_DIRTY_VS_PARAMS = BIT(5),
TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
TU_CMD_DIRTY_SUBPASS = BIT(7),
TU_CMD_DIRTY_FDM = BIT(8),
TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
TU_CMD_DIRTY_TES = BIT(10),
TU_CMD_DIRTY_PROGRAM = BIT(11),
TU_CMD_DIRTY_RAST_ORDER = BIT(12),
TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
TU_CMD_DIRTY_FS = BIT(14),
TU_CMD_DIRTY_SHADING_RATE = BIT(15),
TU_CMD_DIRTY_DISABLE_FS = BIT(16),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(17)
};
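/* A rough usage sketch (illustrative, not a verbatim excerpt from the
* driver): state-binding entrypoints set the relevant dirty bit, and draw
* time re-emits only the draw states whose inputs changed:
*
*    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
*    ...
*    if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
*       ... re-emit the TU_DRAW_STATE_VB draw state ...
*    cmd->state.dirty = 0; // cleared once everything is re-emitted
*/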
/* There are only three cache domains we have to care about: the CCU, or
* color cache unit, which is used for color and depth/stencil attachments
* and copy/blit destinations and is split conceptually into color and depth
* halves, and the universal cache, or UCHE, which is used for pretty much
* everything else, except for the CP (uncached) and the host. We need to
* flush whenever data crosses these boundaries.
*/
enum tu_cmd_access_mask {
TU_ACCESS_NONE = 0,
TU_ACCESS_UCHE_READ = 1 << 0,
TU_ACCESS_UCHE_WRITE = 1 << 1,
TU_ACCESS_CCU_COLOR_READ = 1 << 2,
TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
/* Experiments have shown that while it's safe to avoid flushing the CCU
* after each blit/renderpass, it's not safe to assume that subsequent
* lookups with a different attachment state will hit unflushed cache
* entries. That is, the CCU needs to be flushed and possibly invalidated
* when accessing memory with a different attachment state. Writing to an
* attachment under the following conditions after clearing using the
* normal 2d engine path is known to have issues:
*
* - It isn't the 0'th layer.
* - There is more than one attachment, and this isn't the 0'th attachment
* (this seems to also depend on the cpp of the attachments).
*
* Our best guess is that the layer/MRT state is used when computing
* the location of a cache entry in CCU, to avoid conflicts. We assume that
* any access in a renderpass after or before an access by a transfer needs
* a flush/invalidate, and use the _INCOHERENT variants to represent access
* by a renderpass.
*/
TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
/* Accesses which bypass any cache, e.g. writes via the host,
* CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
*/
TU_ACCESS_SYSMEM_READ = 1 << 10,
TU_ACCESS_SYSMEM_WRITE = 1 << 11,
/* Memory writes from the CP start in-order with draws and event writes,
* but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/
TU_ACCESS_CP_WRITE = 1 << 12,
/* Descriptors are read through UCHE but are also prefetched via
* CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
* when they change.
*/
TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,
/* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,
/* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
TU_ACCESS_UCHE_READ_GMEM = 1 << 15,
/* The CCHE is a write-through cache which sits behind UCHE, with multiple
* incoherent copies. Because it's write-through we only have to worry
* about invalidating it for reads. It's invalidated by "ccinv" in the
* shader and CP_CCHE_INVALIDATE in the command stream.
*/
TU_ACCESS_CCHE_READ = 1 << 16,
TU_ACCESS_RTU_READ = 1 << 17,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
TU_ACCESS_CCU_DEPTH_READ |
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ |
TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
TU_ACCESS_CCHE_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
TU_ACCESS_CCU_COLOR_WRITE |
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
TU_ACCESS_CCU_DEPTH_WRITE |
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
TU_ACCESS_SYSMEM_WRITE |
TU_ACCESS_CP_WRITE,
TU_ACCESS_ALL =
TU_ACCESS_READ |
TU_ACCESS_WRITE,
};
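/* A hedged example of how these bits get consumed (see tu_flush_for_access()
* below): a blit that writes a color attachment followed by a shader read of
* the same memory crosses cache domains, so the barrier code would do
* something like:
*
*    tu_flush_for_access(&cmd->state.cache,
*                        TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
*                        TU_ACCESS_UCHE_READ);
*
* which records that the CCU color cache must be cleaned and UCHE
* invalidated before the read executes.
*/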
/* From the driver's point of view, we only need to distinguish between things
* which won't start until a WFI is complete and things which additionally
* need a WAIT_FOR_ME.
*
* TODO: This will get more complicated with concurrent binning.
*/
enum tu_stage {
/* As a destination stage, this is for operations on the CP which don't
* wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
* As a source stage, it is for things needing no waits.
*/
TU_STAGE_CP,
/* This is for most operations: a WFI will wait for them to finish, and
* they will not start until any pending WFIs are finished.
*/
TU_STAGE_GPU,
/* This is only used as a destination stage and is for things needing no
* waits on the GPU (e.g. host operations).
*/
TU_STAGE_BOTTOM,
};
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
/* This is an unusual clean that isn't automatically executed if pending,
* as it isn't necessary. Therefore, it's not included in ALL_CLEAN.
*/
TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,
TU_CMD_FLAG_ALL_CLEAN =
TU_CMD_FLAG_CCU_CLEAN_DEPTH |
TU_CMD_FLAG_CCU_CLEAN_COLOR |
TU_CMD_FLAG_CACHE_CLEAN |
/* Treat the CP as a sort of "cache" which may need to be "flushed" via
* waiting for writes to land with CP_WAIT_MEM_WRITES.
*/
TU_CMD_FLAG_WAIT_MEM_WRITES,
TU_CMD_FLAG_ALL_INVALIDATE =
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
TU_CMD_FLAG_CCHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME |
TU_CMD_FLAG_RTU_INVALIDATE,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
* heavy, involving a CCU cache flush/invalidate and a WFI in order to change
* which part of the gmem is used by the CCU. Here we keep track of the
* state of the CCU.
*/
enum tu_cmd_ccu_state {
TU_CMD_CCU_SYSMEM,
TU_CMD_CCU_GMEM,
TU_CMD_CCU_UNKNOWN,
};
struct tu_cache_state {
/* Caches which must be made available (flushed) eventually if there are
* any users outside that cache domain, and caches which must be
* invalidated eventually if there are any reads.
*/
BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
/* Flushes which must actually be emitted at the next flush point */
BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
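/* Operationally (a sketch inferred from the comments above, not verbatim
* driver code): a write first records a pending clean, and a later access
* from a different domain promotes it to an actual flush plus the matching
* invalidate:
*
*    cache->pending_flush_bits |= TU_CMD_FLAG_CCU_CLEAN_COLOR; // on write
*    ...
*    cache->flush_bits |= cache->pending_flush_bits;           // on read
*    cache->flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
*/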
struct tu_vs_params {
uint32_t vertex_offset;
uint32_t first_instance;
uint32_t draw_id;
bool empty;
};
struct tu_tess_params {
bool valid;
enum a6xx_tess_output output_upper_left, output_lower_left;
enum a6xx_tess_spacing spacing;
};
/* This should be for state that is set inside a renderpass and used at
* renderpass end time, e.g. to decide whether to use sysmem. This needs
* special handling for secondary cmdbufs and suspending/resuming render
* passes where the state may need to be combined afterwards.
*/
struct tu_render_pass_state
{
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool has_zpass_done_sample_count_write_in_rp;
bool disable_gmem;
bool sysmem_single_prim_mode;
bool shared_viewport;
/* Track whether the conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
*    sum += pipeline->color_bandwidth_per_sample;
*    if (depth_test_enabled)
*       sum += pipeline->depth_cpp_per_sample;
*    if (depth_write_enabled)
*       sum += pipeline->depth_cpp_per_sample;
*    if (stencil_write_enabled)
*       sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
const char *lrz_disable_reason;
uint32_t lrz_disabled_at_draw;
const char *gmem_disable_reason;
};
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum tu_suspend_resume_state
{
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
};
struct tu_cmd_state
{
uint32_t dirty;
struct tu_shader *shaders[MESA_SHADER_STAGES];
struct tu_program_state program;
struct tu_render_pass_state rp;
struct vk_render_pass_state vk_rp;
struct vk_vertex_input_state vi;
struct vk_sample_locations_state sl;
struct tu_bandwidth bandwidth;
/* Vertex buffers.
* The state for these can be updated partially, so we need to save it to
* be able to emit a complete draw state.
*/
struct {
uint64_t base;
uint32_t size;
} vb[MAX_VBS];
uint32_t max_vbs_bound;
bool per_view_viewport;
/* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
struct tu_draw_state vertex_buffers;
struct tu_draw_state shader_const;
struct tu_draw_state desc_sets;
struct tu_draw_state load_state;
struct tu_draw_state compute_load_state;
struct tu_draw_state prim_order_gmem;
struct tu_draw_state vs_params;
struct tu_draw_state fs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;
uint8_t index_size;
/* Because the streamout base has to be 32-byte aligned, there is an extra
* offset to deal with when it is unaligned.
*/
uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
/* Renderpasses are tricky, because we may need to flush differently if
* using sysmem vs. gmem and therefore we have to delay any flushing that
* happens before a renderpass. So we have to have two copies of the flush
* state, one for intra-renderpass flushes (i.e. renderpass dependencies)
* and one for outside a renderpass.
*/
struct tu_cache_state cache;
struct tu_cache_state renderpass_cache;
enum tu_cmd_ccu_state ccu_state;
/* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
* might get used by tu_store_gmem_attachment().
*/
enum tu_gmem_layout gmem_layout;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
const struct tu_tiling_config *tiling;
VkRect2D render_area;
const struct tu_image_view **attachments;
VkClearValue *clear_values;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* arrays, or to another command buffer's if the pass was recorded in a
* secondary command buffer.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
enum tu_gmem_layout gmem_layout;
const struct tu_image_view **attachments;
VkClearValue *clear_values;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
bool msaa_disable;
bool blend_reads_dest;
bool disable_fs;
bool stencil_front_write;
bool stencil_back_write;
bool pipeline_sysmem_single_prim_mode;
bool pipeline_has_tess;
bool pipeline_disable_gmem;
bool raster_order_attachment_access;
bool raster_order_attachment_access_valid;
bool blit_cache_cleaned;
VkImageAspectFlags pipeline_feedback_loops;
bool pipeline_writes_shading_rate;
bool pipeline_reads_shading_rate;
bool pipeline_accesses_smask;
bool pipeline_blend_lrz, pipeline_bandwidth, pipeline_disable_fs;
uint32_t pipeline_draw_states;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
enum tu_suspend_resume_state suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
struct tu_vs_params last_vs_params;
bool last_draw_indexed;
struct tu_tess_params tess_params;
uint64_t descriptor_buffer_iova[MAX_SETS];
};
struct tu_cmd_buffer
{
struct vk_command_buffer vk;
struct tu_device *device;
struct u_trace trace;
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
void *patchpoints_ctx;
struct util_dynarray fdm_bin_patchpoints;
VkCommandBufferUsageFlags usage_flags;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 2];
struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
struct tu_subpass_attachment dynamic_input_attachments[MAX_RTS + 1];
struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 2];
VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];
struct tu_render_pass dynamic_pass;
struct tu_subpass dynamic_subpass;
struct tu_framebuffer dynamic_framebuffer;
struct tu_cs cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
struct util_dynarray fdm_bin_patchpoints;
void *patchpoints_ctx;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
bool vsc_initialized;
bool prev_fsr_is_null;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;
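/* For illustration: VK_DEFINE_HANDLE_CASTS() generates the usual wrap/unwrap
* helpers from the common Vulkan runtime, so entrypoints can recover the
* driver object with e.g.:
*
*    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
*/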
static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att,
uint32_t layer)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset[cmd->state.gmem_layout] +
layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
att->cpp;
}
static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att,
uint32_t layer)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset_stencil[cmd->state.gmem_layout] +
layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
att->samples;
}
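/* Worked example with made-up numbers: for a 96x96 tile0 and a 4-cpp
* attachment, layer 2 of the attachment lives at
* gmem_offset[layout] + 2 * 96 * 96 * 4 bytes, i.e. each layer occupies one
* full tile of data in GMEM. The stencil variant multiplies by att->samples
* instead of att->cpp because separate stencil is one byte per sample.
*/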
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
const VkCommandBufferBeginInfo *pBeginInfo);
template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);
template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);
template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
enum tu_cmd_ccu_state ccu_state);
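/* For example (an illustrative call, not copied from this header's users),
* the renderpass code would switch into GMEM mode with:
*
*    tu_emit_cache_flush_ccu<CHIP>(cmd, &cmd->cs, TU_CMD_CCU_GMEM);
*
* which is expected to emit the CCU clean/invalidate and WFI only when
* cmd->state.ccu_state actually changes.
*/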
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
uint32_t x, uint32_t y, uint32_t z);
void tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer,
VkDeviceAddress size_addr);
void tu_write_buffer_cp(VkCommandBuffer commandBuffer,
VkDeviceAddress addr,
void *data, uint32_t size);
void tu_flush_buffer_write_cp(VkCommandBuffer commandBuffer);
enum fd_gpu_event : uint32_t;
template <chip CHIP>
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event,
bool needs_seqno);
template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum fd_gpu_event event);
void
tu_flush_for_access(struct tu_cache_state *cache,
enum tu_cmd_access_mask src_mask,
enum tu_cmd_access_mask dst_mask);
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point)
{
return &cmd_buffer->descriptors[bind_point];
}
void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
bool msaa_disable);
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
unsigned views,
const VkExtent2D *frag_areas);
struct tu_fdm_bin_patchpoint {
uint64_t iova;
uint32_t size;
void *data;
tu_fdm_bin_apply_t apply;
};
void
tu_barrier(struct tu_cmd_buffer *cmd,
uint32_t dep_count,
const VkDependencyInfo *dep_info);
template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
VkPipelineStageFlags2 stageMask, unsigned value);
static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
unsigned size,
tu_fdm_bin_apply_t apply,
void *state,
unsigned state_size)
{
void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
memcpy(data, state, state_size);
assert(cs->writeable);
tu_cs_reserve_space(cs, size);
struct tu_fdm_bin_patchpoint patch = {
.iova = tu_cs_get_cur_iova(cs),
.size = size,
.data = data,
.apply = apply,
};
/* Apply the "default" setup where there is no scaling. This is used if
* sysmem is required, and uses up the dwords that have been reserved.
*/
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
VkExtent2D unscaled_frag_areas[num_views];
for (unsigned i = 0; i < num_views; i++) {
unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
}
apply(cmd, cs, state, (VkRect2D) {
{ 0, 0 },
{ MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
}, num_views, unscaled_frag_areas);
assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
util_dynarray_append(&cmd->fdm_bin_patchpoints,
struct tu_fdm_bin_patchpoint,
patch);
}
#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
_tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
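/* Hypothetical usage sketch (the names below are invented for illustration):
* the caller packs the needed state into a struct, writes a callback
* matching tu_fdm_bin_apply_t that emits exactly 'size' dwords, and records
* the patchpoint via the macro:
*
*    struct apply_scissor_state { VkRect2D scissor; };
*
*    static void
*    fdm_apply_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
*                      void *data, VkRect2D bin, unsigned views,
*                      const VkExtent2D *frag_areas)
*    {
*       const struct apply_scissor_state *state =
*          (const struct apply_scissor_state *) data;
*       ... clamp state->scissor to bin, scale by frag_areas, and emit
*           exactly the reserved number of dwords ...
*    }
*
*    struct apply_scissor_state state = { .scissor = scissor };
*    tu_create_fdm_bin_patchpoint(cmd, cs, 3, fdm_apply_scissor, state);
*
* _tu_create_fdm_bin_patchpoint() immediately applies the unscaled default
* to fill the reserved dwords; at bin time the callback is replayed with the
* real bin rectangle and per-view fragment areas to overwrite them.
*/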
VkResult tu_init_bin_preamble(struct tu_device *device);
#endif /* TU_CMD_BUFFER_H */