From 95104707f189b2e1b06c855b563c1203b33da354 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Wed, 19 Apr 2023 19:18:13 +0200 Subject: [PATCH] tu: Basic a7xx support Works: - sysmem rendering Doesn't work: - gmem rendering - 3d blits - TESS and GS Wild Life Extreme benchmark runs without issues, most Sascha Willems Vulkan demos are working. Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/common/freedreno_gpu_event.h | 104 ++++ src/freedreno/registers/adreno/a6xx.xml | 6 +- src/freedreno/vulkan/tu_clear_blit.cc | 305 +++++---- src/freedreno/vulkan/tu_clear_blit.h | 7 + src/freedreno/vulkan/tu_cmd_buffer.cc | 611 +++++++++++++------ src/freedreno/vulkan/tu_cmd_buffer.h | 13 +- src/freedreno/vulkan/tu_cs.h | 1 + src/freedreno/vulkan/tu_device.cc | 3 +- src/freedreno/vulkan/tu_dynamic_rendering.cc | 5 +- src/freedreno/vulkan/tu_image.cc | 4 +- src/freedreno/vulkan/tu_image.h | 1 + src/freedreno/vulkan/tu_lrz.cc | 29 +- src/freedreno/vulkan/tu_pipeline.cc | 364 +++++++---- src/freedreno/vulkan/tu_pipeline.h | 5 + src/freedreno/vulkan/tu_query.cc | 75 ++- 15 files changed, 1054 insertions(+), 479 deletions(-) create mode 100644 src/freedreno/common/freedreno_gpu_event.h diff --git a/src/freedreno/common/freedreno_gpu_event.h b/src/freedreno/common/freedreno_gpu_event.h new file mode 100644 index 00000000000..e18f609814d --- /dev/null +++ b/src/freedreno/common/freedreno_gpu_event.h @@ -0,0 +1,104 @@ +/* + * Copyright © 2023 Igalia S.L. 
+ * SPDX-License-Identifier: MIT + */ + +#ifndef __FREEDRENO_GPU_EVENT_H__ +#define __FREEDRENO_GPU_EVENT_H__ + +#include "adreno_pm4.xml.h" + +enum fd_gpu_event : uint32_t { + FD_WRITE_PRIMITIVE_COUNTS = 0, + FD_START_PRIMITIVE_CTRS, + FD_STOP_PRIMITIVE_CTRS, + FD_START_FRAGMENT_CTRS, + FD_STOP_FRAGMENT_CTRS, + FD_START_COMPUTE_CTRS, + FD_STOP_COMPUTE_CTRS, + FD_ZPASS_DONE, + FD_RB_DONE, + FD_FLUSH_SO_0, + FD_FLUSH_SO_1, + FD_FLUSH_SO_2, + FD_FLUSH_SO_3, + FD_CACHE_FLUSH, + FD_CACHE_INVALIDATE, + FD_CCU_INVALIDATE_DEPTH, + FD_CCU_INVALIDATE_COLOR, + FD_CCU_FLUSH_BLIT_CACHE, + FD_CCU_FLUSH_DEPTH, + FD_CCU_FLUSH_COLOR, + FD_LRZ_CLEAR, + FD_LRZ_FLUSH, + FD_BLIT, + FD_LABEL, + + FD_GPU_EVENT_MAX, +}; + +struct fd_gpu_event_info { + enum vgt_event_type raw_event; + bool needs_seqno; +}; + +template +constexpr struct fd_gpu_event_info fd_gpu_events[FD_GPU_EVENT_MAX] = {}; + +template <> +constexpr inline struct fd_gpu_event_info fd_gpu_events[FD_GPU_EVENT_MAX] = { + {WRITE_PRIMITIVE_COUNTS, false}, /* FD_WRITE_PRIMITIVE_COUNTS */ + {START_PRIMITIVE_CTRS, false}, /* FD_START_PRIMITIVE_CTRS */ + {STOP_PRIMITIVE_CTRS, false}, /* FD_STOP_PRIMITIVE_CTRS */ + {START_FRAGMENT_CTRS, false}, /* FD_START_FRAGMENT_CTRS */ + {STOP_FRAGMENT_CTRS, false}, /* FD_STOP_FRAGMENT_CTRS */ + {START_COMPUTE_CTRS, false}, /* FD_START_COMPUTE_CTRS */ + {STOP_COMPUTE_CTRS, false}, /* FD_STOP_COMPUTE_CTRS */ + {ZPASS_DONE, false}, /* FD_ZPASS_DONE */ + {RB_DONE_TS, true}, /* FD_RB_DONE */ + {FLUSH_SO_0, false}, /* FD_FLUSH_SO_0 */ + {FLUSH_SO_1, false}, /* FD_FLUSH_SO_1 */ + {FLUSH_SO_2, false}, /* FD_FLUSH_SO_2 */ + {FLUSH_SO_3, false}, /* FD_FLUSH_SO_3 */ + {CACHE_FLUSH_TS, true}, /* FD_CACHE_FLUSH */ + {CACHE_INVALIDATE, false}, /* FD_CACHE_INVALIDATE */ + {PC_CCU_INVALIDATE_DEPTH, false}, /* FD_CCU_INVALIDATE_DEPTH */ + {PC_CCU_INVALIDATE_COLOR, false}, /* FD_CCU_INVALIDATE_COLOR */ + {PC_CCU_RESOLVE_TS, true}, /* FD_CCU_FLUSH_BLIT_CACHE */ + {PC_CCU_FLUSH_DEPTH_TS, true}, /* 
FD_CCU_FLUSH_DEPTH */ + {PC_CCU_FLUSH_COLOR_TS, true}, /* FD_CCU_FLUSH_COLOR */ + {LRZ_CLEAR, false}, /* FD_LRZ_CLEAR */ + {LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */ + {BLIT, false}, /* FD_BLIT */ + {LABEL, false}, /* FD_LABEL */ +}; + +template <> +constexpr inline struct fd_gpu_event_info fd_gpu_events[FD_GPU_EVENT_MAX] = { + {WRITE_PRIMITIVE_COUNTS, false}, /* FD_WRITE_PRIMITIVE_COUNTS */ + {START_PRIMITIVE_CTRS, false}, /* FD_START_PRIMITIVE_CTRS */ + {STOP_PRIMITIVE_CTRS, false}, /* FD_STOP_PRIMITIVE_CTRS */ + {START_FRAGMENT_CTRS, false}, /* FD_START_FRAGMENT_CTRS */ + {STOP_FRAGMENT_CTRS, false}, /* FD_STOP_FRAGMENT_CTRS */ + {START_COMPUTE_CTRS, false}, /* FD_START_COMPUTE_CTRS */ + {STOP_COMPUTE_CTRS, false}, /* FD_STOP_COMPUTE_CTRS */ + {ZPASS_DONE, false}, /* FD_ZPASS_DONE */ + {RB_DONE_TS, true}, /* FD_RB_DONE */ + {FLUSH_SO_0, false}, /* FD_FLUSH_SO_0 */ + {FLUSH_SO_1, false}, /* FD_FLUSH_SO_1 */ + {FLUSH_SO_2, false}, /* FD_FLUSH_SO_2 */ + {FLUSH_SO_3, false}, /* FD_FLUSH_SO_3 */ + {CACHE_FLUSH7, false}, /* FD_CACHE_FLUSH */ + {CACHE_INVALIDATE7, false}, /* FD_CACHE_INVALIDATE */ + {CCU_INVALIDATE_DEPTH, false}, /* FD_CCU_INVALIDATE_DEPTH */ + {CCU_INVALIDATE_COLOR, false}, /* FD_CCU_INVALIDATE_COLOR */ + {CCU_RESOLVE_CLEAN, false}, /* FD_CCU_FLUSH_BLIT_CACHE */ + {CCU_FLUSH_DEPTH, false}, /* FD_CCU_FLUSH_DEPTH */ + {CCU_FLUSH_COLOR, false}, /* FD_CCU_FLUSH_COLOR */ + {LRZ_CLEAR, false}, /* FD_LRZ_CLEAR */ + {LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */ + {BLIT, false}, /* FD_BLIT */ + {LABEL, false}, /* FD_LABEL */ +}; + +#endif \ No newline at end of file diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml index c01a5b3eb53..0ea19770aed 100644 --- a/src/freedreno/registers/adreno/a6xx.xml +++ b/src/freedreno/registers/adreno/a6xx.xml @@ -2107,8 +2107,8 @@ to upconvert to 32b float internally? - - + + @@ -4001,7 +4001,7 @@ to upconvert to 32b float internally? 
- + diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index 39814b62ae3..044202f81e9 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -22,6 +22,8 @@ #include "tu_image.h" #include "tu_tracepoints.h" +#include "common/freedreno_gpu_event.h" + static const VkOffset2D blt_no_coord = { ~0, ~0 }; static uint32_t @@ -236,6 +238,7 @@ fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format, } } +template static void r2d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -257,15 +260,16 @@ r2d_src(struct tu_cmd_buffer *cmd, (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) | A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); + tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5); tu_cs_emit(cs, src_info); tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); - tu_cs_image_ref_2d(cs, iview, layer, true); + tu_cs_image_ref_2d(cs, iview, layer, true); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3); + tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS({}).reg, 3); tu_cs_image_flag_ref(cs, iview, layer); } +template static void r2d_src_depth(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -273,17 +277,18 @@ r2d_src_depth(struct tu_cmd_buffer *cmd, uint32_t layer, VkFilter filter) { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); + tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5); tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO)); tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE); tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer); /* SP_PS_2D_SRC_PITCH has shifted pitch field */ - tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_PITCH(.pitch = iview->depth_pitch).value); + tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3); + tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS({}).reg, 3); tu_cs_image_flag_ref(cs, &iview->view, layer); } 
+template static void r2d_src_stencil(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -291,13 +296,14 @@ r2d_src_stencil(struct tu_cmd_buffer *cmd, uint32_t layer, VkFilter filter) { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); + tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5); tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS); tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE); tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); - tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_PITCH(.pitch = iview->stencil_pitch).value); + tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value); } +template static void r2d_src_buffer(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -311,17 +317,18 @@ r2d_src_buffer(struct tu_cmd_buffer *cmd, fixup_src_format(&format, dst_format, &color_format); tu_cs_emit_regs(cs, - A6XX_SP_PS_2D_SRC_INFO( + SP_PS_2D_SRC_INFO(CHIP, .color_format = color_format, .color_swap = fmt.swap, .srgb = util_format_is_srgb(format), .unk20 = 1, .unk22 = 1), - A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height), - A6XX_SP_PS_2D_SRC(.qword = va), - A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch)); + SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height), + SP_PS_2D_SRC(CHIP, .qword = va), + SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch)); } +template static void r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, enum pipe_format src_format) @@ -336,7 +343,7 @@ r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt; tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); tu_cs_emit(cs, dst_info); - tu_cs_image_ref_2d(cs, iview, layer, false); + tu_cs_image_ref_2d(cs, iview, layer, false); tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3); tu_cs_image_flag_ref(cs, iview, layer); @@ -381,6 +388,7 @@ r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t 
A6XX_RB_2D_DST_PITCH(pitch)); } +template static void r2d_setup_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -413,7 +421,7 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, } tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); - tu_cs_emit(cs, unknown_8c01); + tu_cs_emit(cs, unknown_8c01); // TODO: seem to be always 0 on A7XX uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL( .rotate = (enum a6xx_rotation) blit_param, @@ -431,10 +439,15 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); tu_cs_emit(cs, blit_cntl); + if (CHIP > A6XX) { + tu_cs_emit_pkt4(cs, REG_A7XX_SP_PS_UNKNOWN_B2D2, 1); + tu_cs_emit(cs, 0x20000000); + } + if (fmt == FMT6_10_10_10_2_UNORM_DEST) fmt = FMT6_16_16_16_16_FLOAT; - tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT( + tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP, .sint = util_format_is_pure_sint(dst_format), .uint = util_format_is_pure_uint(dst_format), .color_format = fmt, @@ -442,6 +455,7 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, .mask = 0xf)); } +template static void r2d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -456,10 +470,10 @@ r2d_setup(struct tu_cmd_buffer *cmd, assert(samples == VK_SAMPLE_COUNT_1_BIT); if (!cmd->state.pass) { - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); } - r2d_setup_common(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false); + r2d_setup_common(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false); } static void @@ -809,15 +823,15 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, .cs_bindless = 0x1f, .gfx_bindless = 0x1f,)); - tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs); - tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL); - tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL); - tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL); - tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs); + tu6_emit_xs_config(cs, 
MESA_SHADER_VERTEX, vs); + tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL); + tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL); + tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL); + tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs); struct tu_pvtmem_config pvtmem = {}; - tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova); - tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova); + tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova); + tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova); tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0()); tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0()); @@ -836,13 +850,13 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, } tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL()); - tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs); + tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs); /* REPL_MODE for varying with RECTLIST (2 vertices only) */ tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0)); tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0)); - tu6_emit_fs_inputs(cs, fs); + tu6_emit_fs_inputs(cs, fs); tu_cs_emit_regs(cs, A6XX_GRAS_CL_CNTL( @@ -1395,7 +1409,7 @@ r3d_setup(struct tu_cmd_buffer *cmd, fixup_dst_format(src_format, &dst_format, &fmt); if (!cmd->state.pass) { - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff); } @@ -1450,7 +1464,7 @@ r3d_setup(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true)); if (cmd->state.prim_generated_query_running_before_rp) { - tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); + tu_emit_event_write(cmd, cs, FD_STOP_PRIMITIVE_CTRS); } if (cmd->state.predication_active) { @@ -1493,7 +1507,7 @@ r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false)); if (cmd->state.prim_generated_query_running_before_rp) { - tu6_emit_event_write(cmd, 
cs, START_PRIMITIVE_CTRS); + tu_emit_event_write(cmd, cs, FD_START_PRIMITIVE_CTRS); } } @@ -1537,20 +1551,22 @@ struct blit_ops { struct tu_cs *cs); }; +template static const struct blit_ops r2d_ops = { .coords = r2d_coords, .clear_value = r2d_clear_value, - .src = r2d_src, - .src_buffer = r2d_src_buffer, - .dst = r2d_dst, + .src = r2d_src, + .src_buffer = r2d_src_buffer, + .dst = r2d_dst, .dst_depth = r2d_dst_depth, .dst_stencil = r2d_dst_stencil, .dst_buffer = r2d_dst_buffer, - .setup = r2d_setup, + .setup = r2d_setup, .run = r2d_run, .teardown = r2d_teardown, }; +template static const struct blit_ops r3d_ops = { .coords = r3d_coords, .clear_value = r3d_clear_value, @@ -1629,13 +1645,14 @@ copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask) } } +template void tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image, const VkClearValue *value) { - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; /* It is assumed that LRZ cache is invalidated at this point for * the writes here to become visible to LRZ. @@ -1644,7 +1661,7 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd, * LRZ via CCU. Don't need to invalidate CCU since we are presumably * writing whole cache lines we assume to be 64 bytes. 
*/ - tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS); + tu_emit_event_write(cmd, &cmd->cs, FD_CACHE_FLUSH); ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false, @@ -1665,13 +1682,15 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd, TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE | TU_CMD_FLAG_WAIT_FOR_IDLE; } +TU_GENX(tu6_clear_lrz); +template void tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image) { - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; VkClearValue clear = {}; clear.color.uint32[0] = 0xffffffff; @@ -1687,6 +1706,7 @@ tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); ops->teardown(cmd, cs); } +TU_GENX(tu6_dirty_lrz_fc); static void tu_image_view_copy_blit(struct fdl6_view *iview, @@ -1744,6 +1764,7 @@ tu_image_view_blit(struct fdl6_view *iview, tu_image_view_copy_blit(iview, image, format, subres, layer, false); } +template static void tu6_blit_image(struct tu_cmd_buffer *cmd, struct tu_image *src_image, @@ -1751,7 +1772,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, const VkImageBlit2 *info, VkFilter filter) { - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; bool z_scale = false; uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z; @@ -1801,7 +1822,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK || filter == VK_FILTER_CUBIC_EXT || z_scale) { - ops = &r3d_ops; + ops = &r3d_ops; blit_param = z_scale ? 
R3D_Z_SCALE : 0; } @@ -1817,7 +1838,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, tu6_plane_index(src_image->vk.format, info->srcSubresource.aspectMask)); trace_start_blit(&cmd->trace, cs, - ops == &r3d_ops, + ops == &r3d_ops, src_image->vk.format, dst_image->vk.format, layers); @@ -1826,7 +1847,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, blit_param, false, dst_image->layout[0].ubwc, (VkSampleCountFlagBits) dst_image->layout[0].nr_samples); - if (ops == &r3d_ops) { + if (ops == &r3d_ops) { const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y, info->srcOffsets[0].x, info->srcOffsets[0].y, info->dstOffsets[1].x, info->dstOffsets[1].y, @@ -1873,6 +1894,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, trace_end_blit(&cmd->trace, cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, const VkBlitImageInfo2* pBlitImageInfo) @@ -1892,11 +1914,11 @@ tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, u_foreach_bit(b, region.dstSubresource.aspectMask) { region.srcSubresource.aspectMask = BIT(b); region.dstSubresource.aspectMask = BIT(b); - tu6_blit_image(cmd, src_image, dst_image, &region, pBlitImageInfo->filter); + tu6_blit_image(cmd, src_image, dst_image, &region, pBlitImageInfo->filter); } continue; } - tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i, + tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i, pBlitImageInfo->filter); } @@ -1904,6 +1926,7 @@ tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, tu_disable_lrz(cmd, &cmd->cs, dst_image); } } +TU_GENX(tu_CmdBlitImage2KHR); static void copy_compressed(VkFormat format, @@ -1931,6 +1954,7 @@ copy_compressed(VkFormat format, *height = DIV_ROUND_UP(*height, block_height); } +template static void tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, struct tu_buffer *src_buffer, @@ -1943,7 +1967,7 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, copy_format(dst_image->vk.format, info->imageSubresource.aspectMask); enum 
pipe_format dst_format = copy_format(dst_image->vk.format, info->imageSubresource.aspectMask); - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; /* special case for buffer to stencil */ if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && @@ -1953,7 +1977,7 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, /* note: could use "R8_UNORM" when no UBWC */ if (src_format == PIPE_FORMAT_Y8_UNORM) - ops = &r3d_ops; + ops = &r3d_ops; VkOffset3D offset = info->imageOffset; VkExtent3D extent = info->imageExtent; @@ -1996,6 +2020,7 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, ops->teardown(cmd, cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) @@ -2005,14 +2030,16 @@ tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer); for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i) - tu_copy_buffer_to_image(cmd, src_buffer, dst_image, + tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pCopyBufferToImageInfo->pRegions + i); if (dst_image->lrz_height) { tu_disable_lrz(cmd, &cmd->cs, dst_image); } } +TU_GENX(tu_CmdCopyBufferToImage2KHR); +template static void tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, struct tu_image *src_image, @@ -2025,7 +2052,7 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, copy_format(src_image->vk.format, info->imageSubresource.aspectMask); enum pipe_format src_format = copy_format(src_image->vk.format, info->imageSubresource.aspectMask); - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { @@ -2034,7 +2061,7 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, /* note: could use "R8_UNORM" when no UBWC */ if (dst_format == PIPE_FORMAT_Y8_UNORM) - 
ops = &r3d_ops; + ops = &r3d_ops; VkOffset3D offset = info->imageOffset; VkExtent3D extent = info->imageExtent; @@ -2075,6 +2102,7 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, ops->teardown(cmd, cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) @@ -2084,9 +2112,10 @@ tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer); for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i) - tu_copy_image_to_buffer(cmd, src_image, dst_buffer, + tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pCopyImageToBufferInfo->pRegions + i); } +TU_GENX(tu_CmdCopyImageToBuffer2KHR); /* Tiled formats don't support swapping, which means that we can't support * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some @@ -2117,17 +2146,18 @@ image_is_r8g8(struct tu_image *image) vk_format_get_nr_components(image->vk.format) == 2; } +template static void tu_copy_image_to_image(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_image *dst_image, const VkImageCopy2 *info) { - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; if (dst_image->layout[0].nr_samples > 1) - ops = &r3d_ops; + ops = &r3d_ops; enum pipe_format format = PIPE_FORMAT_NONE; VkOffset3D src_offset = info->srcOffset; @@ -2160,7 +2190,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, /* note: could use "R8_UNORM" when no UBWC */ if (dst_format == PIPE_FORMAT_Y8_UNORM || src_format == PIPE_FORMAT_Y8_UNORM) - ops = &r3d_ops; + ops = &r3d_ops; bool use_staging_blit = false; @@ -2258,8 +2288,8 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, /* When executed by the user there has to be a pipeline barrier here, * but since we're doing it manually we'll have to flush ourselves. 
*/ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_COLOR); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); tu_cs_emit_wfi(cs); const struct fdl_view_args copy_from_args = { @@ -2303,6 +2333,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, ops->teardown(cmd, cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, const VkCopyImageInfo2* pCopyImageInfo) @@ -2317,12 +2348,12 @@ tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, u_foreach_bit(b, info.dstSubresource.aspectMask) { info.srcSubresource.aspectMask = BIT(b); info.dstSubresource.aspectMask = BIT(b); - tu_copy_image_to_image(cmd, src_image, dst_image, &info); + tu_copy_image_to_image(cmd, src_image, dst_image, &info); } continue; } - tu_copy_image_to_image(cmd, src_image, dst_image, + tu_copy_image_to_image(cmd, src_image, dst_image, pCopyImageInfo->pRegions + i); } @@ -2330,7 +2361,9 @@ tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, tu_disable_lrz(cmd, &cmd->cs, dst_image); } } +TU_GENX(tu_CmdCopyImage2KHR); +template static void copy_buffer(struct tu_cmd_buffer *cmd, uint64_t dst_va, @@ -2338,7 +2371,7 @@ copy_buffer(struct tu_cmd_buffer *cmd, uint64_t size, uint32_t block_size) { - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; enum pipe_format format = block_size == 4 ? 
PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM; uint64_t blocks = size / block_size; @@ -2364,6 +2397,7 @@ copy_buffer(struct tu_cmd_buffer *cmd, ops->teardown(cmd, cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo) @@ -2374,13 +2408,15 @@ tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i]; - copy_buffer(cmd, + copy_buffer(cmd, dst_buffer->iova + region->dstOffset, src_buffer->iova + region->srcOffset, region->size, 1); } } +TU_GENX(tu_CmdCopyBuffer2KHR); +template VKAPI_ATTR void VKAPI_CALL tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, @@ -2399,9 +2435,11 @@ tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, } memcpy(tmp.map, pData, dataSize); - copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4); + copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4); } +TU_GENX(tu_CmdUpdateBuffer); +template VKAPI_ATTR void VKAPI_CALL tu_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, @@ -2411,7 +2449,7 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize); @@ -2441,7 +2479,9 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, ops->teardown(cmd, cs); } +TU_GENX(tu_CmdFillBuffer); +template VKAPI_ATTR void VKAPI_CALL tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, const VkResolveImageInfo2* pResolveImageInfo) @@ -2449,7 +2489,7 @@ tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage); TU_FROM_HANDLE(tu_image, dst_image, 
pResolveImageInfo->dstImage); - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; enum pipe_format src_format = @@ -2482,6 +2522,7 @@ tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, ops->teardown(cmd, cs); } +TU_GENX(tu_CmdResolveImage2KHR); #define for_each_layer(layer, layer_mask, layers) \ for (uint32_t layer = 0; \ @@ -2489,6 +2530,7 @@ tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, layer++) \ if (!layer_mask || (layer_mask & BIT(layer))) +template static void resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2502,7 +2544,7 @@ resolve_sysmem(struct tu_cmd_buffer *cmd, bool src_separate_ds, bool dst_separate_ds) { - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format); @@ -2517,9 +2559,9 @@ resolve_sysmem(struct tu_cmd_buffer *cmd, for_each_layer(i, layer_mask, layers) { if (src_separate_ds) { if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) { - r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST); + r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST); } else { - r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST); + r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST); } } else { ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format); @@ -2543,6 +2585,7 @@ resolve_sysmem(struct tu_cmd_buffer *cmd, trace_end_sysmem_resolve(&cmd->trace, cs); } +template void tu_resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2560,19 +2603,21 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd, bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT; if (dst_separate_ds) { - resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT, + resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT, src, dst, layer_mask, layers, rect, src_separate_ds, dst_separate_ds); - resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT, 
+ resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT, src, dst, layer_mask, layers, rect, src_separate_ds, dst_separate_ds); } else { - resolve_sysmem(cmd, cs, src->image->vk.format, dst->image->vk.format, + resolve_sysmem(cmd, cs, src->image->vk.format, dst->image->vk.format, src, dst, layer_mask, layers, rect, src_separate_ds, dst_separate_ds); } } +TU_GENX(tu_resolve_sysmem); +template static void clear_image(struct tu_cmd_buffer *cmd, struct tu_image *image, @@ -2597,7 +2642,7 @@ clear_image(struct tu_cmd_buffer *cmd, assert(range->baseArrayLayer == 0); } - const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops; + const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops; ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc, (VkSampleCountFlagBits) image->layout[0].nr_samples); @@ -2633,6 +2678,7 @@ clear_image(struct tu_cmd_buffer *cmd, ops->teardown(cmd, cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdClearColorImage(VkCommandBuffer commandBuffer, VkImage image_h, @@ -2645,9 +2691,11 @@ tu_CmdClearColorImage(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_image, image, image_h); for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT); + clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT); } +TU_GENX(tu_CmdClearColorImage); +template VKAPI_ATTR void VKAPI_CALL tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, VkImage image_h, @@ -2665,16 +2713,18 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { /* can't clear both depth and stencil at once, split up the aspect mask */ u_foreach_bit(b, range->aspectMask) - clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b)); + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b)); continue; 
} - clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); } tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges); } +TU_GENX(tu_CmdClearDepthStencilImage); +template static void tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, @@ -2758,7 +2808,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true)); if (cmd->state.prim_generated_query_running_before_rp) { - tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); + tu_emit_event_write(cmd, cs, FD_STOP_PRIMITIVE_CTRS); } tu_cs_emit_regs(cs, @@ -2845,7 +2895,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false)); if (cmd->state.prim_generated_query_running_before_rp) { - tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS); + tu_emit_event_write(cmd, cs, FD_START_PRIMITIVE_CTRS); } trace_end_sysmem_clear_all(&cmd->trace, cs); @@ -2928,6 +2978,7 @@ pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t #undef PACK_F } +template static void clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2954,9 +3005,10 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); tu_cs_emit_array(cs, clear_vals, 4); - tu6_emit_event_write(cmd, cs, BLIT); + tu_emit_event_write(cmd, cs, FD_BLIT); } +template static void tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2980,15 +3032,15 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t layer = i + base_layer; if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) { - clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, 
tu_attachment_gmem_offset(cmd, att, layer), value); } if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) { - clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att, layer), value); } } else { - clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), + clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), tu_attachment_gmem_offset(cmd, att, layer), value); } } @@ -2996,6 +3048,7 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, trace_end_gmem_clear(&cmd->trace, cs); } +template static void tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, @@ -3029,7 +3082,7 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, if (a == VK_ATTACHMENT_UNUSED) continue; - tu_emit_clear_gmem_attachment(cmd, cs, a, rects[i].baseArrayLayer, + tu_emit_clear_gmem_attachment(cmd, cs, a, rects[i].baseArrayLayer, rects[i].layerCount, subpass->multiview_mask, attachments[j].aspectMask, @@ -3038,6 +3091,7 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, } } +template VKAPI_ATTR void VKAPI_CALL tu_CmdClearAttachments(VkCommandBuffer commandBuffer, uint32_t attachmentCount, @@ -3051,7 +3105,7 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, /* sysmem path behaves like a draw, note we don't have a way of using different * flushes for sysmem/gmem, so this needs to be outside of the cond_exec */ - tu_emit_cache_flush_renderpass(cmd); + tu_emit_cache_flush_renderpass(cmd); for (uint32_t j = 0; j < attachmentCount; j++) { if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0) @@ -3069,7 +3123,7 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, * doesn't know the GMEM layout that will be chosen by the primary. 
*/ if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) { - tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); + tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); return; } @@ -3089,7 +3143,7 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, if (a != VK_ATTACHMENT_UNUSED) { const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; if (att->cond_load_allowed || att->cond_store_allowed) { - tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); + tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); return; } } @@ -3097,14 +3151,16 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, /* Otherwise, emit 2D blits for gmem rendering. */ tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); - tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); + tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); tu_cond_exec_end(cs); tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); + tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); tu_cond_exec_end(cs); } +TU_GENX(tu_CmdClearAttachments); +template static void clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3117,12 +3173,12 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd, const struct tu_framebuffer *fb = cmd->state.framebuffer; const struct tu_image_view *iview = cmd->state.attachments[a]; const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views; - const struct blit_ops *ops = &r2d_ops; + const struct blit_ops *ops = &r2d_ops; const VkClearValue *value = &cmd->state.clear_values[a]; if (cmd->state.pass->attachments[a].samples > 1) - ops = &r3d_ops; + ops = &r3d_ops; - trace_start_sysmem_clear(&cmd->trace, cs, 
vk_format, ops == &r3d_ops, + trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops, cmd->state.pass->attachments[a].samples); ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled, @@ -3149,6 +3205,7 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd, trace_end_sysmem_clear(&cmd->trace, cs); } +template void tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3162,15 +3219,15 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { - clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT, + clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT, a, true); } if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { - clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, + clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, a, true); } } else { - clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask, + clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask, a, false); } @@ -3184,17 +3241,19 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, * beforehand as depth should already be flushed. 
*/ if (vk_format_is_depth_or_stencil(attachment->format)) { - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_COLOR); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_DEPTH); + tu_emit_event_write(cmd, cs, FD_CCU_INVALIDATE_DEPTH); } else { - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_COLOR); + tu_emit_event_write(cmd, cs, FD_CCU_INVALIDATE_COLOR); } tu_cs_emit_wfi(cs); } +TU_GENX(tu_clear_sysmem_attachment); +template void tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3206,12 +3265,14 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, if (!attachment->clear_mask) return; - tu_emit_clear_gmem_attachment(cmd, cs, a, 0, cmd->state.framebuffer->layers, + tu_emit_clear_gmem_attachment(cmd, cs, a, 0, cmd->state.framebuffer->layers, attachment->clear_views, attachment->clear_mask, &cmd->state.clear_values[a]); } +TU_GENX(tu_clear_gmem_attachment); +template static void tu_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3247,7 +3308,7 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, } } else { tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO); - tu_cs_image_ref_2d(cs, &iview->view, i, false); + tu_cs_image_ref_2d(cs, &iview->view, i, false); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3); tu_cs_image_flag_ref(cs, &iview->view, i); @@ -3264,7 +3325,7 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); tu_cs_emit(cs, 0); - tu6_emit_event_write(cmd, cs, BLIT); + tu_emit_event_write(cmd, cs, FD_BLIT); } } @@ -3332,6 +3393,7 @@ fdm_apply_load_coords(struct tu_cs *cs, void *data, VkRect2D bin, r3d_coords_raw(cs, coords); } +template static void load_3d_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3359,7 +3421,7 @@ 
load_3d_blit(struct tu_cmd_buffer *cmd, /* Normal loads read directly from system memory, so we have to invalidate * UCHE in case it contains stale data. */ - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); /* Wait for CACHE_INVALIDATE to land */ tu_cs_emit_wfi(cs); @@ -3442,6 +3504,7 @@ tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd, tu_cs_emit_qw(cs, global_iova(cmd, dbg_one)); } +template void tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3480,16 +3543,16 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, tu_disable_draw_states(cmd, cs); if (load_common) - load_3d_blit(cmd, cs, iview, attachment, false); + load_3d_blit(cmd, cs, iview, attachment, false); if (load_stencil) - load_3d_blit(cmd, cs, iview, attachment, true); + load_3d_blit(cmd, cs, iview, attachment, true); } else { if (load_common) - tu_emit_blit(cmd, cs, iview, attachment, false, false); + tu_emit_blit(cmd, cs, iview, attachment, false, false); if (load_stencil) - tu_emit_blit(cmd, cs, iview, attachment, false, true); + tu_emit_blit(cmd, cs, iview, attachment, false, true); } if (cond_exec) @@ -3497,7 +3560,9 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, trace_end_gmem_load(&cmd->trace, cs); } +TU_GENX(tu_load_gmem_attachment); +template static void store_cp_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3510,8 +3575,9 @@ store_cp_blit(struct tu_cmd_buffer *cmd, uint32_t gmem_offset, uint32_t cpp) { - r2d_setup_common(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, - iview->view.ubwc_enabled, true); + r2d_setup_common(cmd, cs, src_format, dst_format, + VK_IMAGE_ASPECT_COLOR_BIT, 0, false, + iview->view.ubwc_enabled, true); if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (!separate_stencil) { @@ -3520,14 +3586,14 @@ store_cp_blit(struct tu_cmd_buffer *cmd, r2d_dst_stencil(cs, iview, layer); } } else { - r2d_dst(cs, &iview->view, layer, src_format); + 
r2d_dst(cs, &iview->view, layer, src_format); } enum a6xx_format fmt = blit_format_texture(src_format, TILE6_2).fmt; fixup_src_format(&src_format, dst_format, &fmt); tu_cs_emit_regs(cs, - A6XX_SP_PS_2D_SRC_INFO( + SP_PS_2D_SRC_INFO(CHIP, .color_format = fmt, .tile_mode = TILE6_2, .color_swap = WZYX, @@ -3537,12 +3603,12 @@ store_cp_blit(struct tu_cmd_buffer *cmd, !util_format_is_depth_or_stencil(dst_format), .unk20 = 1, .unk22 = 1), - A6XX_SP_PS_2D_SRC_SIZE( .width = iview->vk.extent.width, .height = iview->vk.extent.height), - A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset), - A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp)); + SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height), + SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset), + SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp)); /* sync GMEM writes with CACHE. */ - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); /* Wait for CACHE_INVALIDATE to land */ tu_cs_emit_wfi(cs); @@ -3553,9 +3619,10 @@ store_cp_blit(struct tu_cmd_buffer *cmd, * sysmem, and we generally assume that GMEM renderpasses leave their * results in sysmem, so we need to flush manually here. */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_COLOR); } +template static void store_3d_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3598,7 +3665,7 @@ store_3d_blit(struct tu_cmd_buffer *cmd, r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp); /* sync GMEM writes with CACHE. */ - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); /* Wait for CACHE_INVALIDATE to land */ tu_cs_emit_wfi(cs); @@ -3612,7 +3679,7 @@ store_3d_blit(struct tu_cmd_buffer *cmd, * results in sysmem, so we need to flush manually here. 
The 3d blit path * writes to depth images as a color RT, so there's no need to flush depth. */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_COLOR); /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */ tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1); @@ -3720,6 +3787,7 @@ fdm_apply_store_coords(struct tu_cs *cs, void *data, VkRect2D bin, A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1)); } +template void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3773,9 +3841,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, /* use fast path when render area is aligned, except for unsupported resolve cases */ if (use_fast_path) { if (store_common) - tu_emit_blit(cmd, cs, iview, src, true, false); + tu_emit_blit(cmd, cs, iview, src, true, false); if (store_separate_stencil) - tu_emit_blit(cmd, cs, iview, src, true, true); + tu_emit_blit(cmd, cs, iview, src, true, true); if (cond_exec) { tu_end_load_store_cond_exec(cmd, cs, false); @@ -3808,11 +3876,11 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, for_each_layer(i, layer_mask, layers) { if (store_common) { - store_3d_blit(cmd, cs, iview, dst->samples, false, src_format, + store_3d_blit(cmd, cs, iview, dst->samples, false, src_format, dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp); } if (store_separate_stencil) { - store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT, + store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT, PIPE_FORMAT_S8_UINT, render_area, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples); } @@ -3846,11 +3914,11 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, state); } if (store_common) { - store_cp_blit(cmd, cs, iview, src->samples, false, src_format, + store_cp_blit(cmd, cs, iview, src->samples, false, src_format, dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp); } if (store_separate_stencil) { - 
store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT, + store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT, PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples); } } @@ -3862,3 +3930,4 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, trace_end_gmem_store(&cmd->trace, cs); } +TU_GENX(tu_store_gmem_attachment); diff --git a/src/freedreno/vulkan/tu_clear_blit.h b/src/freedreno/vulkan/tu_clear_blit.h index 778f5e045d5..e734a8401da 100644 --- a/src/freedreno/vulkan/tu_clear_blit.h +++ b/src/freedreno/vulkan/tu_clear_blit.h @@ -16,12 +16,15 @@ void tu_init_clear_blit_shaders(struct tu_device *dev); void tu_destroy_clear_blit_shaders(struct tu_device *dev); +template void tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); +template void tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image); +template void tu_resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -31,16 +34,19 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd, uint32_t layers, const VkRect2D *rect); +template void tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); +template void tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); +template void tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -49,6 +55,7 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, bool force_load); /* note: gmem store can also resolve */ +template void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 9905d3f315d..14d76083fb8 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -18,6 +18,8 @@ #include "tu_image.h" #include "tu_tracepoints.h" +#include "common/freedreno_gpu_event.h" + static void 
tu_clone_trace_range(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct u_trace_iterator begin, struct u_trace_iterator end) @@ -39,33 +41,43 @@ tu_clone_trace(struct tu_cmd_buffer *cmd, struct tu_cs *cs, u_trace_end_iterator(trace)); } -void -tu6_emit_event_write(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - enum vgt_event_type event) +template +static void +tu_emit_raw_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum vgt_event_type event, + bool needs_seqno) { - bool need_seqno = false; - switch (event) { - case CACHE_FLUSH_TS: - case WT_DONE_TS: - case RB_DONE_TS: - case PC_CCU_FLUSH_DEPTH_TS: - case PC_CCU_FLUSH_COLOR_TS: - case PC_CCU_RESOLVE_TS: - need_seqno = true; - break; - default: - break; + if (CHIP == A6XX) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, needs_seqno ? 4 : 1); + tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event)); + } else { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, needs_seqno ? 4 : 1); + tu_cs_emit(cs, + CP_EVENT_WRITE7_0(.event = event, + .write_src = EV_WRITE_USER_32B, + .write_dst = EV_DST_RAM, + .write_enabled = needs_seqno).value); } - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1); - tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event)); - if (need_seqno) { + if (needs_seqno) { tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy)); tu_cs_emit(cs, 0); } } +template +void +tu_emit_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum fd_gpu_event event) +{ + struct fd_gpu_event_info event_info = fd_gpu_events[event]; + tu_emit_raw_event_write(cmd, cs, event_info.raw_event, + event_info.needs_seqno); +} +TU_GENX(tu_emit_event_write); + /* Emits the tessfactor address to the top-level CS if it hasn't been already. 
* Updating this register requires a WFI if outstanding drawing is using it, but * tu6_init_hardware() will have WFIed before we started and no other draws @@ -132,6 +144,7 @@ tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) cmd->vsc_initialized = true; } +template static void tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, struct tu_cs *cs, @@ -155,20 +168,20 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, */ if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_INVALIDATE_COLOR)) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS); + tu_emit_event_write(cmd_buffer, cs, FD_CCU_FLUSH_COLOR); if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH | TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS); + tu_emit_event_write(cmd_buffer, cs, FD_CCU_FLUSH_DEPTH); if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR); + tu_emit_event_write(cmd_buffer, cs, FD_CCU_INVALIDATE_COLOR); if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH); + tu_emit_event_write(cmd_buffer, cs, FD_CCU_INVALIDATE_DEPTH); if (flushes & TU_CMD_FLAG_CACHE_FLUSH) - tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS); + tu_emit_event_write(cmd_buffer, cs, FD_CACHE_FLUSH); if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE) - tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd_buffer, cs, FD_CACHE_INVALIDATE); if (flushes & TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE) { - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, .cs_bindless = 0x1f, .gfx_bindless = 0x1f, )); @@ -182,26 +195,38 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, } /* "Normal" cache flushes outside the renderpass, that don't require any special handling */ +template void tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer) { - tu6_emit_flushes(cmd_buffer, &cmd_buffer->cs, 
&cmd_buffer->state.cache); + tu6_emit_flushes(cmd_buffer, &cmd_buffer->cs, &cmd_buffer->state.cache); } +TU_GENX(tu_emit_cache_flush); /* Renderpass cache flushes inside the draw_cs */ +template void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer) { if (!cmd_buffer->state.renderpass_cache.flush_bits && likely(!tu_env.debug)) return; - tu6_emit_flushes(cmd_buffer, &cmd_buffer->draw_cs, + tu6_emit_flushes(cmd_buffer, &cmd_buffer->draw_cs, &cmd_buffer->state.renderpass_cache); } +TU_GENX(tu_emit_cache_flush_renderpass); +template static struct fd_reg_pair -rb_ccu_cntl(struct tu_device *dev, uint32_t color_offset) +rb_ccu_cntl(struct tu_device *dev, bool gmem) { + if (CHIP == A7XX) { + return A6XX_RB_CCU_CNTL(.dword = gmem ? 0x68 : 0); + } + + uint32_t color_offset = gmem ? dev->physical_device->ccu_offset_gmem + : dev->physical_device->ccu_offset_bypass; + uint32_t color_offset_hi = color_offset >> 21; color_offset &= 0x1fffff; enum a6xx_ccu_color_cache_size cache_size = @@ -222,7 +247,7 @@ rb_ccu_cntl(struct tu_device *dev, uint32_t color_offset) * blits and draws). This deals with changing CCU state as well as the usual * cache flushing. */ - +template void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, struct tu_cs *cs, @@ -256,20 +281,17 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, TU_CMD_FLAG_WAIT_FOR_IDLE); } - tu6_emit_flushes(cmd_buffer, cs, &cmd_buffer->state.cache); + tu6_emit_flushes(cmd_buffer, cs, &cmd_buffer->state.cache); if (ccu_state != cmd_buffer->state.ccu_state) { - struct tu_physical_device *phys_dev = - cmd_buffer->device->physical_device; - tu_cs_emit_regs(cs, - rb_ccu_cntl(cmd_buffer->device, - ccu_state == TU_CMD_CCU_GMEM ? 
- phys_dev->ccu_offset_gmem : - phys_dev->ccu_offset_bypass)); + tu_cs_emit_regs(cs, rb_ccu_cntl(cmd_buffer->device, + ccu_state == TU_CMD_CCU_GMEM)); cmd_buffer->state.ccu_state = ccu_state; } } +TU_GENX(tu_emit_cache_flush_ccu); +template static void tu6_emit_zs(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass, @@ -298,7 +320,11 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format); tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); - tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value); + tu_cs_emit(cs, RB_DEPTH_BUFFER_INFO(CHIP, + .depth_format = fmt, + .tilemode = TILE6_3, + .losslesscompen = iview->view.ubwc_enabled, + ).value); if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) tu_cs_image_depth_ref(cs, iview, 0); else @@ -315,7 +341,10 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, attachment->format == VK_FORMAT_S8_UINT) { tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6); - tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value); + tu_cs_emit(cs, RB_STENCIL_INFO(CHIP, + .separate_stencil = true, + .tilemode = TILE6_3, + ).value); if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { tu_cs_image_stencil_ref(cs, iview, 0); tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment, 0)); @@ -390,19 +419,45 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1)); } +struct tu_bin_size_params { + enum a6xx_render_mode render_mode; + bool force_lrz_write_dis; + enum a6xx_buffers_location buffers_location; + unsigned lrz_feedback_zmode_mask; +}; + +template static void tu6_emit_bin_size(struct tu_cs *cs, - uint32_t bin_w, uint32_t bin_h, uint32_t flags) + uint32_t bin_w, + uint32_t bin_h, + struct tu_bin_size_params &&p) { - tu_cs_emit_regs(cs, - A6XX_GRAS_BIN_CONTROL(.binw = bin_w, - .binh = bin_h, - .dword = flags)); + if (CHIP == A6XX) { + tu_cs_emit_regs( + cs, A6XX_GRAS_BIN_CONTROL(.binw = bin_w, + .binh = 
bin_h, + .render_mode = p.render_mode, + .force_lrz_write_dis = p.force_lrz_write_dis, + .buffers_location = p.buffers_location, + .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, )); + } else { + tu_cs_emit_regs(cs, + A6XX_GRAS_BIN_CONTROL(.binw = bin_w, + .binh = bin_h, + .render_mode = p.render_mode, + .force_lrz_write_dis = p.force_lrz_write_dis, + .lrz_feedback_zmode_mask = + p.lrz_feedback_zmode_mask, )); + } - tu_cs_emit_regs(cs, - A6XX_RB_BIN_CONTROL(.binw = bin_w, - .binh = bin_h, - .dword = flags)); + tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, + .binw = bin_w, + .binh = bin_h, + .render_mode = p.render_mode, + .force_lrz_write_dis = p.force_lrz_write_dis, + .buffers_location = p.buffers_location, + .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, )); /* no flag for RB_BIN_CONTROL2... */ tu_cs_emit_regs(cs, @@ -410,8 +465,16 @@ tu6_emit_bin_size(struct tu_cs *cs, .binh = bin_h)); } +template static void tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, + const struct tu_subpass *subpass, + struct tu_cs *cs, + bool binning); + +template <> +void +tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass, struct tu_cs *cs, bool binning) @@ -469,6 +532,18 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, cntl); } +template <> +void +tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, + const struct tu_subpass *subpass, + struct tu_cs *cs, + bool binning) +{ + tu_cs_emit_regs( + cs, A7XX_RB_RENDER_CNTL(.binning = binning, .raster_mode = TYPE_TILED, + .raster_direction = LR_TB)); +} + static void tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align) { @@ -518,6 +593,7 @@ tu6_emit_window_scissor(struct tu_cs *cs, A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2)); } +template void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1) { @@ -528,7 +604,7 @@ tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1) A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1)); tu_cs_emit_regs(cs, - 
A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1)); + SP_WINDOW_OFFSET(CHIP, .x = x1, .y = y1)); tu_cs_emit_regs(cs, A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1)); @@ -712,6 +788,10 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, if (TU_DEBUG(SYSMEM)) return true; + /* A7XX TODO: Add gmem support */ + if (cmd->device->physical_device->info->chip >= 7) + return true; + /* can't fit attachments into gmem */ if (!cmd->state.tiling->possible) return true; @@ -774,6 +854,7 @@ tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, } } +template static void tu6_emit_tile_select(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -790,7 +871,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE); const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE); tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1); - tu6_emit_window_offset(cs, x1, y1); + tu6_emit_window_offset(cs, x1, y1); bool hw_binning = use_hw_binning(cmd); @@ -896,6 +977,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, } } +template static void tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -907,9 +989,10 @@ tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, const struct tu_image_view *dst = cmd->state.attachments[a]; const struct tu_image_view *src = cmd->state.attachments[gmem_a]; - tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area); + tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area); } +template static void tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -937,11 +1020,11 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd, * resolve case. However, a flush afterwards isn't needed because of the * last sentence and the fact that we're in sysmem mode. 
*/ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_COLOR); if (subpass->resolve_depth_stencil) - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_DEPTH); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); /* Wait for the flushes to land before using the 2D engine */ tu_cs_emit_wfi(cs); @@ -953,11 +1036,12 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd, uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); - tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a); + tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a); } } } +template static void tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { @@ -977,7 +1061,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) if (pass->attachments[a].gmem) { const bool cond_exec_allowed = cmd->state.tiling->binning_possible && cmd->state.pass->has_cond_load_store; - tu_store_gmem_attachment(cmd, cs, a, a, + tu_store_gmem_attachment(cmd, cs, a, a, fb->layers, subpass->multiview_mask, cond_exec_allowed); } @@ -988,7 +1072,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) uint32_t a = subpass->resolve_attachments[i].attachment; if (a != VK_ATTACHMENT_UNUSED) { uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); - tu_store_gmem_attachment(cmd, cs, a, gmem_a, fb->layers, + tu_store_gmem_attachment(cmd, cs, a, gmem_a, fb->layers, subpass->multiview_mask, false); } } @@ -1011,15 +1095,27 @@ tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs) cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE; } +template static void tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { struct tu_device *dev = cmd->device; const struct tu_physical_device *phys_dev = dev->physical_device; - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + if (CHIP == A6XX) { + 
tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); + } else { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR)); - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + tu_emit_event_write(cmd, cs, FD_CCU_INVALIDATE_COLOR); + tu_emit_event_write(cmd, cs, FD_CCU_INVALIDATE_DEPTH); + tu_emit_raw_event_write(cmd, cs, UNK_40, false); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); + tu_cs_emit_wfi(cs); + } + + tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true, .ds_state = true, @@ -1042,7 +1138,7 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) cmd->state.cache.pending_flush_bits &= ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE); - tu_cs_emit_regs(cs, rb_ccu_cntl(dev, phys_dev->ccu_offset_bypass)); + tu_cs_emit_regs(cs, rb_ccu_cntl(cmd->device, false)); cmd->state.ccu_state = TU_CMD_CCU_SYSMEM; for (size_t i = 0; i < ARRAY_SIZE(phys_dev->info->a6xx.magic_raw); i++) { @@ -1059,23 +1155,29 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_write_reg(cs, REG_A6XX_SP_DBG_ECO_CNTL, phys_dev->info->a6xx.magic.SP_DBG_ECO_CNTL); tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f); - tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44); + if (CHIP == A6XX) + tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44); tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL, phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL); - tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); - tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0); + if (CHIP == A6XX) { + tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); + tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0); + } tu_cs_emit_write_reg(cs, REG_A6XX_VPC_DBG_ECO_CNTL, phys_dev->info->a6xx.magic.VPC_DBG_ECO_CNTL); tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL, phys_dev->info->a6xx.magic.GRAS_DBG_ECO_CNTL); - tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_DBG_ECO_CNTL, - 
phys_dev->info->a6xx.magic.HLSQ_DBG_ECO_CNTL); + if (CHIP == A6XX) { + tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_DBG_ECO_CNTL, + phys_dev->info->a6xx.magic.HLSQ_DBG_ECO_CNTL); + } tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS, phys_dev->info->a6xx.magic.SP_CHICKEN_BITS); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0); // 2 on a740 ??? tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0); - tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = false)); + if (CHIP == A6XX) + tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = false)); tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, phys_dev->info->a6xx.magic.UCHE_UNKNOWN_0E12); tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, @@ -1096,12 +1198,16 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0); + + if (CHIP == A6XX) { + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0); + } + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0); tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false)); @@ -1113,14 +1219,16 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0); tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0); - 
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0); + if (CHIP == A6XX) { + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0); + } tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0); tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0); tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL, 0x000000a0 | A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL)); - tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc); + tu_cs_emit_regs(cs, HLSQ_CONTROL_5_REG(CHIP, .dword = 0xfc)); tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000); @@ -1138,6 +1246,14 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo, .bo_offset = gb_offset(bcolor_builtin))); + if (CHIP == A7XX) { + tu_cs_emit_regs(cs, A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_0(0), + A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_1(0x3fe05ff4), + A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_2(0x3fa0ebee), + A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_3(0x3f5193ed), + A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_4(0x3f0243f0), ); + } + tu_cs_sanity_check(cs); } @@ -1200,6 +1316,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); } +template static void tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { @@ -1293,7 +1410,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as * part of draws). 
*/ - tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS); + tu_emit_event_write(cmd, cs, FD_CACHE_FLUSH); tu_cs_emit_wfi(cs); @@ -1458,7 +1575,6 @@ tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *sub tu_emit_input_attachments(cmd, subpass, false)); } - static void tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd) { @@ -1470,6 +1586,7 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd) cmd->state.dirty |= TU_CMD_DIRTY_FDM; } +template static void tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result *autotune_result) @@ -1480,19 +1597,40 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, assert(fb->width > 0 && fb->height > 0); tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); - tu6_emit_window_offset(cs, 0, 0); + tu6_emit_window_offset(cs, 0, 0); - tu6_emit_bin_size(cs, 0, 0, - A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM) | - A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS); + tu6_emit_bin_size(cs, 0, 0, { + .render_mode = RENDERING_PASS, + .force_lrz_write_dis = true, + .buffers_location = BUFFERS_IN_SYSMEM, + .lrz_feedback_zmode_mask = 0x0, + }); + + if (CHIP == A7XX) { + tu_cs_emit_regs(cs, + A7XX_RB_UNKNOWN_8812(0x3ff)); // all buffers in sysmem + tu_cs_emit_regs(cs, + A7XX_RB_UNKNOWN_88E5(0x50120004)); + tu_cs_emit_regs(cs, + A7XX_RB_UNKNOWN_8E06(0x2080000)); + + /* These three have something to do with lrz/depth */ + tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_8007(0x0)); + tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_810B(0x3)); + tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_8113(0x4)); + + tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2)); + tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8E09(0x4)); + } tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); + /* A7XX TODO: blob doesn't use CP_SKIP_IB2_ENABLE_* */ tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); + 
tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); tu_cs_emit(cs, 0x1); @@ -1505,6 +1643,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_sanity_check(cs); } +template static void tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result *autotune_result) @@ -1514,7 +1653,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, /* Do any resolves of the last subpass. These are handled in the * tile_store_cs in the gmem path. */ - tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass); + tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass); tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); @@ -1526,6 +1665,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_sanity_check(cs); } +template static void tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result *autotune_result) @@ -1537,24 +1677,31 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); if (use_hw_binning(cmd)) { if (!cmd->vsc_initialized) { tu6_lazy_emit_vsc(cmd, cs); } - tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, - A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) | - A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, + { + .render_mode = BINNING_PASS, + .buffers_location = BUFFERS_IN_GMEM, + .lrz_feedback_zmode_mask = 0x6, + }); - tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true); + tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true); - tu6_emit_binning_pass(cmd, cs); + tu6_emit_binning_pass(cmd, cs); - tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, - A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS | - 
A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, + { + .render_mode = RENDERING_PASS, + .force_lrz_write_dis = true, + .buffers_location = BUFFERS_IN_GMEM, + .lrz_feedback_zmode_mask = 0x6, + }); tu_cs_emit_regs(cs, A6XX_VFD_MODE_CNTL(RENDERING_PASS)); @@ -1570,8 +1717,12 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1); tu_cs_emit(cs, 0x1); } else { - tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, - A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, + { + .render_mode = RENDERING_PASS, + .buffers_location = BUFFERS_IN_GMEM, + .lrz_feedback_zmode_mask = 0x6, + }); if (tiling->binning_possible) { /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since @@ -1589,12 +1740,13 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_sanity_check(cs); } +template static void tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot, const struct tu_image_view *fdm) { - tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot, fdm); + tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot, fdm); trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs); @@ -1602,12 +1754,12 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, * tile even with HW binning beforehand. Do not permit it. 
*/ if (cmd->state.prim_generated_query_running_before_rp) - tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); + tu_emit_event_write(cmd, cs, FD_STOP_PRIMITIVE_CTRS); tu_cs_emit_call(cs, &cmd->draw_cs); if (cmd->state.prim_generated_query_running_before_rp) - tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS); + tu_emit_event_write(cmd, cs, FD_START_PRIMITIVE_CTRS); if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); @@ -1631,6 +1783,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs); } +template static void tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result *autotune_result) @@ -1641,11 +1794,12 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_lrz_tiling_end(cmd, cs); - tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS); + tu_emit_event_write(cmd, cs, FD_CCU_FLUSH_BLIT_CACHE); tu_cs_sanity_check(cs); } +template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, struct tu_renderpass_result *autotune_result) @@ -1663,12 +1817,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, * called from tu6_render_tile(). */ tu_cs_begin(&cmd->tile_store_cs); - tu6_emit_tile_store(cmd, &cmd->tile_store_cs); + tu6_emit_tile_store(cmd, &cmd->tile_store_cs); tu_cs_end(&cmd->tile_store_cs); cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); - tu6_tile_render_begin(cmd, &cmd->cs, autotune_result); + tu6_tile_render_begin(cmd, &cmd->cs, autotune_result); /* Note: we reverse the order of walking the pipes and tiles on every * other row, to improve texture cache locality compared to raster order. 
@@ -1696,14 +1850,14 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, else tx = tile_row_i; uint32_t slot = slot_row + tx; - tu6_render_tile(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot, fdm); + tu6_render_tile(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot, fdm); } slot_row += tile_row_stride; } } } - tu6_tile_render_end(cmd, &cmd->cs, autotune_result); + tu6_tile_render_end(cmd, &cmd->cs, autotune_result); trace_end_render_pass(&cmd->trace, &cmd->cs); @@ -1725,13 +1879,14 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu_cs_discard_entries(&cmd->tile_store_cs); } +template static void tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, struct tu_renderpass_result *autotune_result) { cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); - tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs); @@ -1739,11 +1894,12 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); - tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); trace_end_render_pass(&cmd->trace, &cmd->cs); } +template void tu_cmd_render(struct tu_cmd_buffer *cmd_buffer) { @@ -1752,9 +1908,9 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer) struct tu_renderpass_result *autotune_result = NULL; if (use_sysmem_rendering(cmd_buffer, &autotune_result)) - tu_cmd_render_sysmem(cmd_buffer, autotune_result); + tu_cmd_render_sysmem(cmd_buffer, autotune_result); else - tu_cmd_render_tiles(cmd_buffer, autotune_result); + tu_cmd_render_tiles(cmd_buffer, autotune_result); /* Outside of renderpasses we assume all draw states are disabled. 
We do * this outside the draw CS for the normal case where 3d gmem stores aren't @@ -1971,7 +2127,7 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, switch (cmd_buffer->queue_family_index) { case TU_QUEUE_GENERAL: - tu6_init_hw(cmd_buffer, &cmd_buffer->cs); + TU_CALLX(cmd_buffer->device, tu6_init_hw)(cmd_buffer, &cmd_buffer->cs); break; default: break; @@ -2171,49 +2327,62 @@ tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, cmd->state.index_size = index_size; } +template static void tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd, VkPipelineBindPoint bind_point) { struct tu_descriptor_state *descriptors_state = tu_get_descriptors_state(cmd, bind_point); - uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value; + uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg; struct tu_cs *cs, state_cs; if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { - sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0); + sp_bindless_base_reg = __SP_BINDLESS_BASE_DESCRIPTOR(0, {}).reg; hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0); - hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f); - cmd->state.desc_sets = - tu_cs_draw_state(&cmd->sub_cs, &state_cs, - 4 + 4 * descriptors_state->max_sets_bound + + if (CHIP == A6XX) { + cmd->state.desc_sets = + tu_cs_draw_state(&cmd->sub_cs, &state_cs, + 4 + 4 * descriptors_state->max_sets_bound + (descriptors_state->dynamic_bound ? 6 : 0)); + } else { + cmd->state.desc_sets = + tu_cs_draw_state(&cmd->sub_cs, &state_cs, + 3 + 2 * descriptors_state->max_sets_bound + + (descriptors_state->dynamic_bound ? 
3 : 0)); + } cs = &state_cs; } else { assert(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE); - sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0); + sp_bindless_base_reg = __SP_CS_BINDLESS_BASE_DESCRIPTOR(0, {}).reg; hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0); - hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f); cs = &cmd->cs; } tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 2 * descriptors_state->max_sets_bound); tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound); + if (CHIP == A6XX) { + tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound); + tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound); + } /* Dynamic descriptors get the last descriptor set. */ if (descriptors_state->dynamic_bound) { tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2); tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); + if (CHIP == A6XX) { + tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2); + tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); + } } - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value)); + tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, + .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? 0x1f : 0, + .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? 
0x1f : 0, + )); if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { assert(cs->cur == cs->end); /* validate draw state size */ @@ -2625,6 +2794,7 @@ tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, tu_cond_exec_end(cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, @@ -2646,7 +2816,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2); tu_cs_emit_qw(cs, global_iova_arr(cmd, flush_base, i)); - tu6_emit_event_write(cmd, cs, (enum vgt_event_type) (FLUSH_SO_0 + i)); + tu_emit_event_write(cmd, cs, (enum fd_gpu_event) (FD_FLUSH_SO_0 + i)); } for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) { @@ -2662,7 +2832,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, /* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */ tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) | - CP_MEM_TO_REG_0_SHIFT_BY_2 | + COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) | 0x40000 | /* ??? 
*/ CP_MEM_TO_REG_0_UNK31 | CP_MEM_TO_REG_0_CNT(1)); @@ -2686,6 +2856,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, cmd->state.rp.xfb_used = true; } +TU_GENX(tu_CmdEndTransformFeedbackEXT); VKAPI_ATTR void VKAPI_CALL tu_CmdPushConstants(VkCommandBuffer commandBuffer, @@ -2710,6 +2881,7 @@ tu_flush_all_pending(struct tu_cache_state *cache) cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH; } +template <chip CHIP> VKAPI_ATTR VkResult VKAPI_CALL tu_EndCommandBuffer(VkCommandBuffer commandBuffer) { @@ -2729,7 +2901,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer) */ if (cmd_buffer->state.pass) { tu_flush_all_pending(&cmd_buffer->state.renderpass_cache); - tu_emit_cache_flush_renderpass(cmd_buffer); + tu_emit_cache_flush_renderpass<CHIP>(cmd_buffer); trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->draw_cs); } else { @@ -2737,7 +2909,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer) cmd_buffer->state.cache.flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH; - tu_emit_cache_flush(cmd_buffer); + tu_emit_cache_flush<CHIP>(cmd_buffer); trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs); } @@ -2748,6 +2920,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer) return vk_command_buffer_end(&cmd_buffer->vk); } +TU_GENX(tu_EndCommandBuffer); VKAPI_ATTR void VKAPI_CALL tu_CmdBindPipeline(VkCommandBuffer commandBuffer, @@ -3376,10 +3549,10 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, /* Emit any pending flushes.
*/ if (cmd->state.pass) { tu_flush_all_pending(&cmd->state.renderpass_cache); - tu_emit_cache_flush_renderpass(cmd); + TU_CALLX(cmd->device, tu_emit_cache_flush_renderpass)(cmd); } else { tu_flush_all_pending(&cmd->state.cache); - tu_emit_cache_flush(cmd); + TU_CALLX(cmd->device, tu_emit_cache_flush)(cmd); } for (uint32_t i = 0; i < commandBufferCount; i++) { @@ -3480,7 +3653,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, */ tu_restore_suspended_pass(cmd, cmd); - tu_cmd_render(cmd); + TU_CALLX(cmd->device, tu_cmd_render)(cmd); if (cmd->state.suspend_resume == SR_IN_CHAIN) cmd->state.suspend_resume = SR_NONE; else @@ -3574,6 +3747,7 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, tu_flush_for_stage(cache, src_stage, dst_stage); } +template <chip CHIP> static void tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd) { @@ -3603,7 +3777,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd) tu6_emit_blit_scissor(cmd, cs, true); emitted_scissor = true; } - tu_load_gmem_attachment(cmd, cs, i, cond_load_allowed, false); + tu_load_gmem_attachment<CHIP>(cmd, cs, i, cond_load_allowed, false); } } @@ -3616,7 +3790,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd) tu6_emit_blit_scissor(cmd, cs, false); emitted_scissor = true; } - tu_clear_gmem_attachment(cmd, cs, i); + tu_clear_gmem_attachment<CHIP>(cmd, cs, i); } } @@ -3624,6 +3798,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd) } /* Emits sysmem clears that are first used in this subpass.
*/ +template <chip CHIP> static void tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd) { @@ -3634,7 +3809,7 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd) for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) { struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i]; if (att->clear_mask && att->first_subpass_idx == subpass_idx) - tu_clear_sysmem_attachment(cmd, cs, i); + tu_clear_sysmem_attachment<CHIP>(cmd, cs, i); } tu_cond_exec_end(cs); /* sysmem */ } @@ -3646,17 +3821,18 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd) * VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT the loads may depend on the output of * a previous aliased attachment's store. */ +template <chip CHIP> static void tu_emit_subpass_begin(struct tu_cmd_buffer *cmd) { tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass); - tu_emit_subpass_begin_gmem(cmd); - tu_emit_subpass_begin_sysmem(cmd); + tu_emit_subpass_begin_gmem<CHIP>(cmd); + tu_emit_subpass_begin_sysmem<CHIP>(cmd); - tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs); + tu6_emit_zs<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs); tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs); - tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false); + tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs, false); tu_set_input_attachments(cmd, cmd->state.subpass); @@ -3665,6 +3841,7 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd) cmd->state.dirty |= TU_CMD_DIRTY_SUBPASS; } +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo *pRenderPassBegin, @@ -3735,12 +3912,14 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); tu_emit_renderpass_begin(cmd); - tu_emit_subpass_begin(cmd); + tu_emit_subpass_begin<CHIP>(cmd); if (pass->has_fdm) cmd->patchpoints_ctx = ralloc_parent(NULL); } +TU_GENX(tu_CmdBeginRenderPass2); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo) @@ -3868,7 +4047,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, if (!resuming) { tu_emit_renderpass_begin(cmd); - tu_emit_subpass_begin(cmd); + tu_emit_subpass_begin<CHIP>(cmd); } if (suspending && !resuming) { @@ -3891,7 +4070,9 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, if (resuming && cmd->state.suspend_resume == SR_NONE) cmd->state.suspend_resume = SR_IN_PRE_CHAIN; } +TU_GENX(tu_CmdBeginRendering); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo, @@ -3938,7 +4119,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); - tu_store_gmem_attachment(cmd, cs, a, gmem_a, fb->layers, + tu_store_gmem_attachment<CHIP>(cmd, cs, a, gmem_a, fb->layers, subpass->multiview_mask, false); if (!pass->attachments[a].gmem) @@ -3948,7 +4129,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
*/ perf_debug(cmd->device, "TODO: missing GMEM->GMEM resolve path\n"); - tu_load_gmem_attachment(cmd, cs, a, false, true); + tu_load_gmem_attachment<CHIP>(cmd, cs, a, false, true); } } @@ -3960,7 +4141,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); } - tu6_emit_sysmem_resolves(cmd, cs, subpass); + tu6_emit_sysmem_resolves<CHIP>(cmd, cs, subpass); if (cmd->state.tiling->possible) tu_cond_exec_end(cs); @@ -3971,8 +4152,9 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, if (cmd->state.subpass->feedback_invalidate) cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE; - tu_emit_subpass_begin(cmd); + tu_emit_subpass_begin<CHIP>(cmd); } +TU_GENX(tu_CmdNextSubpass2); static uint32_t tu6_user_consts_size(const struct tu_pipeline *pipeline, @@ -4350,6 +4532,7 @@ tu6_emit_fs_params(struct tu_cmd_buffer *cmd) tu_cs_set_writeable(&cmd->sub_cs, false); } +template <chip CHIP> static VkResult tu6_draw_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -4359,12 +4542,12 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, { const struct tu_pipeline *pipeline = &cmd->state.pipeline->base; struct tu_render_pass_state *rp = &cmd->state.rp; - + /* Emit state first, because it's needed for bandwidth calculations */ uint32_t dynamic_draw_state_dirty = 0; if (!BITSET_IS_EMPTY(cmd->vk.dynamic_graphics_state.dirty) || (cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS)) { - dynamic_draw_state_dirty = tu_emit_draw_state(cmd); + dynamic_draw_state_dirty = tu_emit_draw_state<CHIP>(cmd); } /* Fill draw stats for autotuner */ @@ -4386,7 +4569,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, if (cmd->vk.dynamic_graphics_state.ds.stencil.test_enable) rp->drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2; - tu_emit_cache_flush_renderpass(cmd); + tu_emit_cache_flush_renderpass<CHIP>(cmd); if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE) || @@ -4465,7 +4648,7 @@ tu6_draw_common(struct
tu_cmd_buffer *cmd, cmd->state.shader_const = tu6_emit_consts(cmd, pipeline, false); if (dirty & TU_CMD_DIRTY_DESC_SETS) - tu6_emit_descriptor_sets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); + tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) || @@ -4695,6 +4878,7 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd, cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS; } +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -4707,14 +4891,16 @@ tu_CmdDraw(VkCommandBuffer commandBuffer, tu6_emit_vs_params(cmd, 0, firstVertex, firstInstance); - tu6_draw_common(cmd, cs, false, vertexCount); + tu6_draw_common<CHIP>(cmd, cs, false, vertexCount); tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); tu_cs_emit(cs, instanceCount); tu_cs_emit(cs, vertexCount); } +TU_GENX(tu_CmdDraw); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, @@ -4745,7 +4931,7 @@ tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, tu6_emit_vs_params(cmd, i, draw->firstVertex, firstInstance); if (i == 0) - tu6_draw_common(cmd, cs, false, max_vertex_count); + tu6_draw_common<CHIP>(cmd, cs, false, max_vertex_count); if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) { tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); @@ -4759,7 +4945,9 @@ tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, tu_cs_emit(cs, draw->vertexCount); } } +TU_GENX(tu_CmdDrawMultiEXT); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, @@ -4773,7 +4961,7 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, tu6_emit_vs_params(cmd, 0, vertexOffset, firstInstance); - tu6_draw_common(cmd, cs, true, indexCount); + tu6_draw_common<CHIP>(cmd, cs, true, indexCount); tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); @@
-4783,7 +4971,9 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, tu_cs_emit_qw(cs, cmd->state.index_va); tu_cs_emit(cs, cmd->state.max_index_count); } +TU_GENX(tu_CmdDrawIndexed); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, @@ -4816,7 +5006,7 @@ tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, tu6_emit_vs_params(cmd, i, vertexOffset, firstInstance); if (i == 0) - tu6_draw_common(cmd, cs, true, max_index_count); + tu6_draw_common<CHIP>(cmd, cs, true, max_index_count); if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) { tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); @@ -4833,6 +5023,7 @@ tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, tu_cs_emit(cs, cmd->state.max_index_count); } } +TU_GENX(tu_CmdDrawMultiIndexedEXT); /* Various firmware bugs/inconsistencies mean that some indirect draw opcodes * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if @@ -4848,6 +5039,7 @@ draw_wfm(struct tu_cmd_buffer *cmd) cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME; } +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -4864,7 +5056,7 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk) draw_wfm(cmd); - tu6_draw_common(cmd, cs, false, 0); + tu6_draw_common<CHIP>(cmd, cs, false, 0); tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); @@ -4874,7 +5066,9 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, tu_cs_emit_qw(cs, buf->iova + offset); tu_cs_emit(cs, stride); } +TU_GENX(tu_CmdDrawIndirect); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -4891,7 +5085,7 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk) draw_wfm(cmd); - 
tu6_draw_common(cmd, cs, true, 0); + tu6_draw_common<CHIP>(cmd, cs, true, 0); tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); @@ -4903,7 +5097,9 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, tu_cs_emit_qw(cs, buf->iova + offset); tu_cs_emit(cs, stride); } +TU_GENX(tu_CmdDrawIndexedIndirect); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -4927,7 +5123,7 @@ tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, */ draw_wfm(cmd); - tu6_draw_common(cmd, cs, false, 0); + tu6_draw_common<CHIP>(cmd, cs, false, 0); tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); @@ -4938,7 +5134,9 @@ tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset); tu_cs_emit(cs, stride); } +TU_GENX(tu_CmdDrawIndirectCount); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -4957,7 +5155,7 @@ tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, draw_wfm(cmd); - tu6_draw_common(cmd, cs, true, 0); + tu6_draw_common<CHIP>(cmd, cs, true, 0); tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); @@ -4970,7 +5168,9 @@ tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset); tu_cs_emit(cs, stride); } +TU_GENX(tu_CmdDrawIndexedIndirectCount); +template <chip CHIP> VKAPI_ATTR void VKAPI_CALL tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, @@ -4993,7 +5193,7 @@ tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, tu6_emit_vs_params(cmd, 0, 0, firstInstance); - tu6_draw_common(cmd, cs, false, 0); + tu6_draw_common<CHIP>(cmd, cs, false, 0); tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB)); @@ -5002,6 +5202,7 @@
tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, tu_cs_emit(cs, counterOffset); tu_cs_emit(cs, vertexStride); } +TU_GENX(tu_CmdDrawIndirectByteCountEXT); struct tu_dispatch_info { @@ -5027,6 +5228,7 @@ struct tu_dispatch_info uint64_t indirect_offset; }; +template static void tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_compute_pipeline *pipeline, @@ -5099,7 +5301,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, } tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE); tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | @@ -5136,6 +5338,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, } } +template static void tu_dispatch(struct tu_cmd_buffer *cmd, const struct tu_dispatch_info *info) @@ -5175,21 +5378,21 @@ tu_dispatch(struct tu_cmd_buffer *cmd, */ if (emit_instrlen_workaround) { tu_cs_emit_regs(cs, A6XX_SP_FS_INSTRLEN(pipeline->instrlen)); - tu6_emit_event_write(cmd, cs, LABEL); + tu_emit_event_write(cmd, cs, FD_LABEL); } /* TODO: We could probably flush less if we add a compute_flush_bits * bitfield. 
*/ - tu_emit_cache_flush(cmd); + tu_emit_cache_flush(cmd); /* note: no reason to have this in a separate IB */ tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, &pipeline->base, true)); - tu_emit_compute_driver_params(cmd, cs, pipeline, info); + tu_emit_compute_driver_params(cmd, cs, pipeline, info); if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS) { - tu6_emit_descriptor_sets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE); + tu6_emit_descriptor_sets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE); tu_cs_emit_state_ib(cs, pipeline->base.load_state); } @@ -5201,21 +5404,21 @@ tu_dispatch(struct tu_cmd_buffer *cmd, const uint32_t *local_size = pipeline->local_size; const uint32_t *num_groups = info->blocks; tu_cs_emit_regs(cs, - A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3, - .localsizex = local_size[0] - 1, - .localsizey = local_size[1] - 1, - .localsizez = local_size[2] - 1), - A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]), - A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0), - A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]), - A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0), - A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]), - A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0)); + HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3, + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1), + HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = local_size[0] * num_groups[0]), + HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0), + HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = local_size[1] * num_groups[1]), + HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0), + HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = local_size[2] * num_groups[2]), + HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0)); tu_cs_emit_regs(cs, - A6XX_HLSQ_CS_KERNEL_GROUP_X(1), - A6XX_HLSQ_CS_KERNEL_GROUP_Y(1), - A6XX_HLSQ_CS_KERNEL_GROUP_Z(1)); + HLSQ_CS_KERNEL_GROUP_X(CHIP, 1), + HLSQ_CS_KERNEL_GROUP_Y(CHIP, 1), + HLSQ_CS_KERNEL_GROUP_Z(CHIP, 1)); trace_start_compute(&cmd->trace, cs, 
info->indirect != NULL, local_size[0], local_size[1], local_size[2], info->blocks[0], @@ -5248,12 +5451,13 @@ tu_dispatch(struct tu_cmd_buffer *cmd, * anyway when the next renderpass starts. */ if (emit_instrlen_workaround) { - tu6_emit_event_write(cmd, cs, LABEL); + tu_emit_event_write(cmd, cs, FD_LABEL); } tu_cs_emit_wfi(cs); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, @@ -5273,9 +5477,11 @@ tu_CmdDispatchBase(VkCommandBuffer commandBuffer, info.offsets[0] = base_x; info.offsets[1] = base_y; info.offsets[2] = base_z; - tu_dispatch(cmd_buffer, &info); + tu_dispatch(cmd_buffer, &info); } +TU_GENX(tu_CmdDispatchBase); +template VKAPI_ATTR void VKAPI_CALL tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -5288,8 +5494,9 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer, info.indirect = buffer; info.indirect_offset = offset; - tu_dispatch(cmd_buffer, &info); + tu_dispatch(cmd_buffer, &info); } +TU_GENX(tu_CmdDispatchIndirect); VKAPI_ATTR void VKAPI_CALL tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, @@ -5304,7 +5511,7 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, tu_cs_end(&cmd_buffer->draw_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); - tu_cmd_render(cmd_buffer); + TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer); cmd_buffer->state.cache.pending_flush_bits |= cmd_buffer->state.renderpass_cache.pending_flush_bits; @@ -5336,7 +5543,7 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer) */ tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs); } else { - tu_cmd_render(cmd_buffer); + TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer); } tu_reset_render_pass(cmd_buffer); @@ -5481,6 +5688,7 @@ tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, tu_barrier(cmd_buffer, pDependencyInfo); } +template static void write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, VkPipelineStageFlags2 stageMask, unsigned value) @@ -5490,7 +5698,7 @@ 
write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */ assert(!cmd->state.pass); - tu_emit_cache_flush(cmd); + tu_emit_cache_flush(cmd); /* Flags that only require a top-of-pipe event. DrawIndirect parameters are * read by the CP, so the draw indirect stage counts as top-of-pipe too. @@ -5505,13 +5713,23 @@ write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, tu_cs_emit(cs, value); } else { /* Use a RB_DONE_TS event to wait for everything to complete. */ - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); - tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); + if (CHIP == A6XX) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); + tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); + } else { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = RB_DONE_TS, + .write_src = EV_WRITE_USER_32B, + .write_dst = EV_DST_RAM, + .write_enabled = true).value); + } + tu_cs_emit_qw(cs, event->bo->iova); tu_cs_emit(cs, value); } } +template VKAPI_ATTR void VKAPI_CALL tu_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, @@ -5528,9 +5746,11 @@ tu_CmdSetEvent2(VkCommandBuffer commandBuffer, for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; - write_event(cmd, event, src_stage_mask, 1); + write_event(cmd, event, src_stage_mask, 1); } +TU_GENX(tu_CmdSetEvent2); +template VKAPI_ATTR void VKAPI_CALL tu_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, @@ -5539,8 +5759,9 @@ tu_CmdResetEvent2(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_event, event, _event); - write_event(cmd, event, stageMask, 0); + write_event(cmd, event, stageMask, 0); } +TU_GENX(tu_CmdResetEvent2); VKAPI_ATTR void VKAPI_CALL tu_CmdWaitEvents2(VkCommandBuffer commandBuffer, @@ -5566,6 +5787,7 @@ tu_CmdWaitEvents2(VkCommandBuffer commandBuffer, 
tu_barrier(cmd, pDependencyInfos); } +template VKAPI_ATTR void VKAPI_CALL tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) @@ -5581,9 +5803,9 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, /* Wait for any writes to the predicate to land */ if (cmd->state.pass) - tu_emit_cache_flush_renderpass(cmd); + tu_emit_cache_flush_renderpass(cmd); else - tu_emit_cache_flush(cmd); + tu_emit_cache_flush(cmd); TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer); uint64_t iova = buf->iova + pConditionalRenderingBegin->offset; @@ -5607,6 +5829,7 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS)); tu_cs_emit_qw(cs, global_iova(cmd, predicate)); } +TU_GENX(tu_CmdBeginConditionalRenderingEXT); VKAPI_ATTR void VKAPI_CALL tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) @@ -5621,6 +5844,7 @@ tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) tu_cs_emit(cs, 0); } +template void tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits2 pipelineStage, @@ -5671,9 +5895,9 @@ tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, } if (cmd->state.pass) { - tu_emit_cache_flush_renderpass(cmd); + tu_emit_cache_flush_renderpass(cmd); } else { - tu_emit_cache_flush(cmd); + tu_emit_cache_flush(cmd); } if (is_top_of_pipe) { @@ -5682,8 +5906,16 @@ tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, tu_cs_emit(cs, marker); } else { /* Use a RB_DONE_TS event to wait for everything to complete. 
*/ - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); - tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); + if (CHIP == A6XX) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); + tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); + } else { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = RB_DONE_TS, + .write_src = EV_WRITE_USER_32B, + .write_dst = EV_DST_RAM, + .write_enabled = true).value); + } tu_cs_emit_qw(cs, va); tu_cs_emit(cs, marker); } @@ -5691,3 +5923,4 @@ tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, /* Make sure the result of this write is visible to others. */ tu_flush_for_access(cache, TU_ACCESS_CP_WRITE, TU_ACCESS_NONE); } +TU_GENX(tu_CmdWriteBufferMarker2AMD); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index f0abeeeeb63..52d8cc46f81 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -599,11 +599,14 @@ void tu_render_pass_state_merge(struct tu_render_pass_state *dst, VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer, const VkCommandBufferBeginInfo *pBeginInfo); +template void tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer); +template void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer); +template void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, struct tu_cs *cs, enum tu_cmd_ccu_state ccu_state); @@ -624,12 +627,16 @@ void tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, struct tu_cmd_buffer *suspended); +template void tu_cmd_render(struct tu_cmd_buffer *cmd); +enum fd_gpu_event : uint32_t; + +template void -tu6_emit_event_write(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - enum vgt_event_type event); +tu_emit_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum fd_gpu_event event); static inline struct tu_descriptor_state * tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h 
index 3d6a503bd80..0d99fe7b1c5 100644 --- a/src/freedreno/vulkan/tu_cs.h +++ b/src/freedreno/vulkan/tu_cs.h @@ -498,6 +498,7 @@ struct tu_reg_value { #define __bo_type struct tu_bo * #include "a6xx-pack.xml.h" +#include "adreno-pm4-pack.xml.h" #define __assert_eq(a, b) \ do { \ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 2f56d0f926d..c2f319f0c7d 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -612,7 +612,8 @@ tu_physical_device_init(struct tu_physical_device *device, goto fail_free_name; } switch (fd_dev_gen(&device->dev_id)) { - case 6: { + case 6: + case 7: { device->info = info; uint32_t depth_cache_size = device->info->num_ccu * device->info->a6xx.sysmem_per_ccu_cache_size; diff --git a/src/freedreno/vulkan/tu_dynamic_rendering.cc b/src/freedreno/vulkan/tu_dynamic_rendering.cc index a215131da41..841b7bf9ec2 100644 --- a/src/freedreno/vulkan/tu_dynamic_rendering.cc +++ b/src/freedreno/vulkan/tu_dynamic_rendering.cc @@ -152,14 +152,14 @@ tu_insert_dynamic_cmdbufs(struct tu_device *dev, old_cmds[i]->pre_chain.trace_renderpass_end); } - tu_cmd_render(cmd_buffer); + TU_CALLX(dev, tu_cmd_render)(cmd_buffer); tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3); tu_cs_emit_qw(&cmd_buffer->cs, global_iova(cmd_buffer, dynamic_rendering_fence)); tu_cs_emit(&cmd_buffer->cs, dev->dynamic_rendering_fence); - tu_EndCommandBuffer(tu_cmd_buffer_to_handle(cmd_buffer)); + TU_CALLX(dev, tu_EndCommandBuffer)(tu_cmd_buffer_to_handle(cmd_buffer)); util_dynarray_append(&cmds, struct tu_cmd_buffer *, cmd_buffer); cmd_buffer = NULL; break; @@ -223,4 +223,3 @@ tu_insert_dynamic_cmdbufs(struct tu_device *dev, return VK_SUCCESS; } - diff --git a/src/freedreno/vulkan/tu_image.cc b/src/freedreno/vulkan/tu_image.cc index ef98afdf4f7..cd6f365e6c4 100644 --- a/src/freedreno/vulkan/tu_image.cc +++ b/src/freedreno/vulkan/tu_image.cc @@ -139,16 +139,18 @@ tu_cs_image_depth_ref(struct tu_cs *cs, const struct 
tu_image_view *iview, uint3 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer); } +template void tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src) { tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer); /* SP_PS_2D_SRC_PITCH has shifted pitch field */ if (src) - tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_PITCH(.pitch = iview->pitch).value); + tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->pitch).value); else tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->pitch).value); } +TU_GENX(tu_cs_image_ref_2d); void tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) diff --git a/src/freedreno/vulkan/tu_image.h b/src/freedreno/vulkan/tu_image.h index fbcb244e83c..d5ee3dc9f71 100644 --- a/src/freedreno/vulkan/tu_image.h +++ b/src/freedreno/vulkan/tu_image.h @@ -90,6 +90,7 @@ enum pipe_format tu_format_for_aspect(enum pipe_format format, void tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); +template void tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src); diff --git a/src/freedreno/vulkan/tu_lrz.cc b/src/freedreno/vulkan/tu_lrz.cc index f4dda1752cc..a5f6454e28c 100644 --- a/src/freedreno/vulkan/tu_lrz.cc +++ b/src/freedreno/vulkan/tu_lrz.cc @@ -10,6 +10,8 @@ #include "tu_cs.h" #include "tu_image.h" +#include "common/freedreno_gpu_event.h" + /* See lrz.rst for how HW works. Here are only the implementation notes. 
* * There are a number of limitations when LRZ cannot be used: @@ -101,8 +103,8 @@ tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs) .disable_on_wrong_dir = true, )); - tu6_emit_event_write(cmd, cs, LRZ_CLEAR); - tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + tu_emit_event_write(cmd, cs, FD_LRZ_CLEAR); + tu_emit_event_write(cmd, cs, FD_LRZ_FLUSH); } static void @@ -319,12 +321,11 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to * CUR_DIR_UNSET. */ - tu6_emit_event_write(cmd, cs, LRZ_CLEAR); + tu_emit_event_write(cmd, cs, FD_LRZ_CLEAR); } if (!lrz->fast_clear && !invalidate_lrz) { - tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value); - + tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value); /* Even though we disable fast-clear we still have to dirty * fast-clear buffer because both secondary cmdbufs and following * renderpasses won't know that fast-clear is disabled. @@ -333,7 +334,7 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) * expect secondary cmdbufs. 
*/ if (lrz->image_view->image->lrz_fc_size) { - tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image); + tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image); } } } @@ -359,7 +360,7 @@ tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0)); } - tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + tu_emit_event_write(cmd, cs, FD_LRZ_FLUSH); /* If gpu_dir_tracking is enabled and lrz is not valid blob, at this point, * additionally clears direction buffer: @@ -400,10 +401,10 @@ tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) .enable = true, .fc_enable = true, )); - tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR); - tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH); + tu_emit_event_write(cmd, &cmd->cs, FD_LRZ_CLEAR); + tu_emit_event_write(cmd, &cmd->cs, FD_LRZ_FLUSH); } else { - tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value); + tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value); } } } @@ -411,7 +412,7 @@ tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) void tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH); + tu_emit_event_write(cmd, &cmd->cs, FD_LRZ_FLUSH); } /* Disable LRZ outside of renderpass. 
*/ @@ -473,11 +474,11 @@ tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd, .disable_on_wrong_dir = true, )); - tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR); - tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH); + tu_emit_event_write(cmd, &cmd->cs, FD_LRZ_CLEAR); + tu_emit_event_write(cmd, &cmd->cs, FD_LRZ_FLUSH); if (!fast_clear) { - tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil); + tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil); } } diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc index 2a9928c902e..284639adb12 100644 --- a/src/freedreno/vulkan/tu_pipeline.cc +++ b/src/freedreno/vulkan/tu_pipeline.cc @@ -317,19 +317,23 @@ tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb) return false; } -static const struct xs_config { +template +struct xs_config { uint16_t reg_sp_xs_ctrl; uint16_t reg_sp_xs_config; uint16_t reg_sp_xs_instrlen; uint16_t reg_hlsq_xs_ctrl; uint16_t reg_sp_xs_first_exec_offset; uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; -} xs_config[] = { +}; + +template +const xs_config xs_configs[] = { [MESA_SHADER_VERTEX] = { REG_A6XX_SP_VS_CTRL_REG0, REG_A6XX_SP_VS_CONFIG, REG_A6XX_SP_VS_INSTRLEN, - REG_A6XX_HLSQ_VS_CNTL, + CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL, REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET, REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET, }, @@ -337,7 +341,7 @@ static const struct xs_config { REG_A6XX_SP_HS_CTRL_REG0, REG_A6XX_SP_HS_CONFIG, REG_A6XX_SP_HS_INSTRLEN, - REG_A6XX_HLSQ_HS_CNTL, + CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL, REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET, REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET, }, @@ -345,7 +349,7 @@ static const struct xs_config { REG_A6XX_SP_DS_CTRL_REG0, REG_A6XX_SP_DS_CONFIG, REG_A6XX_SP_DS_INSTRLEN, - REG_A6XX_HLSQ_DS_CNTL, + CHIP == A6XX ? 
REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL, REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET, REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET, }, @@ -353,7 +357,7 @@ static const struct xs_config { REG_A6XX_SP_GS_CTRL_REG0, REG_A6XX_SP_GS_CONFIG, REG_A6XX_SP_GS_INSTRLEN, - REG_A6XX_HLSQ_GS_CNTL, + CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL, REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET, REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET, }, @@ -361,7 +365,7 @@ static const struct xs_config { REG_A6XX_SP_FS_CTRL_REG0, REG_A6XX_SP_FS_CONFIG, REG_A6XX_SP_FS_INSTRLEN, - REG_A6XX_HLSQ_FS_CNTL, + CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL, REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET, REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET, }, @@ -369,7 +373,7 @@ static const struct xs_config { REG_A6XX_SP_CS_CTRL_REG0, REG_A6XX_SP_CS_CONFIG, REG_A6XX_SP_CS_INSTRLEN, - REG_A6XX_HLSQ_CS_CNTL, + CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL, REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, }, @@ -412,12 +416,13 @@ tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs) return size; } +template void tu6_emit_xs_config(struct tu_cs *cs, gl_shader_stage stage, /* xs->type, but xs may be NULL */ const struct ir3_shader_variant *xs) { - const struct xs_config *cfg = &xs_config[stage]; + const struct xs_config *cfg = &xs_configs[stage]; if (!xs) { /* shader stage disabled */ @@ -442,7 +447,9 @@ tu6_emit_xs_config(struct tu_cs *cs, tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | A6XX_HLSQ_VS_CNTL_ENABLED); } +TU_GENX(tu6_emit_xs_config); +template void tu6_emit_xs(struct tu_cs *cs, gl_shader_stage stage, /* xs->type, but xs may be NULL */ @@ -450,7 +457,7 @@ tu6_emit_xs(struct tu_cs *cs, const struct tu_pvtmem_config *pvtmem, uint64_t binary_iova) { - const struct xs_config *cfg = &xs_config[stage]; + const struct xs_config *cfg = &xs_configs[stage]; if (!xs) { /* shader stage disabled */ @@ -540,16 +547,18 @@ tu6_emit_xs(struct tu_cs 
*cs, tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); - uint32_t shader_preload_size = - MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size); + if (CHIP == A6XX) { + uint32_t shader_preload_size = + MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size); - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); - tu_cs_emit_qw(cs, binary_iova); + tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | + CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); + tu_cs_emit_qw(cs, binary_iova); + } /* emit immediates */ @@ -636,6 +645,7 @@ tu6_emit_xs(struct tu_cs *cs, } } } +TU_GENX(tu6_emit_xs); static void tu6_emit_dynamic_offset(struct tu_cs *cs, @@ -661,16 +671,23 @@ tu6_emit_dynamic_offset(struct tu_cs *cs, } } +template static void tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable) { - /* Enable/disable shared constants */ - tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable)); + if (CHIP == A6XX) { + /* Enable/disable shared constants */ + tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable)); + } else { + assert(!enable); + } + tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true, .isammode = ISAMMODE_GL, .shared_consts_enable = enable)); } +template static void tu6_emit_cs_config(struct tu_cs *cs, const struct ir3_shader_variant *v, @@ -678,22 +695,22 @@ tu6_emit_cs_config(struct tu_cs *cs, uint64_t binary_iova) { bool shared_consts_enable = 
ir3_const_state(v)->shared_consts_enable; - tu6_emit_shared_consts_enable(cs, shared_consts_enable); + tu6_emit_shared_consts_enable(cs, shared_consts_enable); - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, .cs_state = true, .cs_ibo = true, .cs_shared_const = shared_consts_enable)); - tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v); - tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova); + tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v); + tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova); uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | A6XX_SP_CS_UNKNOWN_A9B1_UNK6); - if (cs->device->physical_device->info->a6xx.has_lpac) { + if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) { tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1); tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) | A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6); @@ -712,28 +729,65 @@ tu6_emit_cs_config(struct tu_cs *cs, enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64; enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx .supports_double_threadsize ? 
thrsz : THREAD128; - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); - tu_cs_emit(cs, - A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs)); - if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) { - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); - tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz)); - } - - if (cs->device->physical_device->info->a6xx.has_lpac) { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); + if (CHIP == A6XX) { + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); tu_cs_emit(cs, - A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); + A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs)); + if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) { + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); + tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz)); + } + + if (cs->device->physical_device->info->a6xx.has_lpac) { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); + tu_cs_emit(cs, + A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + tu_cs_emit(cs, 
A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | + A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); + } + } else { + enum a7xx_cs_yalign yalign = (v->local_size[1] % 8 == 0) ? CS_YALIGN_8 + : (v->local_size[1] % 4 == 0) ? CS_YALIGN_4 + : (v->local_size[1] % 2 == 0) ? CS_YALIGN_2 + : CS_YALIGN_1; + tu_cs_emit_regs( + cs, A7XX_HLSQ_CS_CNTL_1( + .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs, + /* A7XX TODO: blob either sets all of these unknowns + * together or doesn't set them at all. + */ + .unk11 = true, .unk22 = true, .yalign = yalign, )); + + tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64)); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1); + tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + + tu_cs_emit_regs(cs, + A7XX_SP_CS_CNTL_1( + .linearlocalidregid = regid(63, 0), + .threadsize = thrsz_cs, + /* A7XX TODO: enable UNK15 when we don't use subgroup ops. 
*/ + .unk15 = false, )); + + tu_cs_emit_regs( + cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1, + .localsizey = v->local_size[1] - 1, + .localsizez = v->local_size[2] - 1, )); + + tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000 + tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9C5(0)); // Sometimes is 0x00000401 } } @@ -1137,6 +1191,7 @@ tu6_emit_vpc_varying_modes(struct tu_cs *cs, } } +template void tu6_emit_vpc(struct tu_cs *cs, const struct ir3_shader_variant *vs, @@ -1421,11 +1476,13 @@ tu6_emit_vpc(struct tu_cs *cs, A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations)); - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1); - tu_cs_emit(cs, 0xff); + if (CHIP == A6XX) { + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1); + tu_cs_emit(cs, 0xff); - tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); + } uint32_t prim_size = prev_stage_output_size; if (prim_size > 64) @@ -1438,6 +1495,7 @@ tu6_emit_vpc(struct tu_cs *cs, tu6_emit_vpc_varying_modes(cs, fs, last_shader); } +TU_GENX(tu6_emit_vpc); static enum a6xx_tex_prefetch_cmd tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc) @@ -1450,6 +1508,7 @@ tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc) } } +template void tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) { @@ -1476,21 +1535,21 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | + COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) | + COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) | COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]), A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) | 
COND(fs->prefetch_end_of_quad, A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD)); for (int i = 0; i < fs->num_sampler_prefetch; i++) { const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | - A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | - A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | - A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | - A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | - COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | - COND(prefetch->bindless, A6XX_SP_FS_PREFETCH_CMD_BINDLESS) | - A6XX_SP_FS_PREFETCH_CMD_CMD( - tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc))); + tu_cs_emit( + cs, SP_FS_PREFETCH_CMD( + CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id, + .tex_id = prefetch->tex_id, .dst = prefetch->dst, + .wrmask = prefetch->wrmask, .half = prefetch->half_precision, + .bindless = prefetch->bindless, + .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value); } if (fs->num_sampler_prefetch > 0) { @@ -1503,27 +1562,26 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) } } - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD( - cs->device->physical_device->info->a6xx.prim_alloc_threshold)); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | - A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW])); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | - A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | - 
A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | - A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); - tu_cs_emit(cs, 0xfcfc); + tu_cs_emit_regs(cs, + HLSQ_CONTROL_1_REG(CHIP, + .primallocthreshold = + cs->device->physical_device->info->a6xx.prim_alloc_threshold), + HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid, + .sampleid = samp_id_regid, + .samplemask = smask_in_regid, + .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]), + HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL], + .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL], + .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID], + .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]), + HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE], + .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE], + .xycoordregid = coord_regid, + .zwcoordregid = zwcoord_regid), + HLSQ_CONTROL_5_REG(CHIP, .dword = 0xfcfc), ); enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64; - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); - tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) | - COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); + tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings)); bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; bool need_size_persamp = false; @@ -1579,6 +1637,7 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1); tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); } +TU_GENX(tu6_emit_fs_inputs) static void tu6_emit_fs_outputs(struct tu_cs *cs, @@ -1694,7 +1753,8 @@ static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS, }; -static unsigned +template +static unsigned tu6_patch_control_points_size(struct tu_device *dev, const struct tu_pipeline *pipeline, uint32_t patch_control_points) @@ -1705,6 +1765,7 @@ 
tu6_patch_control_points_size(struct tu_device *dev, #undef EMIT_CONST_DWORDS } +template void tu6_emit_patch_control_points(struct tu_cs *cs, const struct tu_pipeline *pipeline, @@ -1834,6 +1895,7 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs, } } +template static void tu6_emit_program_config(struct tu_cs *cs, struct tu_pipeline_builder *builder) @@ -1842,9 +1904,9 @@ tu6_emit_program_config(struct tu_cs *cs, bool shared_consts_enable = tu6_shared_constants_enable(&builder->layout, builder->device->compiler); - tu6_emit_shared_consts_enable(cs, shared_consts_enable); + tu6_emit_shared_consts_enable(cs, shared_consts_enable); - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true, .ds_state = true, @@ -1855,10 +1917,11 @@ tu6_emit_program_config(struct tu_cs *cs, for (size_t stage_idx = MESA_SHADER_VERTEX; stage_idx < ARRAY_SIZE(builder->shader_iova); stage_idx++) { gl_shader_stage stage = (gl_shader_stage) stage_idx; - tu6_emit_xs_config(cs, stage, builder->variants[stage]); + tu6_emit_xs_config(cs, stage, builder->variants[stage]); } } +template static void tu6_emit_program(struct tu_cs *cs, struct tu_pipeline_builder *builder, @@ -1879,7 +1942,7 @@ tu6_emit_program(struct tu_cs *cs, */ if (binning_pass && !gs) { vs = bs; - tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova); + tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova); tu6_emit_dynamic_offset(cs, bs, builder); stage = (gl_shader_stage) (stage + 1); } @@ -1891,7 +1954,7 @@ tu6_emit_program(struct tu_cs *cs, if (stage == MESA_SHADER_FRAGMENT && binning_pass) fs = xs = NULL; - tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]); + tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]); tu6_emit_dynamic_offset(cs, xs, builder); } @@ -1929,16 +1992,22 @@ tu6_emit_program(struct tu_cs *cs, tu6_emit_vfd_dest(cs, vs); - tu6_emit_vpc(cs, vs, hs, ds, gs, fs); 
+ tu6_emit_vpc(cs, vs, hs, ds, gs, fs); + + if (CHIP >= A7XX) { + tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.unk0 = 0x2, .unk8 = 1)); + tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2)); + tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false)); + } if (fs) { - tu6_emit_fs_inputs(cs, fs); + tu6_emit_fs_inputs(cs, fs); tu6_emit_fs_outputs(cs, fs, pipeline); pipeline->program.per_samp = fs->per_samp || fs->key.sample_shading; } else { /* TODO: check if these can be skipped if fs is disabled */ struct ir3_shader_variant dummy_variant = {}; - tu6_emit_fs_inputs(cs, &dummy_variant); + tu6_emit_fs_inputs(cs, &dummy_variant); tu6_emit_fs_outputs(cs, &dummy_variant, NULL); } @@ -3299,6 +3368,7 @@ tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, link->constlen = v->constlen; } +template static void tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) @@ -3317,15 +3387,15 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, * and draw passes. 
*/ tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); - tu6_emit_program_config(&prog_cs, builder); + tu6_emit_program_config(&prog_cs, builder); pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); - tu6_emit_program(&prog_cs, builder, false, pipeline); + tu6_emit_program(&prog_cs, builder, false, pipeline); pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); - tu6_emit_program(&prog_cs, builder, true, pipeline); + tu6_emit_program(&prog_cs, builder, true, pipeline); pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) { @@ -3410,6 +3480,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = { MESA_VK_DYNAMIC_VI, }; +template static unsigned tu6_vertex_input_size(struct tu_device *dev, const struct vk_vertex_input_state *vi) @@ -3417,6 +3488,7 @@ tu6_vertex_input_size(struct tu_device *dev, return 1 + 2 * util_last_bit(vi->attributes_valid); } +template static void tu6_emit_vertex_input(struct tu_cs *cs, const struct vk_vertex_input_state *vi) @@ -3455,6 +3527,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = { MESA_VK_DYNAMIC_VI_BINDING_STRIDES, }; +template static unsigned tu6_vertex_stride_size(struct tu_device *dev, const struct vk_vertex_input_state *vi) @@ -3462,6 +3535,7 @@ tu6_vertex_stride_size(struct tu_device *dev, return 1 + 2 * util_last_bit(vi->bindings_valid); } +template static void tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi) { @@ -3475,6 +3549,7 @@ tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi) } } +template static unsigned tu6_vertex_stride_size_dyn(struct tu_device *dev, const uint16_t 
*vi_binding_stride, @@ -3483,6 +3558,7 @@ tu6_vertex_stride_size_dyn(struct tu_device *dev, return 1 + 2 * util_last_bit(bindings_valid); } +template static void tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride, uint32_t bindings_valid) @@ -3503,6 +3579,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = { MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE, }; +template static unsigned tu6_viewport_size(struct tu_device *dev, const struct vk_viewport_state *vp) { @@ -3510,6 +3587,7 @@ tu6_viewport_size(struct tu_device *dev, const struct vk_viewport_state *vp) 1 + vp->viewport_count * 2 + 5; } +template static void tu6_emit_viewport(struct tu_cs *cs, const struct vk_viewport_state *vp) { @@ -3682,7 +3760,7 @@ fdm_apply_viewports(struct tu_cs *cs, void *data, VkRect2D bin, unsigned views, vp.viewports[i].y = scale_y * viewport.y + offset.y; } - tu6_emit_viewport(cs, &vp); + TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp); } static void @@ -3696,7 +3774,7 @@ tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd, }; if (!state.share_scale) state.vp.viewport_count = num_views; - unsigned size = tu6_viewport_size(cmd->device, &state.vp); + unsigned size = TU_CALLX(cs->device, tu6_viewport_size)(cmd->device, &state.vp); tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs); tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_viewports, state); } @@ -3706,12 +3784,14 @@ static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = { MESA_VK_DYNAMIC_VP_SCISSOR_COUNT, }; +template static unsigned tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp) { return 1 + vp->scissor_count * 2; } +template void tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp) { @@ -3791,7 +3871,7 @@ fdm_apply_scissors(struct tu_cs *cs, void *data, VkRect2D bin, unsigned views, MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y; } - tu6_emit_scissor(cs, &vp); + 
TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp); } static void @@ -3805,7 +3885,7 @@ tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd, }; if (!state.share_scale) state.vp.scissor_count = num_views; - unsigned size = tu6_scissor_size(cmd->device, &state.vp); + unsigned size = TU_CALLX(cs->device, tu6_scissor_size)(cmd->device, &state.vp); tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs); tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_scissors, state); } @@ -3814,12 +3894,14 @@ static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_enable_stat MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE, }; +template static unsigned tu6_sample_locations_enable_size(struct tu_device *dev, bool enable) { return 6; } +template void tu6_emit_sample_locations_enable(struct tu_cs *cs, bool enable) { @@ -3840,6 +3922,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = { MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS, }; +template static unsigned tu6_sample_locations_size(struct tu_device *dev, const struct vk_sample_locations_state *samp_loc) @@ -3847,6 +3930,7 @@ tu6_sample_locations_size(struct tu_device *dev, return 6; } +template void tu6_emit_sample_locations(struct tu_cs *cs, const struct vk_sample_locations_state *samp_loc) { @@ -3892,6 +3976,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = { MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS, }; +template static unsigned tu6_depth_bias_size(struct tu_device *dev, const struct vk_rasterization_state *rs) @@ -3899,6 +3984,7 @@ tu6_depth_bias_size(struct tu_device *dev, return 4; } +template void tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs) { @@ -4024,6 +4110,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = { MESA_VK_DYNAMIC_MS_SAMPLE_MASK, }; +template static unsigned tu6_blend_size(struct tu_device *dev, const struct vk_color_blend_state *cb, @@ -4036,6 +4123,7 @@ tu6_blend_size(struct tu_device *dev, 
return 8 + 3 * num_rts; } +template static void tu6_emit_blend(struct tu_cs *cs, const struct vk_color_blend_state *cb, @@ -4125,6 +4213,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = { MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS, }; +template static unsigned tu6_blend_constants_size(struct tu_device *dev, const struct vk_color_blend_state *cb) @@ -4132,6 +4221,7 @@ tu6_blend_constants_size(struct tu_device *dev, return 5; } +template static void tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb) { @@ -4150,6 +4240,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = { MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE, }; +template uint32_t tu6_rast_size(struct tu_device *dev, const struct vk_rasterization_state *rs, @@ -4157,9 +4248,10 @@ tu6_rast_size(struct tu_device *dev, bool multiview, bool per_view_viewport) { - return 11 + (dev->physical_device->info->a6xx.has_shading_rate ? 8 : 0); + return 11 + (CHIP == A6XX && dev->physical_device->info->a6xx.has_shading_rate ? 8 : 0); } +template void tu6_emit_rast(struct tu_cs *cs, const struct vk_rasterization_state *rs, @@ -4184,7 +4276,7 @@ tu6_emit_rast(struct tu_cs *cs, bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs); - tu_cs_emit_regs(cs, + tu_cs_emit_regs(cs, A6XX_GRAS_CL_CNTL( .znear_clip_disable = !depth_clip_enable, .zfar_clip_disable = !depth_clip_enable, @@ -4198,14 +4290,14 @@ tu6_emit_rast(struct tu_cs *cs, A6XX_VPC_POLYGON_MODE(polygon_mode)); tu_cs_emit_regs(cs, - A6XX_PC_POLYGON_MODE(polygon_mode)); + PC_POLYGON_MODE(CHIP, polygon_mode)); /* move to hw ctx init? 
*/ tu_cs_emit_regs(cs, A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f), A6XX_GRAS_SU_POINT_SIZE(1.0f)); - if (cs->device->physical_device->info->a6xx.has_shading_rate) { + if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_shading_rate) { tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A00()); tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A10()); tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A20()); @@ -4218,22 +4310,26 @@ static const enum mesa_vk_dynamic_graphics_state tu_pc_raster_cntl_state[] = { MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM, }; +template static unsigned tu6_pc_raster_cntl_size(struct tu_device *dev, const struct vk_rasterization_state *rs) { - return 4; + return CHIP == A6XX ? 4 : 2; } +template static void tu6_emit_pc_raster_cntl(struct tu_cs *cs, const struct vk_rasterization_state *rs) { - tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL( + tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP, .stream = rs->rasterization_stream, .discard = rs->rasterizer_discard_enable)); - tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107( - .raster_discard = rs->rasterizer_discard_enable)); + if (CHIP == A6XX) { + tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107( + .raster_discard = rs->rasterizer_discard_enable)); + } } static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = { @@ -4246,6 +4342,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = { MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE, }; +template static unsigned tu6_ds_size(struct tu_device *dev, const struct vk_depth_stencil_state *ds, @@ -4255,6 +4352,7 @@ tu6_ds_size(struct tu_device *dev, return 4; } +template static void tu6_emit_ds(struct tu_cs *cs, const struct vk_depth_stencil_state *ds, @@ -4308,6 +4406,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_depth_bounds_state[] = { MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS, }; +template static unsigned tu6_depth_bounds_size(struct tu_device *dev, const struct vk_depth_stencil_state *ds) @@ -4315,6 +4414,7 @@ tu6_depth_bounds_size(struct tu_device *dev, 
return 3; } +template static void tu6_emit_depth_bounds(struct tu_cs *cs, const struct vk_depth_stencil_state *ds) @@ -4328,6 +4428,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_stencil_compare_mask_state[] MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK, }; +template static unsigned tu6_stencil_compare_mask_size(struct tu_device *dev, const struct vk_depth_stencil_state *ds) @@ -4335,6 +4436,7 @@ tu6_stencil_compare_mask_size(struct tu_device *dev, return 2; } +template static void tu6_emit_stencil_compare_mask(struct tu_cs *cs, const struct vk_depth_stencil_state *ds) @@ -4348,6 +4450,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_stencil_write_mask_state[] = MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK, }; +template static unsigned tu6_stencil_write_mask_size(struct tu_device *dev, const struct vk_depth_stencil_state *ds) @@ -4355,6 +4458,7 @@ tu6_stencil_write_mask_size(struct tu_device *dev, return 2; } +template static void tu6_emit_stencil_write_mask(struct tu_cs *cs, const struct vk_depth_stencil_state *ds) @@ -4368,6 +4472,7 @@ static const enum mesa_vk_dynamic_graphics_state tu_stencil_reference_state[] = MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE, }; +template static unsigned tu6_stencil_reference_size(struct tu_device *dev, const struct vk_depth_stencil_state *ds) @@ -4375,6 +4480,7 @@ tu6_stencil_reference_size(struct tu_device *dev, return 2; } +template static void tu6_emit_stencil_reference(struct tu_cs *cs, const struct vk_depth_stencil_state *ds) @@ -4425,6 +4531,7 @@ emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove, return true; } +template static void tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) @@ -4442,10 +4549,10 @@ tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder, #define DRAW_STATE_COND(name, id, extra_cond, ...) 
\ if (EMIT_STATE(name, extra_cond)) { \ - unsigned size = tu6_##name##_size(builder->device, __VA_ARGS__); \ + unsigned size = tu6_##name##_size(builder->device, __VA_ARGS__); \ if (size > 0) { \ tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); \ - tu6_emit_##name(&cs, __VA_ARGS__); \ + tu6_emit_##name(&cs, __VA_ARGS__); \ pipeline->dynamic_state[id] = \ tu_cs_end_draw_state(&pipeline->cs, &cs); \ } \ @@ -4610,11 +4717,12 @@ emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state, return !BITSET_IS_EMPTY(temp); } +template uint32_t tu_emit_draw_state(struct tu_cmd_buffer *cmd) { struct tu_cs cs; - uint32_t dirty_draw_states = 0; + uint32_t dirty_draw_states = 0; #define EMIT_STATE(name) \ emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state, \ @@ -4622,10 +4730,10 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd) #define DRAW_STATE_COND(name, id, extra_cond, ...) \ if ((EMIT_STATE(name) || extra_cond) && \ !(cmd->state.pipeline->base.set_state_mask & (1u << id))) { \ - unsigned size = tu6_##name##_size(cmd->device, __VA_ARGS__); \ + unsigned size = tu6_##name##_size(cmd->device, __VA_ARGS__); \ if (size > 0) { \ tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \ - tu6_emit_##name(&cs, __VA_ARGS__); \ + tu6_emit_##name(&cs, __VA_ARGS__); \ cmd->state.dynamic_state[id] = \ tu_cs_end_draw_state(&cmd->sub_cs, &cs); \ } else { \ @@ -4643,19 +4751,19 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd) cmd->state.dynamic_state[id] = \ tu_cs_end_draw_state(&cmd->sub_cs, &cs); \ } else { \ - unsigned size = tu6_##name##_size(cmd->device, __VA_ARGS__); \ + unsigned size = tu6_##name##_size(cmd->device, __VA_ARGS__); \ if (size > 0) { \ tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \ - tu6_emit_##name(&cs, __VA_ARGS__); \ + tu6_emit_##name(&cs, __VA_ARGS__); \ cmd->state.dynamic_state[id] = \ tu_cs_end_draw_state(&cmd->sub_cs, &cs); \ } else { \ cmd->state.dynamic_state[id] = {}; \ } \ tu_cs_begin_sub_stream(&cmd->sub_cs, \ - 
tu6_##name##_size(cmd->device, __VA_ARGS__), \ + tu6_##name##_size(cmd->device, __VA_ARGS__), \ &cs); \ - tu6_emit_##name(&cs, __VA_ARGS__); \ + tu6_emit_##name(&cs, __VA_ARGS__); \ cmd->state.dynamic_state[id] = \ tu_cs_end_draw_state(&cmd->sub_cs, &cs); \ } \ @@ -4747,6 +4855,7 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd) return dirty_draw_states; } +TU_GENX(tu_emit_draw_state); static void tu_pipeline_builder_parse_depth_stencil( @@ -4928,6 +5037,7 @@ vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage) } } +template static VkResult tu_pipeline_builder_build(struct tu_pipeline_builder *builder, struct tu_pipeline **pipeline) @@ -5031,8 +5141,14 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, return result; } - tu_pipeline_builder_parse_shader_stages(builder, *pipeline); - tu6_emit_load_state(*pipeline, &builder->layout); + tu_pipeline_builder_parse_shader_stages(builder, *pipeline); + + if (CHIP == A6XX) { + /* Blob doesn't preload state on A7XX, likely preloading either + * doesn't work or doesn't provide benefits. 
+ */ + tu6_emit_load_state(*pipeline, &builder->layout); + } } if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { @@ -5050,7 +5166,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, tu_pipeline_builder_parse_rasterization_order(builder, *pipeline); } - tu_pipeline_builder_emit_state(builder, *pipeline); + tu_pipeline_builder_emit_state(builder, *pipeline); if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) { struct tu_graphics_lib_pipeline *library = @@ -5292,6 +5408,7 @@ tu_pipeline_builder_init_graphics( } } +template static VkResult tu_graphics_pipeline_create(VkDevice device, VkPipelineCache pipelineCache, @@ -5309,7 +5426,7 @@ tu_graphics_pipeline_create(VkDevice device, pCreateInfo, pAllocator); struct tu_pipeline *pipeline = NULL; - VkResult result = tu_pipeline_builder_build(&builder, &pipeline); + VkResult result = tu_pipeline_builder_build(&builder, &pipeline); tu_pipeline_builder_finish(&builder); if (result == VK_SUCCESS) @@ -5320,6 +5437,7 @@ tu_graphics_pipeline_create(VkDevice device, return result; } +template VKAPI_ATTR VkResult VKAPI_CALL tu_CreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, @@ -5333,7 +5451,7 @@ tu_CreateGraphicsPipelines(VkDevice device, uint32_t i = 0; for (; i < count; i++) { - VkResult result = tu_graphics_pipeline_create(device, pipelineCache, + VkResult result = tu_graphics_pipeline_create(device, pipelineCache, &pCreateInfos[i], pAllocator, &pPipelines[i]); @@ -5352,7 +5470,9 @@ tu_CreateGraphicsPipelines(VkDevice device, return final_result; } +TU_GENX(tu_CreateGraphicsPipelines); +template static VkResult tu_compute_pipeline_create(VkDevice device, VkPipelineCache pipelineCache, @@ -5508,10 +5628,12 @@ tu_compute_pipeline_create(VkDevice device, struct tu_cs prog_cs; additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v); tu_cs_begin_sub_stream(&pipeline->base.cs, 64 + additional_reserve_size, &prog_cs); - tu6_emit_cs_config(&prog_cs, v, &pvtmem, 
shader_iova); + tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova); pipeline->base.program.state = tu_cs_end_draw_state(&pipeline->base.cs, &prog_cs); - tu6_emit_load_state(&pipeline->base, layout); + if (CHIP == A6XX) { + tu6_emit_load_state(&pipeline->base, layout); + } tu_append_executable(&pipeline->base, v, nir_initial_disasm); @@ -5535,6 +5657,7 @@ fail: return result; } +template VKAPI_ATTR VkResult VKAPI_CALL tu_CreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, @@ -5548,7 +5671,7 @@ tu_CreateComputePipelines(VkDevice device, uint32_t i = 0; for (; i < count; i++) { - VkResult result = tu_compute_pipeline_create(device, pipelineCache, + VkResult result = tu_compute_pipeline_create(device, pipelineCache, &pCreateInfos[i], pAllocator, &pPipelines[i]); if (result != VK_SUCCESS) { @@ -5566,6 +5689,7 @@ tu_CreateComputePipelines(VkDevice device, return final_result; } +TU_GENX(tu_CreateComputePipelines); VKAPI_ATTR void VKAPI_CALL tu_DestroyPipeline(VkDevice _device, diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h index d860ef78b09..8f9dce600d6 100644 --- a/src/freedreno/vulkan/tu_pipeline.h +++ b/src/freedreno/vulkan/tu_pipeline.h @@ -263,6 +263,7 @@ TU_DECL_PIPELINE_DOWNCAST(compute, TU_PIPELINE_COMPUTE) VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin); +template uint32_t tu_emit_draw_state(struct tu_cmd_buffer *cmd); struct tu_pvtmem_config { @@ -272,11 +273,13 @@ struct tu_pvtmem_config { bool per_wave; }; +template void tu6_emit_xs_config(struct tu_cs *cs, gl_shader_stage stage, const struct ir3_shader_variant *xs); +template void tu6_emit_xs(struct tu_cs *cs, gl_shader_stage stage, @@ -284,6 +287,7 @@ tu6_emit_xs(struct tu_cs *cs, const struct tu_pvtmem_config *pvtmem, uint64_t binary_iova); +template void tu6_emit_vpc(struct tu_cs *cs, const struct ir3_shader_variant *vs, @@ -292,6 +296,7 @@ tu6_emit_vpc(struct tu_cs *cs, const struct ir3_shader_variant *gs, const struct 
ir3_shader_variant *fs); +template void tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); diff --git a/src/freedreno/vulkan/tu_query.cc b/src/freedreno/vulkan/tu_query.cc index 03954abd98f..c26a2a2b2d1 100644 --- a/src/freedreno/vulkan/tu_query.cc +++ b/src/freedreno/vulkan/tu_query.cc @@ -19,6 +19,8 @@ #include "tu_cs.h" #include "tu_device.h" +#include "common/freedreno_gpu_event.h" + #define NSEC_PER_SEC 1000000000ull #define WAIT_TIMEOUT 5 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1) @@ -603,6 +605,7 @@ copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_qw(cs, src_iova); } +template static void emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs, @@ -615,7 +618,7 @@ emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf, VkQueryResultFlags flags) { /* Flush cache for the buffer to copy to. */ - tu_emit_cache_flush(cmdbuf); + tu_emit_cache_flush(cmdbuf); /* From the Vulkan 1.1.130 spec: * @@ -697,6 +700,7 @@ emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf, } } +template VKAPI_ATTR void VKAPI_CALL tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -719,14 +723,16 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: case VK_QUERY_TYPE_PIPELINE_STATISTICS: - return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery, - queryCount, buffer, dstOffset, stride, flags); + return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery, + queryCount, buffer, dstOffset, + stride, flags); case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: unreachable("allowCommandBufferQueryCopies is false"); default: assert(!"Invalid query type"); } } +TU_GENX(tu_CmdCopyQueryPoolResults); static void emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf, @@ -847,6 +853,7 @@ emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf, tu_cs_emit(cs, 
ZPASS_DONE); } +template static void emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -868,7 +875,7 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, CP_COND_REG_EXEC_0_BINNING); } - tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_START_PRIMITIVE_CTRS); tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); @@ -880,11 +887,11 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, } if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_START_FRAGMENT_CTRS); } if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_START_COMPUTE_CTRS); } tu_cs_emit_wfi(cs); @@ -985,6 +992,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf, tu_cond_exec_end(cs); } +template static void emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -995,9 +1003,10 @@ emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf, uint64_t begin_iova = primitive_query_iova(pool, query, begin, 0, 0); tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova)); - tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS); + tu_emit_event_write(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS); } +template static void emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -1023,7 +1032,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, CP_COND_REG_EXEC_0_BINNING); } - tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_START_PRIMITIVE_CTRS); tu_cs_emit_wfi(cs); @@ -1038,6 +1047,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, } } +template VKAPI_ATTR void VKAPI_CALL tu_CmdBeginQuery(VkCommandBuffer commandBuffer, 
VkQueryPool queryPool, @@ -1057,16 +1067,16 @@ tu_CmdBeginQuery(VkCommandBuffer commandBuffer, emit_begin_occlusion_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - emit_begin_xfb_query(cmdbuf, pool, query, 0); + emit_begin_xfb_query(cmdbuf, pool, query, 0); break; case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_begin_prim_generated_query(cmdbuf, pool, query); + emit_begin_prim_generated_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: emit_begin_perf_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: - emit_begin_stat_query(cmdbuf, pool, query); + emit_begin_stat_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_TIMESTAMP: unreachable("Unimplemented query type"); @@ -1074,7 +1084,9 @@ tu_CmdBeginQuery(VkCommandBuffer commandBuffer, assert(!"Invalid query type"); } } +TU_GENX(tu_CmdBeginQuery); +template VKAPI_ATTR void VKAPI_CALL tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -1088,15 +1100,16 @@ tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, switch (pool->type) { case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - emit_begin_xfb_query(cmdbuf, pool, query, index); + emit_begin_xfb_query(cmdbuf, pool, query, index); break; case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_begin_prim_generated_query(cmdbuf, pool, query); + emit_begin_prim_generated_query(cmdbuf, pool, query); break; default: assert(!"Invalid query type"); } } +TU_GENX(tu_CmdBeginQueryIndexedEXT); static void emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf, @@ -1181,6 +1194,7 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf, * query inside of secondary cmd buffer - for such case we ought to track * the status of pipeline stats query. 
*/ +template static void emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs, @@ -1195,7 +1209,7 @@ emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics); if (!need_cond_exec) { - tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS); } else { tu_cs_reserve(cs, 7 + 2); /* Check that pipeline stats query is not running, only then @@ -1207,7 +1221,7 @@ emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */ - tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS); } } @@ -1218,6 +1232,7 @@ emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, } } +template static void emit_end_stat_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -1235,15 +1250,15 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf, * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a * renderpass, because it is already stopped. 
*/ - emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS); + emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS); } if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_STOP_FRAGMENT_CTRS); } if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS); + tu_emit_event_write(cmdbuf, cs, FD_STOP_COMPUTE_CTRS); } tu_cs_emit_wfi(cs); @@ -1360,6 +1375,7 @@ emit_end_perf_query(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_qw(cs, 0x1); } +template static void emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -1378,10 +1394,10 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf, uint64_t available_iova = query_available_iova(pool, query); tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova)); - tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS); + tu_emit_event_write(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS); tu_cs_emit_wfi(cs); - tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS); + tu_emit_event_write(cmdbuf, cs, FD_CACHE_FLUSH); /* Set the count of written primitives */ tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); @@ -1392,7 +1408,7 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_qw(cs, end_written_iova); tu_cs_emit_qw(cs, begin_written_iova); - tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS); + tu_emit_event_write(cmdbuf, cs, FD_CACHE_FLUSH); /* Set the count of generated primitives */ tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); @@ -1409,6 +1425,7 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_qw(cs, 0x1); } +template static void emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -1452,7 +1469,7 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf, /* Should be after waiting for mem writes to have up to date info * about which query is running. 
*/ - emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); + emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); if (cmdbuf->state.pass) { tu_cond_exec_end(cs); @@ -1505,6 +1522,7 @@ handle_multiview_queries(struct tu_cmd_buffer *cmd, } } +template VKAPI_ATTR void VKAPI_CALL tu_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -1519,16 +1537,16 @@ tu_CmdEndQuery(VkCommandBuffer commandBuffer, emit_end_occlusion_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - emit_end_xfb_query(cmdbuf, pool, query, 0); + emit_end_xfb_query(cmdbuf, pool, query, 0); break; case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_end_prim_generated_query(cmdbuf, pool, query); + emit_end_prim_generated_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: emit_end_perf_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: - emit_end_stat_query(cmdbuf, pool, query); + emit_end_stat_query(cmdbuf, pool, query); break; case VK_QUERY_TYPE_TIMESTAMP: unreachable("Unimplemented query type"); @@ -1538,7 +1556,9 @@ tu_CmdEndQuery(VkCommandBuffer commandBuffer, handle_multiview_queries(cmdbuf, pool, query); } +TU_GENX(tu_CmdEndQuery); +template VKAPI_ATTR void VKAPI_CALL tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -1552,15 +1572,16 @@ tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, switch (pool->type) { case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: assert(index <= 4); - emit_end_xfb_query(cmdbuf, pool, query, index); + emit_end_xfb_query(cmdbuf, pool, query, index); break; case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_end_prim_generated_query(cmdbuf, pool, query); + emit_end_prim_generated_query(cmdbuf, pool, query); break; default: assert(!"Invalid query type"); } } +TU_GENX(tu_CmdEndQueryIndexedEXT); VKAPI_ATTR void VKAPI_CALL tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,