tu/a7xx: Support concurrent binning

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36590>
Connor Abbott 2025-05-22 12:54:42 -04:00 committed by Marge Bot
parent 4ac666eaa7
commit ea24dce5e3
19 changed files with 1061 additions and 98 deletions


@ -7,6 +7,7 @@ tests_per_group = 10000
# force-gmem testing
# Autotuner forces sysmem on most CTS tests
# Also force-enable concurrent binning.
[[deqp]]
deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk"
caselists = ["/deqp-vk/mustpass/vk-main.txt"]
@ -14,7 +15,7 @@ prefix = "gmem-"
fraction = 20
tests_per_group = 10000
[deqp.env]
TU_DEBUG = "gmem,forcebin"
TU_DEBUG = "gmem,forcebin,forcecb"
# force-gmem with unaligned gmem store testing
[[deqp]]


@ -42,6 +42,7 @@ enum fd_gpu_event : uint32_t {
FD_CCU_CLEAN_DEPTH,
FD_CCU_CLEAN_COLOR,
FD_LRZ_CLEAR,
FD_LRZ_FLIP,
FD_LRZ_FLUSH,
FD_LRZ_INVALIDATE,
FD_VSC_BINNING_START,
@ -84,6 +85,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events<A6XX>[FD_GPU_EVENT_MAX]
{PC_CCU_FLUSH_DEPTH_TS, true}, /* FD_CCU_CLEAN_DEPTH */
{PC_CCU_FLUSH_COLOR_TS, true}, /* FD_CCU_CLEAN_COLOR */
{LRZ_CLEAR, false}, /* FD_LRZ_CLEAR */
{LRZ_FLUSH, false}, /* FD_LRZ_FLIP */
{LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
{LRZ_CACHE_INVALIDATE, false}, /* FD_LRZ_INVALIDATE */
{VSC_BINNING_START, false}, /* FD_VSC_BINNING_START */
@ -115,6 +117,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events<A7XX>[FD_GPU_EVENT_MAX]
{CCU_CLEAN_DEPTH, false}, /* FD_CCU_CLEAN_DEPTH */
{CCU_CLEAN_COLOR, false}, /* FD_CCU_CLEAN_COLOR */
{LRZ_CLEAR, false}, /* FD_LRZ_CLEAR */
{LRZ_FLIP_BUFFER, false}, /* FD_LRZ_FLIP */
{LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
{LRZ_CACHE_INVALIDATE, false}, /* FD_LRZ_INVALIDATE */
{VSC_BINNING_START, false}, /* FD_VSC_BINNING_START */


@ -80,7 +80,10 @@ fdl6_lrz_layout_init(struct fdl_lrz_layout *lrz_layout,
lrz_layout->lrz_fc_size = 0;
}
uint32_t lrz_size = lrz_layout->lrz_buffer_size;
/* Allocate 2 LRZ buffers for double-buffering on a7xx. */
uint32_t lrz_size = lrz_layout->lrz_buffer_size *
(dev_info->chip >= 7 ? 2 : 1);
if (dev_info->a6xx.enable_lrz_fast_clear ||
dev_info->a6xx.has_lrz_dir_tracking) {
lrz_layout->lrz_fc_offset =


@ -2133,18 +2133,22 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
*/
tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT);
ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
image->iova + image->lrz_layout.lrz_offset,
image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
(VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
ops->run(cmd, cs);
ops->teardown(cmd, cs);
const unsigned lrz_buffers = CHIP >= A7XX ? 2 : 1;
for (unsigned i = 0; i < lrz_buffers; i++) {
ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT);
ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
image->iova + image->lrz_layout.lrz_offset +
i * image->lrz_layout.lrz_buffer_size,
image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
(VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
ops->run(cmd, cs);
ops->teardown(cmd, cs);
}
/* Clearing writes via CCU color in the PS stage, and LRZ is read via
* UCHE in the earlier GRAS stage.
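For reference, the addressing that the clear loop above relies on can be shown with a minimal standalone C sketch; the struct and the concrete sizes below are hypothetical stand-ins for fdl_lrz_layout, only the offset arithmetic mirrors the driver code.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the relevant fdl_lrz_layout fields. */
struct lrz_layout_sketch {
   uint64_t lrz_offset;      /* start of the LRZ area inside the image BO */
   uint64_t lrz_buffer_size; /* size of one LRZ buffer */
};

/* a7xx lays the two LRZ buffers out back to back: buffer i starts at
 * lrz_offset + i * lrz_buffer_size, which is what the clear loop above
 * iterates over.
 */
static uint64_t lrz_buffer_iova(uint64_t image_iova,
                                const struct lrz_layout_sketch *l, unsigned i)
{
   return image_iova + l->lrz_offset + (uint64_t)i * l->lrz_buffer_size;
}

int main(void)
{
   struct lrz_layout_sketch l = { .lrz_offset = 0x1000, .lrz_buffer_size = 0x800 };
   for (unsigned i = 0; i < 2; i++)
      printf("LRZ buffer %u at 0x%" PRIx64 "\n", i, lrz_buffer_iova(0x100000, &l, i));
   return 0;
}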


@ -220,6 +220,7 @@ tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
uint32_t offset)
{
struct tu_vis_stream_patchpoint patchpoint = {
.render_pass_idx = cmd->state.tile_render_pass_count,
.data = cs->cur,
.iova = tu_cs_get_cur_iova(cs),
.offset = offset,
@ -339,12 +340,72 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}
static void
tu7_write_onchip_val(struct tu_cs *cs, enum tu_onchip_addr addr,
uint32_t val)
{
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
CP_EVENT_WRITE7_0_WRITE_ENABLED);
tu_cs_emit_qw(cs, addr);
tu_cs_emit(cs, val);
}
/* "Normal" cache flushes outside the renderpass, that don't require any special handling */
template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer)
{
tu6_emit_flushes<CHIP>(cmd_buffer, &cmd_buffer->cs, &cmd_buffer->state.cache);
struct tu_cs *cs = &cmd_buffer->cs;
struct tu_cache_state *cache = &cmd_buffer->state.cache;
BITMASK_ENUM(tu_cmd_flush_bits) flushes = cache->flush_bits;
tu6_emit_flushes<CHIP>(cmd_buffer, cs, cache);
if ((flushes & TU_CMD_FLAG_WAIT_FOR_BR) && CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1);
tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) |
CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL));
tu7_thread_control(cs, CP_SET_THREAD_BV);
tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
tu7_thread_control(cs, CP_SET_THREAD_BR);
/* Wait for the previous WAIT_FOR_BR to execute on BV and reset the wait
* value.
*/
tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
/* Signal the wait value. */
tu7_write_onchip_val(cs, TU_ONCHIP_BARRIER, 1);
tu7_thread_control(cs, CP_SET_THREAD_BV);
/* Wait for the value. Note that we must use CP_WAIT_REG_MEM due to a
* firmware bug which makes CP_WAIT_TIMESTAMP on BV deadlock with
* preemption when BV waits for BR. Without this bug the whole thing
* would be much, much simpler.
*/
tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 1);
/* Reset the wait value. */
tu7_write_onchip_val(cs, TU_ONCHIP_BARRIER, 0);
/* Resetting the wait value happens asynchronously (since it's an
* EVENT_WRITE), but waiting for it happens synchronously. We need to
* prevent BV from racing ahead to the next wait before it's reset.
*/
tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 0);
tu7_thread_control(cs, CP_SET_THREAD_BR);
}
}
TU_GENX(tu_emit_cache_flush);
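The TU_CMD_FLAG_WAIT_FOR_BR handling above boils down to a fixed per-pipe packet order; the following standalone sketch only prints that order (the step() helper is a stand-in, not driver API). The net effect is that BV, which normally runs ahead, stalls until BR has reached the same point in the command stream.

#include <stdio.h>

/* Stand-in that just records which pipe a packet targets and what it does. */
static void step(const char *pipe, const char *what)
{
   printf("%-4s %s\n", pipe, what);
}

int main(void)
{
   step("BOTH", "CP_MODIFY_TIMESTAMP: bump the local timestamp on BR and BV");
   step("BV",   "write TU_ONCHIP_CB_BV_TIMESTAMP");
   step("BR",   "wait for TU_ONCHIP_CB_BV_TIMESTAMP (BV reached this barrier, previous one reset)");
   step("BR",   "write TU_ONCHIP_BARRIER = 1 (signal)");
   step("BV",   "CP_WAIT_REG_MEM until TU_ONCHIP_BARRIER == 1");
   step("BV",   "write TU_ONCHIP_BARRIER = 0 (asynchronous reset)");
   step("BV",   "CP_WAIT_REG_MEM until TU_ONCHIP_BARRIER == 0 (don't race past the reset)");
   step("BR",   "continue");
   return 0;
}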
@ -356,8 +417,11 @@ tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer)
if (!cmd_buffer->state.renderpass_cache.flush_bits &&
likely(!tu_env.debug))
return;
tu6_emit_flushes<CHIP>(cmd_buffer, &cmd_buffer->draw_cs,
&cmd_buffer->state.renderpass_cache);
struct tu_cs *cs = &cmd_buffer->draw_cs;
struct tu_cache_state *cache = &cmd_buffer->state.renderpass_cache;
tu6_emit_flushes<CHIP>(cmd_buffer, cs, cache);
if (cmd_buffer->state.renderpass_cache.flush_bits &
TU_CMD_FLAG_BLIT_CACHE_CLEAN) {
cmd_buffer->state.blit_cache_cleaned = true;
@ -491,7 +555,7 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
(CHIP == A6XX ? TU_CMD_FLAG_WAIT_FOR_IDLE : 0));
}
tu6_emit_flushes<CHIP>(cmd_buffer, cs, &cmd_buffer->state.cache);
tu_emit_cache_flush<CHIP>(cmd_buffer);
if (ccu_state != cmd_buffer->state.ccu_state) {
emit_rb_ccu_cntl<CHIP>(cs, cmd_buffer->device,
@ -2116,6 +2180,25 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
if (CHIP >= A7XX) {
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
tu_cs_set_writeable(cs, true);
/* This sets how far BV is allowed to get ahead of BR when we do
* BV_WAIT_FOR_BR. By setting it based on the vis stream count we
* prevent write-after-read races with the vis stream.
*/
tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 2);
tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_SET_BR_OFFSET));
struct tu_vis_stream_patchpoint *patchpoint =
&cmd->vis_stream_count_patchpoint;
patchpoint->data = cs->cur;
patchpoint->iova = tu_cs_get_cur_iova(cs);
tu_cs_emit(cs, 1);
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
tu_cs_set_writeable(cs, false);
tu7_thread_control(cs, CP_SET_THREAD_BR);
}
@ -2137,6 +2220,10 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE));
tu7_thread_control(cs, CP_SET_THREAD_BOTH);
tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
(1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
(1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING));
}
tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
@ -2238,7 +2325,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
template <chip CHIP>
static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const VkOffset2D *fdm_offsets)
const VkOffset2D *fdm_offsets, bool use_cb)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
@ -2336,12 +2423,18 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_regs(cs,
A6XX_TPL1_WINDOW_OFFSET(.x = 0, .y = 0));
trace_start_binning_ib(&cmd->trace, cs, cmd);
if (use_cb)
trace_start_concurrent_binning_ib(&cmd->trace, cs, cmd);
else
trace_start_binning_ib(&cmd->trace, cs, cmd);
/* emit IB to binning drawcmds: */
tu_cs_emit_call(cs, &cmd->draw_cs);
trace_end_binning_ib(&cmd->trace, cs);
if (use_cb)
trace_end_concurrent_binning_ib(&cmd->trace, cs);
else
trace_end_binning_ib(&cmd->trace, cs);
/* switching from binning pass to GMEM pass will cause a switch from
* PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
@ -2667,6 +2760,46 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd)
cmd->state.fdm_enabled = cmd->state.pass->has_fdm;
}
static bool
tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
bool disable_cb)
{
if (disable_cb ||
/* LRZ can only be cleared via fast clear in BV. Disable CB if we can't
* use it.
*/
!cmd->state.lrz.fast_clear ||
TU_DEBUG(NO_CONCURRENT_BINNING)) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false);
return false;
}
tu7_thread_control(cs, CP_SET_THREAD_BOTH);
/* Increment timestamp to make it unique in subsequent commands */
tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1);
tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) |
CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL));
/* We initialize the "is concurrent binning enabled?" predicate to true and
* disable it later if necessary.
*/
tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, true);
tu7_thread_control(cs, CP_SET_THREAD_BV);
/* If there was an overflow in the BR resource table, the register will be
* set to 1 by CP_RESOURCE_LIST. Wait for it to clear here.
*/
tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0);
tu_lrz_cb_begin(cmd, cs);
return true;
}
template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
@ -2674,8 +2807,40 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
/* It seems that for sysmem render passes we have to use BV to clear LRZ
* before the renderpass. Otherwise the clear doesn't become visible to
* subsequent draws when LRZ has been flipped an odd number of times.
* Presumably this works if concurrent binning is disabled, because the
* blob relies on this, but that requires synchronizing BR and BV
* unnecessarily, and we want BV to skip ahead across sysmem renderpasses.
*
* In the future, we may also support writing LRZ in BV.
*/
bool concurrent_binning = false;
if (CHIP >= A7XX) {
concurrent_binning = tu7_emit_concurrent_binning(cmd, cs, false);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
}
tu_lrz_sysmem_begin<CHIP>(cmd, cs);
if (concurrent_binning) {
tu_lrz_after_bv<CHIP>(cmd, cs);
tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM7_BIN_VISIBILITY_END));
tu7_thread_control(cs, CP_SET_THREAD_BR);
tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
tu_lrz_before_sysmem_br<CHIP>(cmd, cs);
}
assert(fb->width > 0 && fb->height > 0);
tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
tu6_emit_window_offset<CHIP>(cs, 0, 0);
@ -2758,9 +2923,155 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_lrz_sysmem_end<CHIP>(cmd, cs);
/* Clear the resource list for any LRZ resources we emitted at the
* beginning.
*/
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE |
CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
CP_EVENT_WRITE7_0_WRITE_ENABLED);
tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW);
tu_cs_emit(cs, 0); /* value */
}
tu_cs_sanity_check(cs);
}
static void
tu7_write_and_wait_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr)
{
tu7_write_onchip_timestamp(cs, onchip_addr);
tu7_wait_onchip_timestamp(cs, onchip_addr);
}
static bool
tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
bool use_hw_binning)
{
/* xfb queries use data from the binning pass. If they are running outside
* of a RP then we may have to deal with a mix of GMEM/sysmem renderpasses
* where the counters increase on different processors. Just disable CB so
* that everything happens on BR and we don't need difficult merging of BV
* and BR results. In addition, RBBM primitive counters seem to not work
* at all with concurrent binning, so disable CB if they are running before
* the RP.
*/
bool disable_cb =
cmd->state.xfb_query_running_before_rp ||
cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.rp.has_vtx_stats_query_in_rp ||
cmd->state.prim_counters_running > 0;
if (!tu7_emit_concurrent_binning(cmd, cs, disable_cb || !use_hw_binning))
return false;
/* We want to disable concurrent binning if BV isn't far enough ahead of
* BR. The core idea is to write a timestamp in BR and BV, and compare the
* BR and BV timestamps for equality. If BR is fast enough, it will write
* the timestamp ahead of BV and then when BV compares for equality it will
* find them equal. BR cannot race too far ahead of BV because it must wait
* for BV's determination to finish, which we do via another timestamp, so
* either BV is ahead of BR or the timestamps are equal.
*
* We need to communicate the determination from BV to BR so they both
* agree on whether concurrent binning is enabled or not. The easiest way
* to do it is via a "when was concurrent binning last disabled" timestamp,
* because we only have to set it when disabling concurrent binning.
*/
if (!TU_DEBUG(FORCE_CONCURRENT_BINNING)) {
tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
tu7_thread_control(cs, CP_SET_THREAD_BR);
tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BR_TIMESTAMP);
tu7_thread_control(cs, CP_SET_THREAD_BV);
/* If in a secondary, dynamically disable CB if a vtx stats query is
* running.
*/
if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_VTX_STATS_RUNNING));
}
const uint32_t bv_cond_dwords = 3 + 4 + 4;
tu_cs_reserve(cs, 4 + bv_cond_dwords);
tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 3);
tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(REG_COMPARE) |
CP_COND_REG_EXEC_0_REG0(TU_ONCHIP_CB_BR_TIMESTAMP) |
CP_COND_REG_EXEC_0_ONCHIP_MEM);
tu_cs_emit(cs, REG_COMPARE_CP_COND_REG_EXEC_1_REG1(TU_ONCHIP_CB_BV_TIMESTAMP) |
REG_COMPARE_CP_COND_REG_EXEC_1_ONCHIP_MEM);
tu_cs_emit(cs, bv_cond_dwords);
if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
tu_cond_exec_end(cs);
/* if (BR_TIMESTAMP == BV_TIMESTAMP) */ {
tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP);
tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false);
}
tu7_write_onchip_timestamp(cs,
TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP);
tu7_thread_control(cs, CP_SET_THREAD_BR);
tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP);
const uint32_t br_cond_dwords = 4;
tu_cs_reserve(cs, 4 + br_cond_dwords);
tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 3);
tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(REG_COMPARE) |
CP_COND_REG_EXEC_0_REG0(TU_ONCHIP_CB_BR_TIMESTAMP) |
CP_COND_REG_EXEC_0_ONCHIP_MEM);
tu_cs_emit(cs, REG_COMPARE_CP_COND_REG_EXEC_1_REG1(TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP) |
REG_COMPARE_CP_COND_REG_EXEC_1_ONCHIP_MEM);
tu_cs_emit(cs, br_cond_dwords);
/* if (BR_TIMESTAMP == BV_DISABLED_TIMESTAMP) */ {
tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false);
}
}
/* At this point BV and BR agree on whether CB is enabled. If CB is
* enabled, set the thread to BV for the binning pass, otherwise set BR and
* disable concurrent binning.
*/
tu7_thread_control(cs, CP_SET_THREAD_BOTH);
const uint32_t if_dwords = 5;
const uint32_t else_dwords = 2;
tu_cs_reserve(cs, 3 + if_dwords + else_dwords);
tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED) |
CP_COND_REG_EXEC_0_SKIP_WAIT_FOR_ME);
tu_cs_emit(cs, if_dwords);
/* if (CB is enabled) */ {
tu7_thread_control(cs, CP_SET_THREAD_BV);
/* Wait for BR vis stream reads to finish */
tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 1);
tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_BV_WAIT_FOR_BR));
/* This is the NOP-as-else trick. If CB is disabled, this CP_NOP is
* skipped and its body (the else) is executed.
*/
tu_cs_emit_pkt7(cs, CP_NOP, else_dwords);
} /* else */ {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
}
return true;
}
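The timestamp handshake above can be summarized with a small host-side model of the decision logic; the struct and function names below are made up, and this only sketches who disables CB and how both pipes reach the same conclusion.

#include <stdbool.h>
#include <stdio.h>

/* Both pipes execute the same stream of CP_MODIFY_TIMESTAMP increments, so a
 * pipe's local timestamp encodes how far through the command stream it is.
 */
struct cb_state {
   unsigned br_ts;          /* BR's write of TU_ONCHIP_CB_BR_TIMESTAMP */
   unsigned bv_ts;          /* BV's write of TU_ONCHIP_CB_BV_TIMESTAMP */
   unsigned bv_disabled_ts; /* last point at which BV disabled CB */
};

/* "On BV": if BR has already caught up (equal timestamps), BV is not far
 * enough ahead and concurrent binning gets disabled for this render pass.
 */
static bool bv_keeps_cb(struct cb_state *s)
{
   bool keep = s->br_ts != s->bv_ts;
   if (!keep)
      s->bv_disabled_ts = s->bv_ts;
   return keep;
}

/* "On BR", after waiting for BV's determination to finish: BR reaches the
 * same conclusion by checking whether CB was disabled at exactly this point.
 */
static bool br_keeps_cb(const struct cb_state *s)
{
   return s->br_ts != s->bv_disabled_ts;
}

int main(void)
{
   struct cb_state s = { .br_ts = 7, .bv_ts = 9, .bv_disabled_ts = 3 };
   bool bv = bv_keeps_cb(&s);
   printf("BV well ahead: keep CB? BV=%d BR=%d\n", bv, br_keeps_cb(&s));

   s.br_ts = s.bv_ts = 10;
   bv = bv_keeps_cb(&s);
   printf("BR caught up:  keep CB? BV=%d BR=%d\n", bv, br_keeps_cb(&s));
   return 0;
}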
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
@ -2771,31 +3082,38 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
const struct tu_render_pass *pass = cmd->state.pass;
bool use_binning = use_hw_binning(cmd);
tu_lrz_tiling_begin<CHIP>(cmd, cs);
/* User flushes should always be executed on BR. */
tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
bool use_cb = false;
if (CHIP >= A7XX) {
tu7_emit_tile_render_begin_regs(cs);
use_cb = tu7_emit_concurrent_binning_gmem(cmd, cs, use_binning);
}
if (!use_cb)
tu_trace_start_render_pass(cmd);
tu_lrz_tiling_begin<CHIP>(cmd, cs);
/* tu_lrz_tiling_begin() can accumulate additional flushes. If that happens,
* CB should be disabled, so it's safe to just emit them here.
*/
tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
/* Reset bin scaling. */
if (phys_dev->info->a7xx.has_hw_bin_scaling) {
tu_cs_emit_regs(cs, A7XX_GRAS_BIN_FOVEAT());
tu_cs_emit_regs(cs, A7XX_RB_BIN_FOVEAT());
}
tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
}
if (use_hw_binning(cmd)) {
if (use_binning) {
if (!cmd->vsc_initialized) {
tu6_lazy_init_vsc(cmd);
}
@ -2833,7 +3151,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, cs, true);
tu6_emit_binning_pass<CHIP>(cmd, cs, fdm_offsets);
tu6_emit_binning_pass<CHIP>(cmd, cs, fdm_offsets, use_cb);
if (CHIP == A6XX) {
tu_cs_emit_regs(cs,
@ -2897,6 +3215,40 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
if (use_binning) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BV));
tu_lrz_after_bv<CHIP>(cmd, cs);
/* Signal that BV is done for this render pass. This always has to
* be executed, even when CB is dynamically disabled, because we
* need to keep BR and BV counts in sync with which visibility
* streams are in use.
*/
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
CP_EVENT_WRITE7_0_INC_BV_COUNT);
/* This mode seems to be only used by BV and signals that a
* simpler save/restore procedure can be used in between render
* passes.
*/
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM7_BIN_VISIBILITY_END));
}
tu7_thread_control(cs, CP_SET_THREAD_BR);
if (use_binning) {
/* Wait for the BV to be done for this render pass. */
tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 1);
tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_BR_WAIT_FOR_BV));
/* Emit vis stream on BR */
tu_emit_vsc<CHIP>(cmd, cs);
}
tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4);
tu_cs_emit(cs, num_vsc_pipes); /* count */
tu_cs_emit(cs, 0); /* offset */
@ -2906,8 +3258,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
if (CHIP >= A7XX &&
(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
tu_cs_set_writeable(cs, false);
} else if (CHIP >= A7XX) {
/* Earlier we disabled concurrent binning to make LRZ fast-clear work
* with no HW binning; now re-enable it while staying on BR.
*/
tu7_thread_control(cs, CP_SET_THREAD_BR);
}
tu_lrz_before_tiles<CHIP>(cmd, cs, use_cb);
if (use_cb)
tu_trace_start_render_pass(cmd);
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
tu_cs_sanity_check(cs);
@ -2982,12 +3344,30 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_lrz_tiling_end<CHIP>(cmd, cs);
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
if (CHIP >= A7XX) {
tu7_thread_control(cs, CP_SET_THREAD_BR);
bool hw_binning = use_hw_binning(cmd);
if (hw_binning) {
cmd->state.tile_render_pass_count++;
}
/* If we are using HW binning, signal that we are done with reading the vis
* stream for this render pass by advancing the counter. Also clear render
* resources, currently only used for LRZ, and reset the overflow onchip
* register.
*/
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
COND(hw_binning, CP_EVENT_WRITE7_0_INC_BR_COUNT) |
CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE |
CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
CP_EVENT_WRITE7_0_WRITE_ENABLED);
tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW);
tu_cs_emit(cs, 0); /* value */
}
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
tu_cs_sanity_check(cs);
}
@ -3354,8 +3734,6 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
tu_cs_end(&cmd->tile_store_cs);
tu_trace_start_render_pass(cmd);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
/* Note: we reverse the order of walking the pipes and tiles on every
@ -5370,7 +5748,8 @@ sanitize_dst_stage(VkPipelineStageFlags2 stage_mask)
}
static enum tu_stage
vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
vk2tu_single_stage(struct tu_device *dev,
VkPipelineStageFlags2 vk_stage, bool dst)
{
/* If the destination stage is executed on the CP, then the CP also has to
* wait for any WFI's to finish. This is already done for draw calls,
@ -5394,24 +5773,40 @@ vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT ||
vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT)
return TU_STAGE_CP;
return TU_STAGE_BV_CP;
if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)
return dst ? TU_STAGE_CP : TU_STAGE_GPU;
return dst ? TU_STAGE_BV_CP : TU_STAGE_BR;
if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT)
return dst ? TU_STAGE_BOTTOM : TU_STAGE_CP;
return dst ? TU_STAGE_BOTTOM : TU_STAGE_BV_CP;
return TU_STAGE_GPU;
if (dev->physical_device->info->chip >= 7) {
if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT ||
vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT ||
vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) {
return dst ? TU_STAGE_BV : TU_STAGE_BR;
}
}
return TU_STAGE_BR;
}
static enum tu_stage
vk2tu_src_stage(VkPipelineStageFlags2 vk_stages)
vk2tu_src_stage(struct tu_device *dev,
VkPipelineStageFlags2 vk_stages)
{
enum tu_stage stage = TU_STAGE_CP;
enum tu_stage stage = TU_STAGE_BV_CP;
u_foreach_bit64 (bit, vk_stages) {
enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
enum tu_stage new_stage = vk2tu_single_stage(dev, 1ull << bit, false);
stage = MAX2(stage, new_stage);
}
@ -5419,11 +5814,12 @@ vk2tu_src_stage(VkPipelineStageFlags2 vk_stages)
}
static enum tu_stage
vk2tu_dst_stage(VkPipelineStageFlags2 vk_stages)
vk2tu_dst_stage(struct tu_device *dev,
VkPipelineStageFlags2 vk_stages)
{
enum tu_stage stage = TU_STAGE_BOTTOM;
u_foreach_bit64 (bit, vk_stages) {
enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
enum tu_stage new_stage = vk2tu_single_stage(dev, 1ull << bit, true);
stage = MIN2(stage, new_stage);
}
@ -5437,14 +5833,17 @@ tu_flush_for_stage(struct tu_cache_state *cache,
/* Even if the source is the host or CP, the destination access could
* generate invalidates that we have to wait to complete.
*/
if (src_stage == TU_STAGE_CP &&
if (src_stage < TU_STAGE_BR &&
(cache->flush_bits & TU_CMD_FLAG_ALL_INVALIDATE))
src_stage = TU_STAGE_GPU;
src_stage = TU_STAGE_BR;
if (src_stage >= dst_stage) {
cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
if (dst_stage == TU_STAGE_CP)
cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
if (dst_stage <= TU_STAGE_BV) {
cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR;
if (dst_stage == TU_STAGE_BV_CP)
cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
}
}
}
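A condensed standalone model of the barrier decision above; the stage ordering (BV_CP < BV < BR < BOTTOM) follows the enum added to tu_cmd_buffer.h in this commit, while the flag bit values are arbitrary placeholders and the invalidate special case plus the flush/pending split are omitted for brevity.

#include <stdio.h>

/* Order matters: earlier stages run further ahead of BR. */
enum stage { STAGE_BV_CP, STAGE_BV, STAGE_BR, STAGE_BOTTOM };

enum {
   FLAG_WAIT_FOR_IDLE = 1 << 0, /* placeholder bit values */
   FLAG_WAIT_FOR_BR   = 1 << 1,
   FLAG_WAIT_FOR_ME   = 1 << 2,
};

/* A wait is needed whenever the source stage is not strictly earlier than
 * the destination, and destinations that execute on BV (or on the CP ahead
 * of BV) additionally need BR to catch up, mirroring tu_flush_for_stage.
 */
static unsigned flush_for_stage(enum stage src, enum stage dst)
{
   unsigned flags = 0;
   if (src >= dst) {
      flags |= FLAG_WAIT_FOR_IDLE;
      if (dst <= STAGE_BV) {
         flags |= FLAG_WAIT_FOR_BR;
         if (dst == STAGE_BV_CP)
            flags |= FLAG_WAIT_FOR_ME;
      }
   }
   return flags;
}

int main(void)
{
   printf("BR -> BV    : 0x%x\n", flush_for_stage(STAGE_BR, STAGE_BV));
   printf("BV -> BR    : 0x%x\n", flush_for_stage(STAGE_BV, STAGE_BR));
   printf("BR -> BV_CP : 0x%x\n", flush_for_stage(STAGE_BR, STAGE_BV_CP));
   return 0;
}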
@ -5455,6 +5854,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst,
dst->xfb_used |= src->xfb_used;
dst->has_tess |= src->has_tess;
dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
dst->has_vtx_stats_query_in_rp |= src->has_vtx_stats_query_in_rp;
dst->has_zpass_done_sample_count_write_in_rp |= src->has_zpass_done_sample_count_write_in_rp;
dst->disable_gmem |= src->disable_gmem;
dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;
@ -5653,6 +6053,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
secondary_patchpoint) {
struct tu_vis_stream_patchpoint patchpoint =
*secondary_patchpoint;
patchpoint.render_pass_idx += cmd->state.tile_render_pass_count;
if (simultaneous_use) {
tu_cs_reserve_space(cs, 5);
@ -5682,6 +6083,8 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
}
}
cmd->state.tile_render_pass_count +=
secondary->state.tile_render_pass_count;
cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size);
switch (secondary->state.suspend_resume) {
@ -5844,8 +6247,8 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
tu_flush_for_access(cache, src_flags, dst_flags);
enum tu_stage src_stage = vk2tu_src_stage(src_stage_vk);
enum tu_stage dst_stage = vk2tu_dst_stage(dst_stage_vk);
enum tu_stage src_stage = vk2tu_src_stage(cmd_buffer->device, src_stage_vk);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd_buffer->device, dst_stage_vk);
tu_flush_for_stage(cache, src_stage, dst_stage);
}
@ -5975,6 +6378,10 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
CP_COND_REG_EXEC_0_GMEM |
CP_COND_REG_EXEC_0_SYSMEM);
bool emitted_scissor = false;
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
struct tu_render_pass_attachment *att =
@ -5987,6 +6394,8 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol
tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
}
}
tu_cond_exec_end(cs);
}
static void
@ -8906,8 +9315,8 @@ tu_barrier(struct tu_cmd_buffer *cmd,
tu_flush_for_access(cache, src_flags, dst_flags);
enum tu_stage src_stage = vk2tu_src_stage(srcStage);
enum tu_stage dst_stage = vk2tu_dst_stage(dstStage);
enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
tu_flush_for_stage(cache, src_stage, dst_stage);
}
@ -8973,9 +9382,6 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 1);
/* Wait for any writes to the predicate to land */
if (cmd->state.pass)
tu_emit_cache_flush_renderpass<CHIP>(cmd);
@ -8989,23 +9395,72 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
* mandates 32-bit comparisons. Our workaround is to copy the reference
* value to the low 32-bits of a location where the high 32 bits are known
* to be 0 and then compare that.
*
* BR and BV use separate predicate values so that setting the predicate
* doesn't have to be synchronized between them.
*/
if (CHIP >= A7XX) {
if (!cmd->state.pass) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
}
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
CP_COND_REG_EXEC_0_BR);
}
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
tu_cs_emit(cs, 0);
tu_cs_emit_qw(cs, global_iova(cmd, predicate));
tu_cs_emit_qw(cs, iova);
if (CHIP >= A7XX) {
tu_cond_exec_end(cs);
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
CP_COND_REG_EXEC_0_BV);
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
tu_cs_emit(cs, 0);
tu_cs_emit_qw(cs, global_iova(cmd, bv_predicate));
tu_cs_emit_qw(cs, iova);
tu_cond_exec_end(cs);
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 1);
bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
if (CHIP >= A7XX) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
CP_COND_REG_EXEC_0_BR);
}
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
tu_cs_emit_qw(cs, global_iova(cmd, predicate));
if (CHIP >= A7XX) {
tu_cond_exec_end(cs);
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
CP_COND_REG_EXEC_0_BV);
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
tu_cs_emit_qw(cs, global_iova(cmd, bv_predicate));
tu_cond_exec_end(cs);
}
/* Restore original BR thread after setting BOTH */
if (CHIP >= A7XX && !cmd->state.pass) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
}
}
TU_GENX(tu_CmdBeginConditionalRenderingEXT);
template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
@ -9015,9 +9470,20 @@ tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
if (CHIP >= A7XX && !cmd->state.pass) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
}
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0);
if (CHIP >= A7XX && !cmd->state.pass) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
}
}
TU_GENX(tu_CmdEndConditionalRenderingEXT);
template <chip CHIP>
void


@ -193,12 +193,15 @@ enum tu_stage {
* wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
* As a source stage, it is for things needing no waits.
*/
TU_STAGE_CP,
TU_STAGE_BV_CP,
/* This is for operations executed on BV. */
TU_STAGE_BV,
/* This is for most operations, which WFI will wait to finish and will not
* start until any pending WFIs are finished.
*/
TU_STAGE_GPU,
TU_STAGE_BR,
/* This is only used as a destination stage and is for things needing no
* waits on the GPU (e.g. host operations).
@ -223,6 +226,7 @@ enum tu_cmd_flush_bits {
*/
TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,
TU_CMD_FLAG_WAIT_FOR_BR = 1 << 13,
TU_CMD_FLAG_ALL_CLEAN =
TU_CMD_FLAG_CCU_CLEAN_DEPTH |
@ -268,6 +272,7 @@ struct tu_cache_state {
BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
/* Pending flushes */
BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
BITMASK_ENUM(tu_cmd_flush_bits) bv_flush_bits;
};
struct tu_vs_params {
@ -293,6 +298,7 @@ struct tu_render_pass_state
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool has_vtx_stats_query_in_rp;
bool has_zpass_done_sample_count_write_in_rp;
bool disable_gmem;
bool sysmem_single_prim_mode;
@ -578,6 +584,8 @@ struct tu_cmd_state
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
bool vtx_stats_query_running_before_rp;
bool xfb_query_running_before_rp;
bool occlusion_query_may_be_running;
@ -601,6 +609,15 @@ struct tu_cmd_state
uint32_t total_renderpasses;
uint32_t total_dispatches;
unsigned tile_render_pass_count;
};
struct tu_vis_stream_patchpoint {
unsigned render_pass_idx;
uint32_t *data;
uint64_t iova;
uint32_t offset;
};
struct tu_cmd_buffer
@ -618,6 +635,7 @@ struct tu_cmd_buffer
void *patchpoints_ctx;
struct util_dynarray fdm_bin_patchpoints;
struct tu_vis_stream_patchpoint vis_stream_count_patchpoint;
struct util_dynarray vis_stream_patchpoints;
struct util_dynarray vis_stream_bos;
struct util_dynarray vis_stream_cs_bos;
@ -838,12 +856,6 @@ struct tu_fdm_bin_patchpoint {
tu_fdm_bin_apply_t apply;
};
struct tu_vis_stream_patchpoint {
uint32_t *data;
uint64_t iova;
uint32_t offset;
};
struct tu_vis_stream_patchpoint_cs {
struct tu_suballoc_bo cs_bo;
struct tu_suballoc_bo fence_bo;


@ -93,6 +93,8 @@
(MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
A6XX_TEX_CONST_DWORDS
#define TU_MAX_VIS_STREAMS 4
/* With dynamic rendering, input attachment indices are shifted by 1 and
* attachment 0 is used for input attachments without an InputAttachmentIndex
* (which can only be depth/stencil).
@ -151,8 +153,31 @@
enum tu_predicate_bit {
TU_PREDICATE_LOAD_STORE = 0,
TU_PREDICATE_PERFCNTRS = 1,
TU_PREDICATE_CB_ENABLED = 2,
TU_PREDICATE_VTX_STATS_RUNNING = 3,
TU_PREDICATE_VTX_STATS_NOT_RUNNING = 4,
TU_PREDICATE_FIRST_TILE = 5,
};
/* Onchip timestamp register layout. */
enum tu_onchip_addr {
/* Registers 0-7 are defined by firmware to be shared between BR/BV.
*/
/* See tu7_emit_concurrent_binning */
TU_ONCHIP_CB_BR_TIMESTAMP,
TU_ONCHIP_CB_BV_TIMESTAMP,
TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP,
TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP,
TU_ONCHIP_BARRIER,
TU_ONCHIP_CB_RESLIST_OVERFLOW,
/* Registers 8-15 are defined by firmware to be split between BR and BV.
* Each has its own copy.
*/
};
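Since the comment above pins the shared BR/BV range to slots 0-7, a standalone compile-time check along these lines can catch the enum accidentally outgrowing it; this is purely illustrative and mirrors, rather than reuses, the driver enum.

#include <assert.h> /* static_assert (C11) */

/* Mirror of the slot assignments above, kept separate for illustration. */
enum onchip_addr_sketch {
   ONCHIP_CB_BR_TIMESTAMP,
   ONCHIP_CB_BV_TIMESTAMP,
   ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP,
   ONCHIP_CB_BV_DISABLED_TIMESTAMP,
   ONCHIP_BARRIER,
   ONCHIP_CB_RESLIST_OVERFLOW,
   ONCHIP_SHARED_COUNT
};

/* Everything the driver allocates here must fit in the firmware-defined
 * shared range, slots 0-7.
 */
static_assert(ONCHIP_SHARED_COUNT <= 8,
              "shared onchip slots exceed the firmware-defined 0-7 range");

int main(void) { return 0; } /* nothing to run; the check is at compile time */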
#define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)
#define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)


@ -493,6 +493,55 @@ tu7_thread_control(struct tu_cs *cs, enum cp_thread thread)
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(thread));
}
static inline void
tu7_set_pred_mask(struct tu_cs *cs, uint32_t mask, uint32_t val)
{
tu_cs_emit_pkt7(cs, CP_REG_TEST, 3);
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_PRED_UPDATE |
A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
tu_cs_emit(cs, mask);
tu_cs_emit(cs, val);
}
static inline void
tu7_set_pred_bit(struct tu_cs *cs, enum tu_predicate_bit bit, bool val)
{
tu7_set_pred_mask(cs, 1u << bit, val ? (1u << bit) : 0);
}
static inline void
tu7_write_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr)
{
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 2);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_TIMESTAMP_SUM) |
CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
CP_EVENT_WRITE7_0_WRITE_ENABLED);
tu_cs_emit(cs, onchip_addr);
}
static inline void
tu7_wait_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr)
{
tu_cs_emit_pkt7(cs, CP_WAIT_TIMESTAMP, 3);
tu_cs_emit(cs, CP_WAIT_TIMESTAMP_0_WAIT_DST(TS_WAIT_ONCHIP) |
CP_WAIT_TIMESTAMP_0_WAIT_VALUE_SRC(TS_WAIT_GE_TIMESTAMP_SUM));
tu_cs_emit_qw(cs, onchip_addr);
}
static inline void
tu7_wait_onchip_val(struct tu_cs *cs, enum tu_onchip_addr onchip_addr,
uint32_t val)
{
tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
CP_WAIT_REG_MEM_0_POLL(POLL_ON_CHIP));
tu_cs_emit_qw(cs, onchip_addr);
tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(val));
tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0));
}
uint64_t
tu_cs_emit_data_nop(struct tu_cs *cs,
const uint32_t *data,


@ -3046,6 +3046,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->vk.flush_buffer_write_cp = tu_flush_buffer_write_cp;
device->vk.cmd_fill_buffer_addr = tu_cmd_fill_buffer_addr;
device->vis_stream_count = 0;
*pDevice = tu_device_to_handle(device);
return VK_SUCCESS;


@ -255,6 +255,8 @@ struct tu6_global
uint32_t vsc_state[32];
uint64_t bv_predicate;
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
@ -487,6 +489,9 @@ struct tu_device
/* This is an internal queue for mapping/unmapping non-sparse BOs */
uint32_t vm_bind_queue_id;
uint32_t vis_stream_count;
uint32_t vis_stream_size;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)


@ -234,6 +234,7 @@ tu_lrz_init_state(struct tu_cmd_buffer *cmd,
* enabled and there will be a NULL/garbage LRZ buffer.
*/
cmd->state.lrz.image_view = view;
cmd->state.lrz.store = att->store;
if (!clears_depth && !att->load)
return;
@ -412,6 +413,51 @@ tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
}
}
void
tu_lrz_cb_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
/* The LRZ double-buffering guarantees that passes that clear or discard
* depth don't have to worry about LRZ dependencies. However, we do have to
* worry about renderpasses that load depth, because we cannot flip LRZ
* then and have to reuse what the previous pass wrote. There is then a
* write-after-read dependency from an earlier subpass reading LRZ. We
* solve this using CP_RESOURCE_LIST, because the Vulkan user doesn't have
* to track render-and-clear dependencies vs. render-and-render dependencies
* (LOAD_OP_CLEAR happens in the same stage as rendering).
*/
if (!cmd->state.lrz.image_view)
return;
uint64_t iova =
cmd->state.lrz.image_view->image->iova +
cmd->state.lrz.image_view->image->lrz_layout.lrz_offset;
uint64_t fc_iova =
cmd->state.lrz.image_view->image->iova +
cmd->state.lrz.image_view->image->lrz_layout.lrz_fc_offset;
if (cmd->state.lrz.reuse_previous_state) {
tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
tu_cs_emit(cs, 1); /* BV count */
tu_cs_emit_qw(cs, iova | CP_BV_RESOURCE_0_WRITE);
tu_cs_emit(cs, 0); /* BR count */
}
if (cmd->state.lrz.store) {
tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
tu_cs_emit(cs, 0); /* BV count */
tu_cs_emit(cs, CP_RESOURCE_LIST_BR_0_BR_COUNT(1) |
CP_RESOURCE_LIST_BR_0_OVERFLOW |
CP_RESOURCE_LIST_BR_0_OVERFLOW_ONCHIP_ADDR(TU_ONCHIP_CB_RESLIST_OVERFLOW));
tu_cs_emit_qw(cs, iova);
}
/* See tu_lrz_before_tiles() */
tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
tu_cs_emit(cs, 1); /* BV count */
tu_cs_emit_qw(cs, CP_BV_RESOURCE_0_ENCODING(BV_RES_LRZ) | fc_iova);
tu_cs_emit(cs, 0); /* BR count */
}
template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
@ -439,6 +485,16 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
return;
}
/* If CB is dynamically enabled, then this is executed on BV. Flip the
* buffer BV is using.
*/
if (CHIP >= A7XX) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED));
tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLIP);
tu_cond_exec_end(cs);
}
if (!lrz->valid_at_start) {
/* LRZ was never valid, so disable it manually here.
* This is accomplished by making later GRAS_LRZ_CNTL (in binning pass)
@ -488,6 +544,98 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
TU_GENX(tu_lrz_tiling_begin);
template <chip CHIP>
void
tu_lrz_after_bv(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (CHIP < A7XX)
return;
/* BV and BR have different LRZ caches, so flush LRZ cache to be read by
* BR.
*/
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED));
tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
tu_cond_exec_end(cs);
}
TU_GENX(tu_lrz_after_bv);
static void
tu_lrz_clear_resource(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
uint64_t fc_iova =
cmd->state.lrz.image_view->image->iova +
cmd->state.lrz.image_view->image->lrz_layout.lrz_fc_offset;
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
CP_EVENT_WRITE7_0_CLEAR_LRZ_RESOURCE);
tu_cs_emit_qw(cs, fc_iova); /* resource to clear */
}
template <chip CHIP>
void
tu_lrz_before_tiles(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool use_cb)
{
if (CHIP < A7XX)
return;
tu7_set_pred_bit(cs, TU_PREDICATE_FIRST_TILE, true);
if (!cmd->state.lrz.image_view)
return;
/* By clearing the LRZ resource before rendering, we make any future
* binning pass writing to the same LRZ image wait for all renderpasses
* before this one. Crucially this includes any earlier renderpass reading
* from the same LRZ buffer. LRZ is only double-buffered, but it's
* possible to have more than two visibility streams, so this is necessary
* to prevent write-after-read hazards if BV writes the same LRZ image more
* than once before BR reads it.
*
* For example, consider the sequence:
*
* RP 1 clears + writes depth image A
* - BV: Clear + write LRZ image A
* - BR: Read LRZ image A
* RP 2 clears + writes depth image A
* - BV: Clear + write LRZ image A
* - BR: Read LRZ image A
* RP 3 clears + writes depth image A
* - BV: Clear + write LRZ image A
* - BR: Read LRZ image A
*
* RP 1 BV will write to one LRZ image, RP 2 BV will write to the other,
* and then RP 3 BV must stall until RP 1 BR is done reading/writing the
* first LRZ image. Specifying the LRZ resource before BV starts and
* clearing it before BR starts will cause RP 2 BV to stall until RP 1 BR
* starts, which technically isn't necessary, but it will also cause RP 3
* BV to stall until RP 2 BR has started and RP 1 BR has finished.
*
* This pairs with the last CP_RESOURCE_LIST in tu_lrz_cb_begin().
*/
if (use_cb)
tu_lrz_clear_resource(cmd, cs);
}
TU_GENX(tu_lrz_before_tiles);
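The RP 1/2/3 example in the comment above reduces to buffer reuse with period two; a tiny standalone model of which LRZ buffer each LRZ-writing render pass's BV flips to makes the write-after-read hazard visible (one flip per pass is assumed here for simplicity).

#include <stdio.h>

int main(void)
{
   /* With two LRZ buffers and one flip per LRZ-writing render pass, pass N
    * and pass N+2 land in the same buffer, so RP 3's BV write may only start
    * once RP 1's BR is done reading that buffer; this is what the
    * CP_RESOURCE_LIST entry plus CLEAR_LRZ_RESOURCE pairing enforces.
    */
   for (int rp = 1; rp <= 4; rp++)
      printf("RP %d: BV writes LRZ buffer %d\n", rp, (rp - 1) % 2);
   return 0;
}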
static void
tu_lrz_emit_view_info(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
struct tu_lrz_state *lrz = &cmd->state.lrz;
if (lrz->gpu_dir_tracking) {
if (!lrz->valid_at_start) {
/* Make sure we fail the comparison of depth views */
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_VIEW_INFO(.dword = 0));
} else {
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_VIEW_INFO(.dword = lrz->image_view->view.GRAS_LRZ_VIEW_INFO));
}
}
}
/* We need to re-emit LRZ state before each tile due to skipsaverestore.
*/
template <chip CHIP>
@ -501,19 +649,104 @@ tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
} else {
tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
if (lrz->gpu_dir_tracking) {
if (!lrz->valid_at_start) {
/* Make sure we fail the comparison of depth views */
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_VIEW_INFO(.dword = 0));
} else {
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_VIEW_INFO(.dword = lrz->image_view->view.GRAS_LRZ_VIEW_INFO));
if (CHIP >= A7XX) {
/* If CB is dynamically enabled, then flip the buffer BR is using.
* This pairs with the LRZ flip in tu_lrz_tiling_begin. FIRST_TILE is
* set in tu_lrz_before_tiles() and cleared below after the first tile.
*/
if (!lrz->reuse_previous_state) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED));
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_FIRST_TILE));
tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLIP);
tu_cond_exec_end(cs);
tu_cond_exec_end(cs);
}
tu7_set_pred_bit(cs, TU_PREDICATE_FIRST_TILE, false);
}
tu_lrz_emit_view_info(cmd, cs);
}
}
TU_GENX(tu_lrz_before_tile);
template <chip CHIP>
void
tu_lrz_before_sysmem_br(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
struct tu_lrz_state *lrz = &cmd->state.lrz;
if (!lrz->image_view) {
tu6_emit_lrz_buffer<CHIP>(cs, NULL);
} else {
tu_lrz_clear_resource(cmd, cs);
tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
tu_lrz_emit_view_info(cmd, cs);
/* If CB is dynamically enabled, then flip the buffer BR is using.
* This pairs with the LRZ flip in tu_lrz_sysmem_begin.
*/
if (!lrz->reuse_previous_state) {
tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLIP);
/* This shouldn't be necessary, because we should be able to clear
* LRZ on BV and then BR should use the clear value written by BV,
* but there seems to be a HW erratum where the value from the
* register instead of the clear value is sometimes used when LRZ
* writes are disabled. This doesn't seem to be a problem in GMEM
* mode, however.
*
* This is seen with
* dEQP-VK.pipeline.monolithic.color_write_enable.alpha_channel.static.*
*/
if (lrz->fast_clear)
tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_CLEAR(lrz->depth_clear_value.depthStencil.depth));
} else {
/* To work around the same HW erratum as above, but where we don't know
* the clear value, copy the clear value from memory to the register.
* This is tricky because there are two clear values and we have to
* select the right one using CP_COND_EXEC.
*/
const unsigned if_dwords = 4, else_dwords = if_dwords;
uint64_t lrz_fc_iova =
lrz->image_view->image->iova + lrz->image_view->image->lrz_layout.lrz_fc_offset;
uint64_t br_cur_buffer_iova =
lrz_fc_iova + offsetof(fd_lrzfc_layout<A7XX>, br_cur_buffer);
/* Make sure the value is written to memory. */
tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_CLEAN);
tu_cs_emit_wfi(cs);
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
/* if (br_cur_buffer != 0) { */
tu_cs_reserve(cs, 7 + if_dwords + 1 + else_dwords);
tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
tu_cs_emit_qw(cs, br_cur_buffer_iova);
tu_cs_emit_qw(cs, br_cur_buffer_iova);
tu_cs_emit(cs, 2); /* REF */
tu_cs_emit(cs, if_dwords + 1);
/* GRAS_LRZ_DEPTH_CLEAR = lrz_fc->buffer[1].depth_clear_val */
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_GRAS_LRZ_DEPTH_CLEAR));
tu_cs_emit_qw(cs, lrz_fc_iova + offsetof(fd_lrzfc_layout<A7XX>,
buffer[1].depth_clear_val));
/* } else { */
tu_cs_emit_pkt7(cs, CP_NOP, else_dwords);
/* GRAS_LRZ_DEPTH_CLEAR = lrz_fc->buffer[0].depth_clear_val */
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_GRAS_LRZ_DEPTH_CLEAR));
tu_cs_emit_qw(cs, lrz_fc_iova + offsetof(fd_lrzfc_layout<A7XX>,
buffer[0].depth_clear_val));
/* } */
}
}
}
TU_GENX(tu_lrz_before_sysmem_br);
template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
@ -635,8 +868,51 @@ tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
if (!image->lrz_layout.lrz_total_size)
return;
uint64_t lrz_iova = image->iova + image->lrz_layout.lrz_offset;
/* Synchronize writes in BV from subsequent render passes against this
* write in BR.
*/
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1);
tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) |
CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL));
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BV));
tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0);
tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
tu_cs_emit(cs, 0); /* BV count */
tu_cs_emit(cs, CP_RESOURCE_LIST_BR_0_BR_COUNT(1) |
CP_RESOURCE_LIST_BR_0_OVERFLOW |
CP_RESOURCE_LIST_BR_0_OVERFLOW_ONCHIP_ADDR(TU_ONCHIP_CB_RESLIST_OVERFLOW));
tu_cs_emit_qw(cs, lrz_iova);
tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
tu7_thread_control(cs, CP_SET_THREAD_BR);
tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
}
tu6_emit_lrz_buffer<CHIP>(cs, image);
tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE |
CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
CP_EVENT_WRITE7_0_WRITE_ENABLED);
tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW);
tu_cs_emit(cs, 0); /* value */
}
}
TU_GENX(tu_disable_lrz);


@ -51,6 +51,8 @@ struct tu_lrz_state
bool color_written_with_z_test : 1;
bool has_lrz_write_with_skipped_color_writes : 1;
bool store : 1;
enum tu_lrz_direction prev_direction;
};
@ -86,14 +88,29 @@ tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);
void
tu_lrz_cb_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
template <chip CHIP>
void
tu_lrz_after_bv(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
template <chip CHIP>
void
tu_lrz_before_tiles(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool use_cb);
template <chip CHIP>
void
tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
template <chip CHIP>
void
tu_lrz_before_sysmem_br(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);


@ -38,8 +38,10 @@ tu_device_get_u_trace(struct tu_device *device);
/**
* Queue-id's
*/
enum {
DEFAULT_HW_QUEUE_ID,
enum tu_queue_id {
BR_HW_QUEUE_ID,
BV_HW_QUEUE_ID,
/* Labels set via VK_EXT_debug_utils are in a separate track due to the
* following part of the spec:
* "An application may open a debug label region in one command buffer and
@ -67,6 +69,7 @@ enum tu_stage_id {
SECONDARY_CMD_BUFFER_STAGE_ID,
CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
BINNING_STAGE_ID,
CONCURRENT_BINNING_STAGE_ID,
GMEM_STAGE_ID,
BYPASS_STAGE_ID,
BLIT_STAGE_ID,
@ -85,7 +88,8 @@ static const struct {
const char *name;
const char *desc;
} queues[] = {
[DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
[BR_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
[BV_HW_QUEUE_ID] = {"GPU Queue 1", "Adreno Bin Visibility Queue"},
[ANNOTATIONS_QUEUE_ID] = {"Annotations", "Annotations Queue"},
};
@ -99,6 +103,7 @@ static const struct {
[SECONDARY_CMD_BUFFER_STAGE_ID] = { "Secondary Command Buffer" },
[CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID] = { "Annotation", "Render Pass Command Buffer Annotation" },
[BINNING_STAGE_ID] = { "Binning", "Perform Visibility pass and determine target bins" },
[CONCURRENT_BINNING_STAGE_ID] = { "Concurrent Binning", "Perform concurrent Visibility pass and determine target bins" },
[GMEM_STAGE_ID] = { "GMEM", "Rendering to GMEM" },
[BYPASS_STAGE_ID] = { "Bypass", "Rendering to system memory" },
[BLIT_STAGE_ID] = { "Blit", "Performing a Blit operation" },
@ -323,12 +328,17 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
emit_sync_timestamp(clocks);
}
uint32_t queue_id = DEFAULT_HW_QUEUE_ID;
uint32_t queue_id = BR_HW_QUEUE_ID;
switch (stage->stage_id) {
case CMD_BUFFER_ANNOTATION_STAGE_ID:
case CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID:
queue_id = ANNOTATIONS_QUEUE_ID;
break;
/* We only know dynamically whether concurrent binning was enabled. Just
* assume it is and always make binning appear on the BV timeline.
*/
case CONCURRENT_BINNING_STAGE_ID:
queue_id = BV_HW_QUEUE_ID;
default:
break;
}
@ -577,6 +587,7 @@ CREATE_EVENT_CALLBACK(cmd_buffer, CMD_BUFFER_STAGE_ID)
CREATE_EVENT_CALLBACK(secondary_cmd_buffer, SECONDARY_CMD_BUFFER_STAGE_ID)
CREATE_EVENT_CALLBACK(render_pass, RENDER_PASS_STAGE_ID)
CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(concurrent_binning_ib, CONCURRENT_BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)


@ -1099,6 +1099,9 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
cmdbuf->state.prim_counters_running++;
if (cmdbuf->state.pass)
cmdbuf->state.rp.has_vtx_stats_query_in_rp = true;
/* Prevent starting primitive counters when they are supposed to be stopped
* for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
*/
@ -1110,9 +1113,26 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, 0);
if (CHIP >= A7XX) {
/* We need the predicate for determining whether to enable CB, so set
* it for both BR and BV.
*/
if (!cmdbuf->state.pass) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
}
tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
(1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
(1u << TU_PREDICATE_VTX_STATS_RUNNING));
if (!cmdbuf->state.pass) {
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
}
} else {
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, 0);
}
if (need_cond_exec) {
tu_cond_exec_end(cs);
@ -1312,6 +1332,9 @@ emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_regs(cs, A6XX_VPC_SO_QUERY_BASE(.qword = begin_iova));
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
if (!cmdbuf->state.pass)
cmdbuf->state.xfb_query_running_before_rp = true;
}
template <chip CHIP>
@ -1545,24 +1568,39 @@ emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
if (!need_cond_exec) {
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
} else {
tu_cs_reserve(cs, 7 + 2);
/* Check that the pipeline stats query is not running; only then
* do we stop the counter.
*/
tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
if (CHIP >= A7XX) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_VTX_STATS_NOT_RUNNING));
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
tu_cond_exec_end(cs);
} else {
tu_cs_reserve(cs, 7 + 2);
tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
}
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
}
}
if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, 1);
if (CHIP >= A7XX) {
tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
(1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
(1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING));
} else {
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, 1);
}
}
}
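The predicate updates above replace the old vtx_stats_query_not_running memory flag with two complementary predicate bits. Assuming CP_REG_TEST with PRED_UPDATE performs a masked read-modify-write of the predicate register, as it is used throughout this patch, the bookkeeping can be modeled like this (standalone sketch, not driver code):

#include <stdint.h>
#include <stdio.h>

enum { PRED_VTX_STATS_RUNNING = 3, PRED_VTX_STATS_NOT_RUNNING = 4 };

/* Assumed semantics of tu7_set_pred_mask(): bits selected by 'mask' are
 * replaced with the corresponding bits of 'val', other bits are untouched.
 */
static void set_pred_mask(uint32_t *pred, uint32_t mask, uint32_t val)
{
   *pred = (*pred & ~mask) | (val & mask);
}

int main(void)
{
   uint32_t pred = 1u << PRED_VTX_STATS_NOT_RUNNING; /* initial state from tu6_init_hw */
   uint32_t both = (1u << PRED_VTX_STATS_RUNNING) | (1u << PRED_VTX_STATS_NOT_RUNNING);

   /* Begin of a pipeline statistics query: RUNNING set, NOT_RUNNING cleared. */
   set_pred_mask(&pred, both, 1u << PRED_VTX_STATS_RUNNING);
   printf("after begin: 0x%x\n", (unsigned)pred);

   /* End of the query: the two bits flip back, mirroring the old memory flag. */
   set_pred_mask(&pred, both, 1u << PRED_VTX_STATS_NOT_RUNNING);
   printf("after end:   0x%x\n", (unsigned)pred);
   return 0;
}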
@ -1822,6 +1860,9 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
uint64_t available_iova = query_available_iova(pool, query);
if (!cmdbuf->state.pass)
cmdbuf->state.xfb_query_running_before_rp = false;
tu_cs_emit_regs(cs, A6XX_VPC_SO_QUERY_BASE(.qword = end_iova));
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);


@ -104,7 +104,7 @@ get_vis_stream_patchpoint_cs(struct tu_cmd_buffer *cmd,
/* See below for the commands emitted to the CS. */
uint32_t cs_size = 5 *
util_dynarray_num_elements(&cmd->vis_stream_patchpoints,
struct tu_vis_stream_patchpoint) + 6;
struct tu_vis_stream_patchpoint) + 4 + 6;
util_dynarray_foreach (&cmd->vis_stream_cs_bos,
struct tu_vis_stream_patchpoint_cs,
@ -165,8 +165,11 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
struct tu_device *dev = queue->device;
uint32_t max_size = 0;
for (unsigned i = 0; i < cmdbuf_count; i++)
uint32_t rp_count = 0;
for (unsigned i = 0; i < cmdbuf_count; i++) {
max_size = MAX2(max_size, cmd_buffers[i]->vsc_size);
rp_count += cmd_buffers[i]->state.tile_render_pass_count;
}
if (max_size == 0)
return VK_SUCCESS;
@ -174,17 +177,32 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
struct tu_bo *bo = NULL;
VkResult result = VK_SUCCESS;
/* Note, we want to make the vis stream count at least 1 because a
* BV_BR_OFFSET of 0 can lead to hangs even if not using visibility
* streams and therefore should be avoided.
*/
uint32_t min_vis_stream_count =
(TU_DEBUG(NO_CONCURRENT_BINNING) || dev->physical_device->info->chip < 7) ?
1 : MIN2(MAX2(rp_count, 1), TU_MAX_VIS_STREAMS);
uint32_t vis_stream_count;
mtx_lock(&dev->vis_stream_mtx);
if (!dev->vis_stream_bo || max_size > dev->vis_stream_bo->size) {
if (!dev->vis_stream_bo || max_size > dev->vis_stream_size ||
min_vis_stream_count > dev->vis_stream_count) {
dev->vis_stream_count = MAX2(dev->vis_stream_count,
min_vis_stream_count);
dev->vis_stream_size = MAX2(dev->vis_stream_size, max_size);
if (dev->vis_stream_bo)
tu_bo_finish(dev, dev->vis_stream_bo);
result = tu_bo_init_new(dev, &dev->vk.base, &dev->vis_stream_bo,
max_size, TU_BO_ALLOC_INTERNAL_RESOURCE,
dev->vis_stream_size * dev->vis_stream_count,
TU_BO_ALLOC_INTERNAL_RESOURCE,
"visibility stream");
}
bo = dev->vis_stream_bo;
vis_stream_count = dev->vis_stream_count;
mtx_unlock(&dev->vis_stream_mtx);
@ -210,6 +228,8 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
}
}
unsigned render_pass_idx = queue->render_pass_idx;
for (unsigned i = 0; i < cmdbuf_count; i++) {
struct tu_cs cs, sub_cs;
uint64_t fence_iova = 0;
@ -224,7 +244,11 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
util_dynarray_foreach (&cmd_buffers[i]->vis_stream_patchpoints,
struct tu_vis_stream_patchpoint,
patchpoint) {
uint64_t final_iova = bo->iova + patchpoint->offset;
unsigned vis_stream_idx =
(render_pass_idx + patchpoint->render_pass_idx) %
vis_stream_count;
uint64_t final_iova =
bo->iova + vis_stream_idx * max_size + patchpoint->offset;
if (cmd_buffers[i]->usage_flags &
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
@ -237,6 +261,19 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
}
}
struct tu_vis_stream_patchpoint *count_patchpoint =
&cmd_buffers[i]->vis_stream_count_patchpoint;
if (count_patchpoint->data) {
if (cmd_buffers[i]->usage_flags &
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(&sub_cs, count_patchpoint->iova);
tu_cs_emit(&sub_cs, vis_stream_count);
} else {
count_patchpoint->data[0] = vis_stream_count;
}
}
if (cmd_buffers[i]->usage_flags &
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
tu_cs_emit_pkt7(&sub_cs, CP_WAIT_MEM_WRITES, 0);
@ -250,8 +287,12 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);
submit_add_entries(queue->device, submit, dump_cmds, &entry, 1);
}
render_pass_idx += cmd_buffers[i]->state.tile_render_pass_count;
}
queue->render_pass_idx = render_pass_idx;
return VK_SUCCESS;
}
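A standalone sketch of the slot arithmetic used above: each tiled render pass gets a visibility stream slot from a ring of vis_stream_count buffers, and the placeholder recorded at record time is resolved to the slot's address at submit time. The names, stride, and sizes below are illustrative only.

#include <stdint.h>
#include <stdio.h>

struct patchpoint_sketch {
   unsigned render_pass_idx; /* render pass index within its command buffer */
   uint32_t offset;          /* offset of the patched location in the stream */
};

/* Ring assignment: the queue keeps a running render pass index, and pass N
 * uses slot N % vis_stream_count of a BO that holds vis_stream_count streams
 * of stream_size bytes each.
 */
static uint64_t resolve_iova(uint64_t bo_iova, uint32_t stream_size,
                             uint32_t vis_stream_count, unsigned queue_rp_idx,
                             const struct patchpoint_sketch *pp)
{
   unsigned slot = (queue_rp_idx + pp->render_pass_idx) % vis_stream_count;
   return bo_iova + (uint64_t)slot * stream_size + pp->offset;
}

int main(void)
{
   struct patchpoint_sketch pp = { .render_pass_idx = 2, .offset = 0x40 };
   /* The same patchpoint resolves to different slots as the queue advances. */
   for (unsigned queue_rp_idx = 0; queue_rp_idx < 3; queue_rp_idx++)
      printf("queue rp %u -> iova 0x%llx\n", queue_rp_idx,
             (unsigned long long)resolve_iova(0x200000, 0x10000, 4,
                                              queue_rp_idx, &pp));
   return 0;
}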


@ -33,6 +33,8 @@ struct tu_queue
uint32_t sparse_syncobj, gfx_syncobj;
uint64_t sparse_timepoint, gfx_timepoint;
unsigned render_pass_idx;
int fence; /* timestamp/fence of the last queue submission */
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)


@ -135,6 +135,7 @@ begin_end_tp('draw',
], tp_default_enabled=False)
begin_end_tp('binning_ib')
begin_end_tp('concurrent_binning_ib')
begin_end_tp('draw_ib_sysmem')
begin_end_tp('draw_ib_gmem')


@ -54,6 +54,8 @@ static const struct debug_control tu_debug_options[] = {
{ "check_cmd_buffer_status", TU_DEBUG_CHECK_CMD_BUFFER_STATUS },
{ "comm", TU_DEBUG_COMM },
{ "nofdm", TU_DEBUG_NOFDM },
{ "nocb", TU_DEBUG_NO_CONCURRENT_BINNING },
{ "forcecb", TU_DEBUG_FORCE_CONCURRENT_BINNING },
{ NULL, 0 }
};


@ -73,6 +73,8 @@ enum tu_debug_flags : uint64_t
TU_DEBUG_CHECK_CMD_BUFFER_STATUS = BITFIELD64_BIT(32),
TU_DEBUG_COMM = BITFIELD64_BIT(33),
TU_DEBUG_NOFDM = BITFIELD64_BIT(34),
TU_DEBUG_NO_CONCURRENT_BINNING = BITFIELD64_BIT(35),
TU_DEBUG_FORCE_CONCURRENT_BINNING = BITFIELD64_BIT(36),
};
struct tu_env {