tu/a7xx: Support concurrent binning
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36590>
This commit is contained in:
parent 4ac666eaa7
commit ea24dce5e3

19 changed files with 1061 additions and 98 deletions

@@ -7,6 +7,7 @@ tests_per_group = 10000
# force-gmem testing
# Autotuner forces sysmem on most CTS tests
# Also force-enable concurrent binning.
[[deqp]]
deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk"
caselists = ["/deqp-vk/mustpass/vk-main.txt"]

@@ -14,7 +15,7 @@ prefix = "gmem-"
fraction = 20
tests_per_group = 10000
[deqp.env]
TU_DEBUG = "gmem,forcebin"
TU_DEBUG = "gmem,forcebin,forcecb"

# force-gmem with unaligned gmem store testing
[[deqp]]

@@ -42,6 +42,7 @@ enum fd_gpu_event : uint32_t {
   FD_CCU_CLEAN_DEPTH,
   FD_CCU_CLEAN_COLOR,
   FD_LRZ_CLEAR,
   FD_LRZ_FLIP,
   FD_LRZ_FLUSH,
   FD_LRZ_INVALIDATE,
   FD_VSC_BINNING_START,

@@ -84,6 +85,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events<A6XX>[FD_GPU_EVENT_MAX]
   {PC_CCU_FLUSH_DEPTH_TS, true},  /* FD_CCU_CLEAN_DEPTH */
   {PC_CCU_FLUSH_COLOR_TS, true},  /* FD_CCU_CLEAN_COLOR */
   {LRZ_CLEAR, false},             /* FD_LRZ_CLEAR */
   {LRZ_FLUSH, false},             /* FD_LRZ_FLIP */
   {LRZ_FLUSH, false},             /* FD_LRZ_FLUSH */
   {LRZ_CACHE_INVALIDATE, false},  /* FD_LRZ_INVALIDATE */
   {VSC_BINNING_START, false},     /* FD_VSC_BINNING_START */

@@ -115,6 +117,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events<A7XX>[FD_GPU_EVENT_MAX]
   {CCU_CLEAN_DEPTH, false},       /* FD_CCU_CLEAN_DEPTH */
   {CCU_CLEAN_COLOR, false},       /* FD_CCU_CLEAN_COLOR */
   {LRZ_CLEAR, false},             /* FD_LRZ_CLEAR */
   {LRZ_FLIP_BUFFER, false},       /* FD_LRZ_FLIP */
   {LRZ_FLUSH, false},             /* FD_LRZ_FLUSH */
   {LRZ_CACHE_INVALIDATE, false},  /* FD_LRZ_INVALIDATE */
   {VSC_BINNING_START, false},     /* FD_VSC_BINNING_START */

@@ -80,7 +80,10 @@ fdl6_lrz_layout_init(struct fdl_lrz_layout *lrz_layout,
      lrz_layout->lrz_fc_size = 0;
   }

   uint32_t lrz_size = lrz_layout->lrz_buffer_size;
   /* Allocate 2 LRZ buffers for double-buffering on a7xx. */
   uint32_t lrz_size = lrz_layout->lrz_buffer_size *
                       (dev_info->chip >= 7 ? 2 : 1);

   if (dev_info->a6xx.enable_lrz_fast_clear ||
       dev_info->a6xx.has_lrz_dir_tracking) {
      lrz_layout->lrz_fc_offset =

@@ -2133,18 +2133,22 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
    */
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);

   ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
              VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
   ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
                   image->iova + image->lrz_layout.lrz_offset,
                   image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
   uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
   ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
               (VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);
   const unsigned lrz_buffers = CHIP >= A7XX ? 2 : 1;
   for (unsigned i = 0; i < lrz_buffers; i++) {
      ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
                 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
                 VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT);
      ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
      ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
                      image->iova + image->lrz_layout.lrz_offset +
                         i * image->lrz_layout.lrz_buffer_size,
                      image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
      uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
      ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
                  (VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
      ops->run(cmd, cs);
      ops->teardown(cmd, cs);
   }

   /* Clearing writes via CCU color in the PS stage, and LRZ is read via
    * UCHE in the earlier GRAS stage.

@@ -220,6 +220,7 @@ tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                              uint32_t offset)
{
   struct tu_vis_stream_patchpoint patchpoint = {
      .render_pass_idx = cmd->state.tile_render_pass_count,
      .data = cs->cur,
      .iova = tu_cs_get_cur_iova(cs),
      .offset = offset,

@@ -339,12 +340,72 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}

static void
tu7_write_onchip_val(struct tu_cs *cs, enum tu_onchip_addr addr,
                     uint32_t val)
{
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
   tu_cs_emit(cs, CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
                  CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
                  CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                  CP_EVENT_WRITE7_0_WRITE_ENABLED);
   tu_cs_emit_qw(cs, addr);
   tu_cs_emit(cs, val);
}

/* "Normal" cache flushes outside the renderpass, that don't require any special handling */
template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer)
{
   tu6_emit_flushes<CHIP>(cmd_buffer, &cmd_buffer->cs, &cmd_buffer->state.cache);
   struct tu_cs *cs = &cmd_buffer->cs;
   struct tu_cache_state *cache = &cmd_buffer->state.cache;
   BITMASK_ENUM(tu_cmd_flush_bits) flushes = cache->flush_bits;

   tu6_emit_flushes<CHIP>(cmd_buffer, cs, cache);

   if ((flushes & TU_CMD_FLAG_WAIT_FOR_BR) && CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));

      tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1);
      tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) |
                     CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL));

      tu7_thread_control(cs, CP_SET_THREAD_BV);

      tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);

      tu7_thread_control(cs, CP_SET_THREAD_BR);

      /* Wait for the previous WAIT_FOR_BR to execute on BV and reset the wait
       * value.
       */
      tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);

      /* Signal the wait value. */
      tu7_write_onchip_val(cs, TU_ONCHIP_BARRIER, 1);

      tu7_thread_control(cs, CP_SET_THREAD_BV);

      /* Wait for the value. Note that we must use CP_WAIT_REG_MEM due to a
       * firmware bug which makes CP_WAIT_TIMESTAMP on BV deadlock with
       * preemption when BV waits for BR. Without this bug the whole thing
       * would be much, much simpler.
       */
      tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 1);

      /* Reset the wait value. */
      tu7_write_onchip_val(cs, TU_ONCHIP_BARRIER, 0);

      /* Resetting the wait value happens asynchronously (since it's an
       * EVENT_WRITE), but waiting for it happens synchronously. We need to
       * prevent BV from racing ahead to the next wait before it's reset.
       */
      tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 0);

      tu7_thread_control(cs, CP_SET_THREAD_BR);
   }
}
TU_GENX(tu_emit_cache_flush);
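
Note: the WAIT_FOR_BR handling above is essentially a rendezvous between the BR and BV command processors built from one on-chip timestamp and one on-chip scratch value (TU_ONCHIP_BARRIER). Below is a minimal host-side sketch of that handshake using C11 atomics in place of the on-chip registers; the names and structure are hypothetical, not actual turnip code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-ins for the on-chip registers used above. */
static atomic_uint bv_timestamp;   /* bumped by BV when it reaches a barrier */
static atomic_uint barrier_value;  /* the TU_ONCHIP_BARRIER slot */
static unsigned expected_ts;       /* BR-side expected timestamp (MODIFY_TIMESTAMP) */

static void *bv_thread(void *arg)
{
   (void)arg;
   for (int i = 0; i < 3; i++) {
      /* BV announces it has reached barrier i (write_onchip_timestamp). */
      atomic_fetch_add(&bv_timestamp, 1);
      /* BV waits for BR's signal (wait_onchip_val == 1). */
      while (atomic_load(&barrier_value) != 1)
         ;
      /* Reset the slot and make sure the reset landed before the next wait. */
      atomic_store(&barrier_value, 0);
      while (atomic_load(&barrier_value) != 0)
         ;
      printf("BV passed barrier %d\n", i);
   }
   return NULL;
}

int main(void)
{
   pthread_t bv;
   pthread_create(&bv, NULL, bv_thread, NULL);
   for (int i = 0; i < 3; i++) {
      /* BR waits until BV has arrived and the previous barrier is consumed. */
      expected_ts++;
      while (atomic_load(&bv_timestamp) < expected_ts)
         ;
      /* BR releases BV (write_onchip_val(BARRIER, 1)). */
      atomic_store(&barrier_value, 1);
      printf("BR released barrier %d\n", i);
   }
   pthread_join(bv, NULL);
   return 0;
}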

@@ -356,8 +417,11 @@ tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer)
   if (!cmd_buffer->state.renderpass_cache.flush_bits &&
       likely(!tu_env.debug))
      return;
   tu6_emit_flushes<CHIP>(cmd_buffer, &cmd_buffer->draw_cs,
                          &cmd_buffer->state.renderpass_cache);

   struct tu_cs *cs = &cmd_buffer->draw_cs;
   struct tu_cache_state *cache = &cmd_buffer->state.renderpass_cache;

   tu6_emit_flushes<CHIP>(cmd_buffer, cs, cache);
   if (cmd_buffer->state.renderpass_cache.flush_bits &
       TU_CMD_FLAG_BLIT_CACHE_CLEAN) {
      cmd_buffer->state.blit_cache_cleaned = true;

@@ -491,7 +555,7 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                       (CHIP == A6XX ? TU_CMD_FLAG_WAIT_FOR_IDLE : 0));
   }

   tu6_emit_flushes<CHIP>(cmd_buffer, cs, &cmd_buffer->state.cache);
   tu_emit_cache_flush<CHIP>(cmd_buffer);

   if (ccu_state != cmd_buffer->state.ccu_state) {
      emit_rb_ccu_cntl<CHIP>(cs, cmd_buffer->device,

@@ -2116,6 +2180,25 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
   }

   if (CHIP >= A7XX) {
      if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
         tu_cs_set_writeable(cs, true);

      /* This sets the amount BV is allowed to be ahead of BR when we do
       * BV_WAIT_FOR_BR. By setting it based on the vis stream count we
       * prevent write-after-read races with the vis stream.
       */
      tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 2);
      tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_SET_BR_OFFSET));

      struct tu_vis_stream_patchpoint *patchpoint =
         &cmd->vis_stream_count_patchpoint;
      patchpoint->data = cs->cur;
      patchpoint->iova = tu_cs_get_cur_iova(cs);
      tu_cs_emit(cs, 1);

      if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
         tu_cs_set_writeable(cs, false);

      tu7_thread_control(cs, CP_SET_THREAD_BR);
   }

@@ -2137,6 +2220,10 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
                 CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE));

      tu7_thread_control(cs, CP_SET_THREAD_BOTH);

      tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
                            (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
                        (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING));
   }

   tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);

@@ -2238,7 +2325,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
template <chip CHIP>
static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      const VkOffset2D *fdm_offsets)
                      const VkOffset2D *fdm_offsets, bool use_cb)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

@@ -2336,12 +2423,18 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   tu_cs_emit_regs(cs,
                   A6XX_TPL1_WINDOW_OFFSET(.x = 0, .y = 0));

   trace_start_binning_ib(&cmd->trace, cs, cmd);
   if (use_cb)
      trace_start_concurrent_binning_ib(&cmd->trace, cs, cmd);
   else
      trace_start_binning_ib(&cmd->trace, cs, cmd);

   /* emit IB to binning drawcmds: */
   tu_cs_emit_call(cs, &cmd->draw_cs);

   trace_end_binning_ib(&cmd->trace, cs);
   if (use_cb)
      trace_end_concurrent_binning_ib(&cmd->trace, cs);
   else
      trace_end_binning_ib(&cmd->trace, cs);

   /* switching from binning pass to GMEM pass will cause a switch from
    * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)

@@ -2667,6 +2760,46 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd)
   cmd->state.fdm_enabled = cmd->state.pass->has_fdm;
}

static bool
tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                            bool disable_cb)
{
   if (disable_cb ||
       /* LRZ can only be cleared via fast clear in BV. Disable CB if we can't
        * use it.
        */
       !cmd->state.lrz.fast_clear ||
       TU_DEBUG(NO_CONCURRENT_BINNING)) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
                     CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
      tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false);
      return false;
   }
   tu7_thread_control(cs, CP_SET_THREAD_BOTH);

   /* Increment timestamp to make it unique in subsequent commands */
   tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1);
   tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) |
                  CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL));

   /* We initialize the "is concurrent binning enabled?" predicate to true and
    * disable it later if necessary.
    */
   tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, true);

   tu7_thread_control(cs, CP_SET_THREAD_BV);

   /* If there was an overflow in the BR resource table the register will be
    * set to 1 by CP_RESOURCE_LIST. Wait for it to clear here.
    */
   tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0);

   tu_lrz_cb_begin(cmd, cs);

   return true;
}

template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

@@ -2674,8 +2807,40 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   /* It seems that for sysmem render passes we have to use BV to clear LRZ
    * before the renderpass. Otherwise the clear doesn't become visible to
    * subsequent draws when LRZ has been flipped an odd number of times.
    * Presumably this works if concurrent binning is disabled, because the
    * blob relies on this, but that requires synchronizing BR and BV
    * unnecessarily, and we want BV to skip ahead across sysmem renderpasses.
    *
    * In the future, we may also support writing LRZ in BV.
    */
   bool concurrent_binning = false;
   if (CHIP >= A7XX) {
      concurrent_binning = tu7_emit_concurrent_binning(cmd, cs, false);

      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
   }

   tu_lrz_sysmem_begin<CHIP>(cmd, cs);

   if (concurrent_binning) {
      tu_lrz_after_bv<CHIP>(cmd, cs);

      tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);

      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM7_BIN_VISIBILITY_END));

      tu7_thread_control(cs, CP_SET_THREAD_BR);

      tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);

      tu_lrz_before_sysmem_br<CHIP>(cmd, cs);
   }

   assert(fb->width > 0 && fb->height > 0);
   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
   tu6_emit_window_offset<CHIP>(cs, 0, 0);

@@ -2758,9 +2923,155 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

   tu_lrz_sysmem_end<CHIP>(cmd, cs);

   /* Clear the resource list for any LRZ resources we emitted at the
    * beginning.
    */
   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                     CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE |
                     CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
                     CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
                     CP_EVENT_WRITE7_0_WRITE_ENABLED);
      tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW);
      tu_cs_emit(cs, 0); /* value */
   }

   tu_cs_sanity_check(cs);
}

static void
tu7_write_and_wait_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr)
{
   tu7_write_onchip_timestamp(cs, onchip_addr);
   tu7_wait_onchip_timestamp(cs, onchip_addr);
}

static bool
tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                                 bool use_hw_binning)
{
   /* xfb queries use data from the binning pass. If they are running outside
    * of a RP then we may have to deal with a mix of GMEM/sysmem renderpasses
    * where the counters increase on different processors. Just disable CB so
    * that everything happens on BR and we don't need difficult merging of BV
    * and BR results. In addition, RBBM primitive counters seem to not work
    * at all with concurrent binning, so disable if they are running before
    * the RP.
    */
   bool disable_cb =
      cmd->state.xfb_query_running_before_rp ||
      cmd->state.rp.has_prim_generated_query_in_rp ||
      cmd->state.rp.has_vtx_stats_query_in_rp ||
      cmd->state.prim_counters_running > 0;

   if (!tu7_emit_concurrent_binning(cmd, cs, disable_cb || !use_hw_binning))
      return false;

   /* We want to disable concurrent binning if BV isn't far enough ahead of
    * BR. The core idea is to write a timestamp in BR and BV, and compare the
    * BR and BV timestamps for equality. If BR is fast enough, it will write
    * the timestamp ahead of BV and then when BV compares for equality it will
    * find them equal. BR cannot race too far ahead of BV because it must wait
    * for BV's determination to finish, which we do via another timestamp, so
    * either BV is ahead of BR or the timestamps are equal.
    *
    * We need to communicate the determination from BV to BR so they both
    * agree on whether concurrent binning is enabled or not. The easiest way
    * to do it is via a "when was concurrent binning last disabled" timestamp,
    * because we only have to set it when disabling concurrent binning.
    */

   if (!TU_DEBUG(FORCE_CONCURRENT_BINNING)) {
      tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);

      tu7_thread_control(cs, CP_SET_THREAD_BR);
      tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BR_TIMESTAMP);

      tu7_thread_control(cs, CP_SET_THREAD_BV);

      /* If in a secondary, dynamically disable CB if a vtx stats query is
       * running.
       */
      if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
                            CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_VTX_STATS_RUNNING));
      }

      const uint32_t bv_cond_dwords = 3 + 4 + 4;
      tu_cs_reserve(cs, 4 + bv_cond_dwords);

      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 3);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(REG_COMPARE) |
                     CP_COND_REG_EXEC_0_REG0(TU_ONCHIP_CB_BR_TIMESTAMP) |
                     CP_COND_REG_EXEC_0_ONCHIP_MEM);
      tu_cs_emit(cs, REG_COMPARE_CP_COND_REG_EXEC_1_REG1(TU_ONCHIP_CB_BV_TIMESTAMP) |
                     REG_COMPARE_CP_COND_REG_EXEC_1_ONCHIP_MEM);
      tu_cs_emit(cs, bv_cond_dwords);
      if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
         tu_cond_exec_end(cs);
      /* if (BR_TIMESTAMP == BV_TIMESTAMP) */ {
         tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP);
         tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false);
      }
      tu7_write_onchip_timestamp(cs,
                                 TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP);

      tu7_thread_control(cs, CP_SET_THREAD_BR);

      tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP);

      const uint32_t br_cond_dwords = 4;
      tu_cs_reserve(cs, 4 + br_cond_dwords);

      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 3);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(REG_COMPARE) |
                     CP_COND_REG_EXEC_0_REG0(TU_ONCHIP_CB_BR_TIMESTAMP) |
                     CP_COND_REG_EXEC_0_ONCHIP_MEM);
      tu_cs_emit(cs, REG_COMPARE_CP_COND_REG_EXEC_1_REG1(TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP) |
                     REG_COMPARE_CP_COND_REG_EXEC_1_ONCHIP_MEM);
      tu_cs_emit(cs, br_cond_dwords);
      /* if (BR_TIMESTAMP == BV_DISABLED_TIMESTAMP) */ {
         tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false);
      }
   }

   /* At this point BV and BR are agreed on whether CB is enabled. If CB is
    * enabled, set the thread to BV for the binning pass, otherwise set BR and
    * disable concurrent binning.
    */
   tu7_thread_control(cs, CP_SET_THREAD_BOTH);

   const uint32_t if_dwords = 5;
   const uint32_t else_dwords = 2;
   tu_cs_reserve(cs, 3 + if_dwords + else_dwords);

   tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
   tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
                  CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED) |
                  CP_COND_REG_EXEC_0_SKIP_WAIT_FOR_ME);
   tu_cs_emit(cs, if_dwords);
   /* if (CB is enabled) */ {
      tu7_thread_control(cs, CP_SET_THREAD_BV);

      /* Wait for BR vis stream reads to finish */
      tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 1);
      tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_BV_WAIT_FOR_BR));

      /* This is the NOP-as-else trick. If CB is disabled, this CP_NOP is
       * skipped and its body (the else) is executed.
       */
      tu_cs_emit_pkt7(cs, CP_NOP, else_dwords);
   } /* else */ {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
                     CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
   }

   return true;
}

template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
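
Note: the REG_COMPARE sequence above reduces to a simple rule: each processor publishes a per-render-pass timestamp, and BV turns concurrent binning off for this pass exactly when BR's timestamp has caught up with its own. A tiny self-contained sketch of that decision, with hypothetical names (not actual turnip code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical model: each render pass bumps a shared counter and each
 * processor records the last value it has reached.  BV is only useful if it
 * is still ahead of BR when BR arrives at the same point. */
static bool keep_concurrent_binning(uint32_t br_timestamp, uint32_t bv_timestamp)
{
   /* Equal timestamps mean BR already reached this render pass, so binning
    * on BV would not run ahead of rendering; fall back to BR-only.  BR is
    * never ahead of BV here because it waits for BV's determination. */
   return br_timestamp != bv_timestamp;
}

int main(void)
{
   /* BV several passes ahead of BR: keep concurrent binning enabled. */
   printf("bv ahead  -> CB %s\n", keep_concurrent_binning(7, 10) ? "on" : "off");
   /* BR caught up with BV: disable concurrent binning for this pass. */
   printf("br caught -> CB %s\n", keep_concurrent_binning(10, 10) ? "on" : "off");
   return 0;
}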

@@ -2771,31 +3082,38 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   const struct tu_tiling_config *tiling = cmd->state.tiling;
   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
   const struct tu_render_pass *pass = cmd->state.pass;
   bool use_binning = use_hw_binning(cmd);

   tu_lrz_tiling_begin<CHIP>(cmd, cs);
   /* User flushes should always be executed on BR. */
   tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);
   bool use_cb = false;

   if (CHIP >= A7XX) {
      tu7_emit_tile_render_begin_regs(cs);
      use_cb = tu7_emit_concurrent_binning_gmem(cmd, cs, use_binning);
   }

   if (!use_cb)
      tu_trace_start_render_pass(cmd);

   tu_lrz_tiling_begin<CHIP>(cmd, cs);

   /* tu_lrz_tiling_begin() can accumulate additional flushes. If that happens
    * CB should be disabled, so it's safe to just emit them here.
    */
   tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   /* Reset bin scaling. */
   if (phys_dev->info->a7xx.has_hw_bin_scaling) {
      tu_cs_emit_regs(cs, A7XX_GRAS_BIN_FOVEAT());
      tu_cs_emit_regs(cs, A7XX_RB_BIN_FOVEAT());
   }

   tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);

   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
                     CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
   }

   if (use_hw_binning(cmd)) {
   if (use_binning) {
      if (!cmd->vsc_initialized) {
         tu6_lazy_init_vsc(cmd);
      }

@@ -2833,7 +3151,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

      tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, cs, true);

      tu6_emit_binning_pass<CHIP>(cmd, cs, fdm_offsets);
      tu6_emit_binning_pass<CHIP>(cmd, cs, fdm_offsets, use_cb);

      if (CHIP == A6XX) {
         tu_cs_emit_regs(cs,

@@ -2897,6 +3215,40 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

      if (use_binning) {
         tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
         tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BV));

         tu_lrz_after_bv<CHIP>(cmd, cs);

         /* Signal that BV is done for this render pass. This always has to
          * be executed, even when CB is dynamically disabled, because we
          * need to keep BR and BV counts in sync with which visibility
          * streams are in use.
          */
         tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
         tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                        CP_EVENT_WRITE7_0_INC_BV_COUNT);

         /* This mode seems to be only used by BV and signals that a
          * simpler save/restore procedure can be used in between render
          * passes.
          */
         tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
         tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM7_BIN_VISIBILITY_END));
      }

      tu7_thread_control(cs, CP_SET_THREAD_BR);

      if (use_binning) {
         /* Wait for the BV to be done for this render pass. */
         tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 1);
         tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_BR_WAIT_FOR_BV));

         /* Emit vis stream on BR */
         tu_emit_vsc<CHIP>(cmd, cs);
      }

      tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4);
      tu_cs_emit(cs, num_vsc_pipes); /* count */
      tu_cs_emit(cs, 0); /* offset */

@@ -2906,8 +3258,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      if (CHIP >= A7XX &&
          (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
         tu_cs_set_writeable(cs, false);
   } else if (CHIP >= A7XX) {
      /* Earlier we disabled concurrent binning to make LRZ fast-clear work
       * with no HW binning, now re-enable it while staying on BR.
       */
      tu7_thread_control(cs, CP_SET_THREAD_BR);
   }

   tu_lrz_before_tiles<CHIP>(cmd, cs, use_cb);

   if (use_cb)
      tu_trace_start_render_pass(cmd);

   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);

   tu_cs_sanity_check(cs);
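
Note: the INC_BV_COUNT / PIPE_BR_WAIT_FOR_BV / INC_BR_COUNT events used above behave like a bounded producer/consumer counter pair: BV produces one visibility stream per render pass, BR consumes it, and PIPE_SET_BR_OFFSET (programmed from the visibility-stream count; this series defines TU_MAX_VIS_STREAMS as 4) bounds how far BV may run ahead. A small sketch of that bookkeeping, with hypothetical names (not actual turnip code):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the BV/BR render-pass counters.  BV "produces" one
 * visibility stream per render pass (INC_BV_COUNT), BR "consumes" it
 * (PIPE_BR_WAIT_FOR_BV, then INC_BR_COUNT), and BV may run at most
 * max_vis_streams passes ahead (PIPE_SET_BR_OFFSET). */
struct cb_counts {
   unsigned bv_count;        /* render passes binned by BV */
   unsigned br_count;        /* render passes rendered by BR */
   unsigned max_vis_streams; /* how far BV may run ahead */
};

static bool bv_may_start_next_pass(const struct cb_counts *c)
{
   /* BV must not overwrite a visibility stream BR has not read yet. */
   return c->bv_count - c->br_count < c->max_vis_streams;
}

static bool br_may_start_next_pass(const struct cb_counts *c)
{
   /* BR must not read a visibility stream BV has not produced yet. */
   return c->br_count < c->bv_count;
}

int main(void)
{
   struct cb_counts c = { .bv_count = 0, .br_count = 0, .max_vis_streams = 4 };
   for (int step = 0; step < 12; step++) {
      if (bv_may_start_next_pass(&c))
         c.bv_count++;          /* BV bins one more pass */
      else if (br_may_start_next_pass(&c))
         c.br_count++;          /* BV is blocked; BR catches up */
      printf("step %2d: bv=%u br=%u\n", step, c.bv_count, c.br_count);
   }
   return 0;
}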

@@ -2982,12 +3344,30 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

   tu_lrz_tiling_end<CHIP>(cmd, cs);

   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);

   if (CHIP >= A7XX) {
      tu7_thread_control(cs, CP_SET_THREAD_BR);
   bool hw_binning = use_hw_binning(cmd);
   if (hw_binning) {
      cmd->state.tile_render_pass_count++;
   }

   /* If we are using HW binning, signal that we are done with reading the vis
    * stream for this render pass by advancing the counter. Also clear render
    * resources, currently only used for LRZ, and reset the overflow onchip
    * register.
    */
   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                     COND(hw_binning, CP_EVENT_WRITE7_0_INC_BR_COUNT) |
                     CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE |
                     CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
                     CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
                     CP_EVENT_WRITE7_0_WRITE_ENABLED);
      tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW);
      tu_cs_emit(cs, 0); /* value */
   }

   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);

   tu_cs_sanity_check(cs);
}

@@ -3354,8 +3734,6 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
   tu_cs_end(&cmd->tile_store_cs);

   tu_trace_start_render_pass(cmd);

   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);

   /* Note: we reverse the order of walking the pipes and tiles on every

@@ -5370,7 +5748,8 @@ sanitize_dst_stage(VkPipelineStageFlags2 stage_mask)
}

static enum tu_stage
vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
vk2tu_single_stage(struct tu_device *dev,
                   VkPipelineStageFlags2 vk_stage, bool dst)
{
   /* If the destination stage is executed on the CP, then the CP also has to
    * wait for any WFI's to finish. This is already done for draw calls,

@@ -5394,24 +5773,40 @@ vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
   if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT ||
       vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT ||
       vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT)
      return TU_STAGE_CP;
      return TU_STAGE_BV_CP;

   if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT ||
       vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)
      return dst ? TU_STAGE_CP : TU_STAGE_GPU;
      return dst ? TU_STAGE_BV_CP : TU_STAGE_BR;

   if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT)
      return dst ? TU_STAGE_BOTTOM : TU_STAGE_CP;
      return dst ? TU_STAGE_BOTTOM : TU_STAGE_BV_CP;

   return TU_STAGE_GPU;
   if (dev->physical_device->info->chip >= 7) {
      if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT ||
          vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT ||
          vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) {
         return dst ? TU_STAGE_BV : TU_STAGE_BR;
      }
   }

   return TU_STAGE_BR;
}

static enum tu_stage
vk2tu_src_stage(VkPipelineStageFlags2 vk_stages)
vk2tu_src_stage(struct tu_device *dev,
                VkPipelineStageFlags2 vk_stages)
{
   enum tu_stage stage = TU_STAGE_CP;
   enum tu_stage stage = TU_STAGE_BV_CP;
   u_foreach_bit64 (bit, vk_stages) {
      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
      enum tu_stage new_stage = vk2tu_single_stage(dev, 1ull << bit, false);
      stage = MAX2(stage, new_stage);
   }

@@ -5419,11 +5814,12 @@ vk2tu_src_stage(VkPipelineStageFlags2 vk_stages)
}

static enum tu_stage
vk2tu_dst_stage(VkPipelineStageFlags2 vk_stages)
vk2tu_dst_stage(struct tu_device *dev,
                VkPipelineStageFlags2 vk_stages)
{
   enum tu_stage stage = TU_STAGE_BOTTOM;
   u_foreach_bit64 (bit, vk_stages) {
      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
      enum tu_stage new_stage = vk2tu_single_stage(dev, 1ull << bit, true);
      stage = MIN2(stage, new_stage);
   }

@@ -5437,14 +5833,17 @@ tu_flush_for_stage(struct tu_cache_state *cache,
   /* Even if the source is the host or CP, the destination access could
    * generate invalidates that we have to wait to complete.
    */
   if (src_stage == TU_STAGE_CP &&
   if (src_stage < TU_STAGE_BR &&
       (cache->flush_bits & TU_CMD_FLAG_ALL_INVALIDATE))
      src_stage = TU_STAGE_GPU;
      src_stage = TU_STAGE_BR;

   if (src_stage >= dst_stage) {
      cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
      if (dst_stage == TU_STAGE_CP)
         cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
      if (dst_stage <= TU_STAGE_BV) {
         cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR;
         if (dst_stage == TU_STAGE_BV_CP)
            cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
      }
   }
}
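
Note: tu_flush_for_stage above relies on the reworked tu_stage enum being ordered from earliest consumer (TU_STAGE_BV_CP) to latest (TU_STAGE_BOTTOM), so a single >= comparison decides whether a barrier needs a cross-pipe wait. A minimal sketch of that ordering logic, with hypothetical names (not actual turnip code):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mirror of the ordering used by the driver: earlier stages
 * compare smaller, so src >= dst means the destination could start before
 * the source work has finished and a wait must be inserted. */
enum stage { STAGE_BV_CP, STAGE_BV, STAGE_BR, STAGE_BOTTOM };

struct waits {
   bool wait_for_br;  /* make BV/CP wait for BR to go idle */
   bool wait_for_me;  /* the CP must additionally wait for its own WFIs */
};

static struct waits flush_for_stage(enum stage src, enum stage dst)
{
   struct waits w = { false, false };
   if (src >= dst && dst <= STAGE_BV) {
      w.wait_for_br = true;
      w.wait_for_me = (dst == STAGE_BV_CP);
   }
   return w;
}

int main(void)
{
   struct waits w = flush_for_stage(STAGE_BR, STAGE_BV_CP);
   printf("BR -> BV_CP: wait_for_br=%d wait_for_me=%d\n", w.wait_for_br, w.wait_for_me);
   w = flush_for_stage(STAGE_BV_CP, STAGE_BOTTOM);
   printf("BV_CP -> BOTTOM: wait_for_br=%d wait_for_me=%d\n", w.wait_for_br, w.wait_for_me);
   return 0;
}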

@@ -5455,6 +5854,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst,
   dst->xfb_used |= src->xfb_used;
   dst->has_tess |= src->has_tess;
   dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
   dst->has_vtx_stats_query_in_rp |= src->has_vtx_stats_query_in_rp;
   dst->has_zpass_done_sample_count_write_in_rp |= src->has_zpass_done_sample_count_write_in_rp;
   dst->disable_gmem |= src->disable_gmem;
   dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;

@@ -5653,6 +6053,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                           secondary_patchpoint) {
         struct tu_vis_stream_patchpoint patchpoint =
            *secondary_patchpoint;
         patchpoint.render_pass_idx += cmd->state.tile_render_pass_count;

         if (simultaneous_use) {
            tu_cs_reserve_space(cs, 5);

@@ -5682,6 +6083,8 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
         }
      }

      cmd->state.tile_render_pass_count +=
         secondary->state.tile_render_pass_count;
      cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size);

      switch (secondary->state.suspend_resume) {

@@ -5844,8 +6247,8 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,

   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(src_stage_vk);
   enum tu_stage dst_stage = vk2tu_dst_stage(dst_stage_vk);
   enum tu_stage src_stage = vk2tu_src_stage(cmd_buffer->device, src_stage_vk);
   enum tu_stage dst_stage = vk2tu_dst_stage(cmd_buffer->device, dst_stage_vk);
   tu_flush_for_stage(cache, src_stage, dst_stage);
}

@@ -5975,6 +6378,10 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol
   struct tu_cs *cs = &cmd->draw_cs;
   uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;

   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                      CP_COND_REG_EXEC_0_GMEM |
                      CP_COND_REG_EXEC_0_SYSMEM);

   bool emitted_scissor = false;
   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
      struct tu_render_pass_attachment *att =

@@ -5987,6 +6394,8 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol
         tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
      }
   }

   tu_cond_exec_end(cs);
}

static void

@@ -8906,8 +9315,8 @@ tu_barrier(struct tu_cmd_buffer *cmd,

   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(srcStage);
   enum tu_stage dst_stage = vk2tu_dst_stage(dstStage);
   enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage);
   enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage);
   tu_flush_for_stage(cache, src_stage, dst_stage);
}

@@ -8973,9 +9382,6 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,

   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 1);

   /* Wait for any writes to the predicate to land */
   if (cmd->state.pass)
      tu_emit_cache_flush_renderpass<CHIP>(cmd);

@@ -8989,23 +9395,72 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
    * mandates 32-bit comparisons. Our workaround is to copy the reference
    * value to the low 32-bits of a location where the high 32 bits are known
    * to be 0 and then compare that.
    *
    * BR and BV use separate predicate values so that setting the predicate
    * doesn't have to be synchronized between them.
    */
   if (CHIP >= A7XX) {
      if (!cmd->state.pass) {
         tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
         tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
      }
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
                         CP_COND_REG_EXEC_0_BR);
   }

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   tu_cs_emit(cs, 0);
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
   tu_cs_emit_qw(cs, iova);

   if (CHIP >= A7XX) {
      tu_cond_exec_end(cs);
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
                         CP_COND_REG_EXEC_0_BV);
      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
      tu_cs_emit(cs, 0);
      tu_cs_emit_qw(cs, global_iova(cmd, bv_predicate));
      tu_cs_emit_qw(cs, iova);
      tu_cond_exec_end(cs);
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 1);

   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   if (CHIP >= A7XX) {
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
                         CP_COND_REG_EXEC_0_BR);
   }
   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));

   if (CHIP >= A7XX) {
      tu_cond_exec_end(cs);
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
                         CP_COND_REG_EXEC_0_BV);
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
      tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
                     CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
      tu_cs_emit_qw(cs, global_iova(cmd, bv_predicate));
      tu_cond_exec_end(cs);
   }

   /* Restore original BR thread after setting BOTH */
   if (CHIP >= A7XX && !cmd->state.pass) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
   }
}
TU_GENX(tu_CmdBeginConditionalRenderingEXT);

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{

@@ -9015,9 +9470,20 @@ tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)

   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   if (CHIP >= A7XX && !cmd->state.pass) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
   }

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0);

   if (CHIP >= A7XX && !cmd->state.pass) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
   }
}
TU_GENX(tu_CmdEndConditionalRenderingEXT);

template <chip CHIP>
void

@@ -193,12 +193,15 @@ enum tu_stage {
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,
   TU_STAGE_BV_CP,

   /* This is for operations executed on BV. */
   TU_STAGE_BV,

   /* This is for most operations, which WFI will wait to finish and will not
    * start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,
   TU_STAGE_BR,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).

@@ -223,6 +226,7 @@ enum tu_cmd_flush_bits {
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
   TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,
   TU_CMD_FLAG_WAIT_FOR_BR = 1 << 13,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |

@@ -268,6 +272,7 @@ struct tu_cache_state {
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Pending flushes */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
   BITMASK_ENUM(tu_cmd_flush_bits) bv_flush_bits;
};

struct tu_vs_params {

@@ -293,6 +298,7 @@ struct tu_render_pass_state
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_vtx_stats_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;

@@ -578,6 +584,8 @@ struct tu_cmd_state
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;
   bool vtx_stats_query_running_before_rp;
   bool xfb_query_running_before_rp;

   bool occlusion_query_may_be_running;

@@ -601,6 +609,15 @@ struct tu_cmd_state

   uint32_t total_renderpasses;
   uint32_t total_dispatches;

   unsigned tile_render_pass_count;
};

struct tu_vis_stream_patchpoint {
   unsigned render_pass_idx;
   uint32_t *data;
   uint64_t iova;
   uint32_t offset;
};

struct tu_cmd_buffer

@@ -618,6 +635,7 @@ struct tu_cmd_buffer
   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   struct tu_vis_stream_patchpoint vis_stream_count_patchpoint;
   struct util_dynarray vis_stream_patchpoints;
   struct util_dynarray vis_stream_bos;
   struct util_dynarray vis_stream_cs_bos;

@@ -838,12 +856,6 @@ struct tu_fdm_bin_patchpoint {
   tu_fdm_bin_apply_t apply;
};

struct tu_vis_stream_patchpoint {
   uint32_t *data;
   uint64_t iova;
   uint32_t offset;
};

struct tu_vis_stream_patchpoint_cs {
   struct tu_suballoc_bo cs_bo;
   struct tu_suballoc_bo fence_bo;

@@ -93,6 +93,8 @@
   (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
      A6XX_TEX_CONST_DWORDS

#define TU_MAX_VIS_STREAMS 4

/* With dynamic rendering, input attachment indices are shifted by 1 and
 * attachment 0 is used for input attachments without an InputAttachmentIndex
 * (which can only be depth/stencil).

@@ -151,8 +153,31 @@
enum tu_predicate_bit {
   TU_PREDICATE_LOAD_STORE = 0,
   TU_PREDICATE_PERFCNTRS = 1,
   TU_PREDICATE_CB_ENABLED = 2,
   TU_PREDICATE_VTX_STATS_RUNNING = 3,
   TU_PREDICATE_VTX_STATS_NOT_RUNNING = 4,
   TU_PREDICATE_FIRST_TILE = 5,
};

/* Onchip timestamp register layout. */
enum tu_onchip_addr {
   /* Registers 0-7 are defined by firmware to be shared between BR/BV.
    */

   /* See tu7_emit_concurrent_binning */
   TU_ONCHIP_CB_BR_TIMESTAMP,
   TU_ONCHIP_CB_BV_TIMESTAMP,
   TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP,
   TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP,
   TU_ONCHIP_BARRIER,
   TU_ONCHIP_CB_RESLIST_OVERFLOW,

   /* Registers 8-15 are defined by firmware to be split between BR and BV.
    * Each has their own copy.
    */
};

#define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)

#define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)

@@ -493,6 +493,55 @@ tu7_thread_control(struct tu_cs *cs, enum cp_thread thread)
   tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(thread));
}

static inline void
tu7_set_pred_mask(struct tu_cs *cs, uint32_t mask, uint32_t val)
{
   tu_cs_emit_pkt7(cs, CP_REG_TEST, 3);
   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_PRED_UPDATE |
                  A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
   tu_cs_emit(cs, mask);
   tu_cs_emit(cs, val);
}

static inline void
tu7_set_pred_bit(struct tu_cs *cs, enum tu_predicate_bit bit, bool val)
{
   tu7_set_pred_mask(cs, 1u << bit, val ? (1u << bit) : 0);
}

static inline void
tu7_write_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr)
{
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 2);
   tu_cs_emit(cs, CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
                  CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_TIMESTAMP_SUM) |
                  CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                  CP_EVENT_WRITE7_0_WRITE_ENABLED);
   tu_cs_emit(cs, onchip_addr);
}

static inline void
tu7_wait_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr)
{
   tu_cs_emit_pkt7(cs, CP_WAIT_TIMESTAMP, 3);
   tu_cs_emit(cs, CP_WAIT_TIMESTAMP_0_WAIT_DST(TS_WAIT_ONCHIP) |
                  CP_WAIT_TIMESTAMP_0_WAIT_VALUE_SRC(TS_WAIT_GE_TIMESTAMP_SUM));
   tu_cs_emit_qw(cs, onchip_addr);
}

static inline void
tu7_wait_onchip_val(struct tu_cs *cs, enum tu_onchip_addr onchip_addr,
                    uint32_t val)
{
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                  CP_WAIT_REG_MEM_0_POLL(POLL_ON_CHIP));
   tu_cs_emit_qw(cs, onchip_addr);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(val));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0));
}

uint64_t
tu_cs_emit_data_nop(struct tu_cs *cs,
                    const uint32_t *data,
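
Note: conceptually the helpers above treat the on-chip scratch registers as a tiny shared memory that either pipe can write with an EVENT_WRITE7 to EV_DST_ONCHIP and poll with CP_WAIT_REG_MEM or CP_WAIT_TIMESTAMP. A bare-bones software model of the value write/poll pair, with hypothetical names (not actual turnip code):

#include <stdio.h>

/* Hypothetical model of the shared on-chip scratch registers. */
#define ONCHIP_REG_COUNT 16
static unsigned onchip[ONCHIP_REG_COUNT];

/* Model of tu7_write_onchip_val(): plain 32-bit store to a scratch slot. */
static void write_onchip_val(unsigned addr, unsigned val)
{
   onchip[addr] = val;
}

/* Model of tu7_wait_onchip_val(): poll until (reg & mask) == ref, like
 * CP_WAIT_REG_MEM with FUNCTION(WRITE_EQ), MASK(~0), DELAY_LOOP_CYCLES(0).
 * In this single-threaded model the value was already written, so the loop
 * exits immediately; on hardware the other pipe supplies the write. */
static void wait_onchip_val(unsigned addr, unsigned ref)
{
   while ((onchip[addr] & ~0u) != ref)
      ;
}

int main(void)
{
   write_onchip_val(5, 1);
   wait_onchip_val(5, 1);
   printf("onchip[5] = %u\n", onchip[5]);
   return 0;
}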

@@ -3046,6 +3046,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   device->vk.flush_buffer_write_cp = tu_flush_buffer_write_cp;
   device->vk.cmd_fill_buffer_addr = tu_cmd_fill_buffer_addr;

   device->vis_stream_count = 0;

   *pDevice = tu_device_to_handle(device);
   return VK_SUCCESS;

@@ -255,6 +255,8 @@ struct tu6_global

   uint32_t vsc_state[32];

   uint64_t bv_predicate;

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */

@@ -487,6 +489,9 @@ struct tu_device

   /* This is an internal queue for mapping/unmapping non-sparse BOs */
   uint32_t vm_bind_queue_id;

   uint32_t vis_stream_count;
   uint32_t vis_stream_size;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

@@ -234,6 +234,7 @@ tu_lrz_init_state(struct tu_cmd_buffer *cmd,
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;
   cmd->state.lrz.store = att->store;

   if (!clears_depth && !att->load)
      return;

@@ -412,6 +413,51 @@ tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
   }
}

void
tu_lrz_cb_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* The LRZ double-buffering guarantees that passes that clear or discard
    * depth don't have to worry about LRZ dependencies. However we do have to
    * worry about renderpasses that load depth, because we cannot flip LRZ
    * then and have to reuse what the previous pass wrote. There is then a
    * write-after-read dependency from an earlier subpass reading LRZ. We
    * solve this using CP_RESOURCE_LIST, because the Vulkan user doesn't have
    * to track render-and-clear dependencies vs. render-and-render dependencies
    * (LOAD_OP_CLEAR happens in the same stage as rendering).
    */
   if (!cmd->state.lrz.image_view)
      return;

   uint64_t iova =
      cmd->state.lrz.image_view->image->iova +
      cmd->state.lrz.image_view->image->lrz_layout.lrz_offset;
   uint64_t fc_iova =
      cmd->state.lrz.image_view->image->iova +
      cmd->state.lrz.image_view->image->lrz_layout.lrz_fc_offset;

   if (cmd->state.lrz.reuse_previous_state) {
      tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
      tu_cs_emit(cs, 1); /* BV count */
      tu_cs_emit_qw(cs, iova | CP_BV_RESOURCE_0_WRITE);
      tu_cs_emit(cs, 0); /* BR count */
   }

   if (cmd->state.lrz.store) {
      tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
      tu_cs_emit(cs, 0); /* BV count */
      tu_cs_emit(cs, CP_RESOURCE_LIST_BR_0_BR_COUNT(1) |
                     CP_RESOURCE_LIST_BR_0_OVERFLOW |
                     CP_RESOURCE_LIST_BR_0_OVERFLOW_ONCHIP_ADDR(TU_ONCHIP_CB_RESLIST_OVERFLOW));
      tu_cs_emit_qw(cs, iova);
   }

   /* See tu_lrz_before_tiles() */
   tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
   tu_cs_emit(cs, 1); /* BV count */
   tu_cs_emit_qw(cs, CP_BV_RESOURCE_0_ENCODING(BV_RES_LRZ) | fc_iova);
   tu_cs_emit(cs, 0); /* BR count */
}

template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)

@@ -439,6 +485,16 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
      return;
   }

   /* If CB is dynamically enabled, then this is executed on BV. Flip the
    * buffer BV is using.
    */
   if (CHIP >= A7XX) {
      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
                         CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED));
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLIP);
      tu_cond_exec_end(cs);
   }

   if (!lrz->valid_at_start) {
      /* If LRZ was never valid, disable it manually here.
       * This is accomplished by making later GRAS_LRZ_CNTL (in binning pass)

@@ -488,6 +544,98 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
TU_GENX(tu_lrz_tiling_begin);

template <chip CHIP>
void
tu_lrz_after_bv(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (CHIP < A7XX)
      return;

   /* BV and BR have different LRZ caches, so flush LRZ cache to be read by
    * BR.
    */
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
                      CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED));
   tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
   tu_cond_exec_end(cs);
}
TU_GENX(tu_lrz_after_bv);

static void
tu_lrz_clear_resource(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   uint64_t fc_iova =
      cmd->state.lrz.image_view->image->iova +
      cmd->state.lrz.image_view->image->lrz_layout.lrz_fc_offset;

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
   tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                  CP_EVENT_WRITE7_0_CLEAR_LRZ_RESOURCE);
   tu_cs_emit_qw(cs, fc_iova); /* resource to clear */
}

template <chip CHIP>
void
tu_lrz_before_tiles(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool use_cb)
{
   if (CHIP < A7XX)
      return;

   tu7_set_pred_bit(cs, TU_PREDICATE_FIRST_TILE, true);

   if (!cmd->state.lrz.image_view)
      return;

   /* By clearing the LRZ resource before rendering, we make any future
    * binning pass writing to the same LRZ image wait for all renderpasses
    * before this one. Crucially this includes any earlier renderpass reading
    * from the same LRZ buffer. Because LRZ is only double-buffered but it's
    * possible to have more than two visibility streams, this is necessary to
    * prevent write-after-read hazards if BV writes the same LRZ image more
    * than once before BR reads it.
    *
    * For example, consider the sequence:
    *
    * RP 1 clears + writes depth image A
    * - BV: Clear + write LRZ image A
    * - BR: Read LRZ image A
    * RP 2 clears + writes depth image A
    * - BV: Clear + write LRZ image A
    * - BR: Read LRZ image A
    * RP 3 clears + writes depth image A
    * - BV: Clear + write LRZ image A
    * - BR: Read LRZ image A
    *
    * RP 1 BV will write to one LRZ image, RP 2 BV will write to the other,
    * and then RP 3 BV must stall until RP 1 BR is done reading/writing the
    * first LRZ image. Specifying the LRZ resource before BV starts and
    * clearing it before BR starts will cause RP 2 BV to stall until RP 1 BR
    * starts, which technically isn't necessary, but it will also cause RP 3
    * BV to stall until RP 2 BR has started and RP 1 BR has finished.
    *
    * This pairs with the last CP_RESOURCE_LIST in tu_lrz_cb_begin().
    */
   if (use_cb)
      tu_lrz_clear_resource(cmd, cs);
}
TU_GENX(tu_lrz_before_tiles);

static void
tu_lrz_emit_view_info(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (lrz->gpu_dir_tracking) {
      if (!lrz->valid_at_start) {
         /* Make sure we fail the comparison of depth views */
         tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_VIEW_INFO(.dword = 0));
      } else {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_VIEW_INFO(.dword = lrz->image_view->view.GRAS_LRZ_VIEW_INFO));
      }
   }
}

/* We need to re-emit LRZ state before each tile due to skipsaverestore.
 */
template <chip CHIP>
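
Note: the comment in tu_lrz_before_tiles above describes a bounded-buffer hazard: LRZ is double-buffered, so BV's write for render pass N must not start until BR has finished render pass N-2. A small self-contained sketch of that constraint, with hypothetical names (not actual turnip code):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model: BV writes LRZ slot (pass % 2) during binning and BR
 * reads the same slot while rendering that pass.  With only two slots, BV
 * may be at most two passes ahead of BR, otherwise its write would clobber
 * a slot BR has not read yet (the write-after-read hazard described above). */
static bool bv_may_start_pass(unsigned bv_started, unsigned br_finished)
{
   return bv_started - br_finished < 2;
}

int main(void)
{
   unsigned bv = 0, br = 0;
   for (int step = 0; step < 8; step++) {
      if (bv_may_start_pass(bv, br))
         bv++;                       /* BV bins another pass */
      else
         br++;                       /* BV stalls until BR finishes one */
      printf("step %d: bv=%u br=%u (gap %u)\n", step, bv, br, bv - br);
   }
   return 0;
}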
@ -501,19 +649,104 @@ tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
|
|||
} else {
|
||||
tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
|
||||
|
||||
if (lrz->gpu_dir_tracking) {
|
||||
if (!lrz->valid_at_start) {
|
||||
/* Make sure we fail the comparison of depth views */
|
||||
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_VIEW_INFO(.dword = 0));
|
||||
} else {
|
||||
tu6_write_lrz_reg(cmd, cs,
|
||||
A6XX_GRAS_LRZ_VIEW_INFO(.dword = lrz->image_view->view.GRAS_LRZ_VIEW_INFO));
|
||||
if (CHIP >= A7XX) {
|
||||
/* If CB is dynamically enabled, then flip the buffer BR is using.
|
||||
* This pairs with the LRZ flip in tu_lrz_tiling_begin. FIRST_TILE is
|
||||
* cleared in tu_lrz_before_tiles().
|
||||
*/
|
||||
if (!lrz->reuse_previous_state) {
|
||||
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
|
||||
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED));
|
||||
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
|
||||
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_FIRST_TILE));
|
||||
tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLIP);
|
||||
tu_cond_exec_end(cs);
|
||||
tu_cond_exec_end(cs);
|
||||
}
|
||||
|
||||
tu7_set_pred_bit(cs, TU_PREDICATE_FIRST_TILE, false);
|
||||
}
|
||||
|
||||
tu_lrz_emit_view_info(cmd, cs);
|
||||
}
|
||||
}
|
||||
TU_GENX(tu_lrz_before_tile);
|
||||
|
||||
template <chip CHIP>
void
tu_lrz_before_sysmem_br(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (!lrz->image_view) {
      tu6_emit_lrz_buffer<CHIP>(cs, NULL);
   } else {
      tu_lrz_clear_resource(cmd, cs);

      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

      tu_lrz_emit_view_info(cmd, cs);

      /* If CB is dynamically enabled, then flip the buffer BR is using.
       * This pairs with the LRZ flip in tu_lrz_sysmem_begin.
       */
      if (!lrz->reuse_previous_state) {
         tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLIP);

         /* This shouldn't be necessary, because we should be able to clear
          * LRZ on BV and then BR should use the clear value written by BV,
          * but there seems to be a HW errata where the value from the
          * register instead of the clear value is sometimes used when LRZ
          * writes are disabled. This doesn't seem to be a problem in GMEM
          * mode, however.
          *
          * This is seen with
          * dEQP-VK.pipeline.monolithic.color_write_enable.alpha_channel.static.*
          */
         if (lrz->fast_clear)
            tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_CLEAR(lrz->depth_clear_value.depthStencil.depth));
      } else {
         /* To work around the same HW errata as above, but where we don't
          * know the clear value, copy the clear value from memory to the
          * register. This is tricky because there are two and we have to
          * select the right one using CP_COND_EXEC.
          */
         const unsigned if_dwords = 4, else_dwords = if_dwords;
         uint64_t lrz_fc_iova =
            lrz->image_view->image->iova + lrz->image_view->image->lrz_layout.lrz_fc_offset;
         uint64_t br_cur_buffer_iova =
            lrz_fc_iova + offsetof(fd_lrzfc_layout<A7XX>, br_cur_buffer);

         /* Make sure the value is written to memory. */
         tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_CLEAN);
         tu_cs_emit_wfi(cs);
         tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

         /* if (br_cur_buffer != 0) { */
         tu_cs_reserve(cs, 7 + if_dwords + 1 + else_dwords);
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, br_cur_buffer_iova);
         tu_cs_emit_qw(cs, br_cur_buffer_iova);
         tu_cs_emit(cs, 2); /* REF */
         tu_cs_emit(cs, if_dwords + 1);
         /* GRAS_LRZ_DEPTH_CLEAR = lrz_fc->buffer[1].depth_clear_val */
         tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
         tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_GRAS_LRZ_DEPTH_CLEAR));
         tu_cs_emit_qw(cs, lrz_fc_iova + offsetof(fd_lrzfc_layout<A7XX>,
                                                  buffer[1].depth_clear_val));
         /* } else { */
         tu_cs_emit_pkt7(cs, CP_NOP, else_dwords);
         /* GRAS_LRZ_DEPTH_CLEAR = lrz_fc->buffer[0].depth_clear_val */
         tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
         tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_GRAS_LRZ_DEPTH_CLEAR));
         tu_cs_emit_qw(cs, lrz_fc_iova + offsetof(fd_lrzfc_layout<A7XX>,
                                                  buffer[0].depth_clear_val));
         /* } */
      }
   }
}
TU_GENX(tu_lrz_before_sysmem_br);

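The CP_COND_EXEC block above is effectively an if/else evaluated by the command processor: when br_cur_buffer is non-zero, GRAS_LRZ_DEPTH_CLEAR is loaded from buffer[1], otherwise from buffer[0] (the inline comments spell out the branches). A CPU-side sketch of that selection, with the fd_lrzfc_layout fields flattened into plain parameters; illustrative only, the real selection happens on the GPU:

   /* Sketch only: which stored depth clear value ends up in
    * GRAS_LRZ_DEPTH_CLEAR. */
   static inline float
   select_lrz_depth_clear(uint32_t br_cur_buffer, const float depth_clear_val[2])
   {
      /* if (br_cur_buffer != 0) use buffer[1], else buffer[0] */
      return depth_clear_val[br_cur_buffer != 0 ? 1 : 0];
   }
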
template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)

@ -635,8 +868,51 @@ tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (!image->lrz_layout.lrz_total_size)
      return;

   uint64_t lrz_iova = image->iova + image->lrz_layout.lrz_offset;

   /* Synchronize writes in BV with subsequent render passes against this
    * write in BR.
    */
   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));

      tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1);
      tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) |
                     CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL));

      tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
      tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BV));

      tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0);

      tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4);
      tu_cs_emit(cs, 0); /* BV count */
      tu_cs_emit(cs, CP_RESOURCE_LIST_BR_0_BR_COUNT(1) |
                     CP_RESOURCE_LIST_BR_0_OVERFLOW |
                     CP_RESOURCE_LIST_BR_0_OVERFLOW_ONCHIP_ADDR(TU_ONCHIP_CB_RESLIST_OVERFLOW));
      tu_cs_emit_qw(cs, lrz_iova);

      tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);

      tu7_thread_control(cs, CP_SET_THREAD_BR);

      tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP);
   }

   tu6_emit_lrz_buffer<CHIP>(cs, image);
   tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);

   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) |
                     CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE |
                     CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) |
                     CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
                     CP_EVENT_WRITE7_0_WRITE_ENABLED);
      tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW);
      tu_cs_emit(cs, 0); /* value */
   }
}
TU_GENX(tu_disable_lrz);

@ -51,6 +51,8 @@ struct tu_lrz_state
   bool color_written_with_z_test : 1;
   bool has_lrz_write_with_skipped_color_writes : 1;

   bool store : 1;

   enum tu_lrz_direction prev_direction;
};

@ -86,14 +88,29 @@ tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);

void
tu_lrz_cb_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

template <chip CHIP>
void
tu_lrz_after_bv(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

template <chip CHIP>
void
tu_lrz_before_tiles(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool use_cb);

template <chip CHIP>
void
tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

template <chip CHIP>
void
tu_lrz_before_sysmem_br(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

@ -38,8 +38,10 @@ tu_device_get_u_trace(struct tu_device *device);
/**
 * Queue-id's
 */
enum {
   DEFAULT_HW_QUEUE_ID,
enum tu_queue_id {
   BR_HW_QUEUE_ID,
   BV_HW_QUEUE_ID,

   /* Labels set via VK_EXT_debug_utils are in a separate track due to the
    * following part of the spec:
    * "An application may open a debug label region in one command buffer and

@ -67,6 +69,7 @@ enum tu_stage_id {
   SECONDARY_CMD_BUFFER_STAGE_ID,
   CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
   BINNING_STAGE_ID,
   CONCURRENT_BINNING_STAGE_ID,
   GMEM_STAGE_ID,
   BYPASS_STAGE_ID,
   BLIT_STAGE_ID,

@ -85,7 +88,8 @@ static const struct {
   const char *name;
   const char *desc;
} queues[] = {
   [DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
   [BR_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
   [BV_HW_QUEUE_ID] = {"GPU Queue 1", "Adreno Bin Visibility Queue"},
   [ANNOTATIONS_QUEUE_ID] = {"Annotations", "Annotations Queue"},
};

@ -99,6 +103,7 @@ static const struct {
   [SECONDARY_CMD_BUFFER_STAGE_ID] = { "Secondary Command Buffer" },
   [CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID] = { "Annotation", "Render Pass Command Buffer Annotation" },
   [BINNING_STAGE_ID] = { "Binning", "Perform Visibility pass and determine target bins" },
   [CONCURRENT_BINNING_STAGE_ID] = { "Concurrent Binning", "Perform concurrent Visibility pass and determine target bins" },
   [GMEM_STAGE_ID] = { "GMEM", "Rendering to GMEM" },
   [BYPASS_STAGE_ID] = { "Bypass", "Rendering to system memory" },
   [BLIT_STAGE_ID] = { "Blit", "Performing a Blit operation" },

@ -323,12 +328,17 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
      emit_sync_timestamp(clocks);
   }

   uint32_t queue_id = DEFAULT_HW_QUEUE_ID;
   uint32_t queue_id = BR_HW_QUEUE_ID;
   switch (stage->stage_id) {
   case CMD_BUFFER_ANNOTATION_STAGE_ID:
   case CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID:
      queue_id = ANNOTATIONS_QUEUE_ID;
      break;
   /* We only know dynamically whether concurrent binning was enabled. Just
    * assume it is and always make binning appear on the BV timeline.
    */
   case CONCURRENT_BINNING_STAGE_ID:
      queue_id = BV_HW_QUEUE_ID;
   default:
      break;
   }

@ -577,6 +587,7 @@ CREATE_EVENT_CALLBACK(cmd_buffer, CMD_BUFFER_STAGE_ID)
CREATE_EVENT_CALLBACK(secondary_cmd_buffer, SECONDARY_CMD_BUFFER_STAGE_ID)
CREATE_EVENT_CALLBACK(render_pass, RENDER_PASS_STAGE_ID)
CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(concurrent_binning_ib, CONCURRENT_BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)

@ -1099,6 +1099,9 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
   bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
   cmdbuf->state.prim_counters_running++;

   if (cmdbuf->state.pass)
      cmdbuf->state.rp.has_vtx_stats_query_in_rp = true;

   /* Prevent starting primitive counters when it is supposed to be stopped
    * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
    */

@ -1110,9 +1113,26 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,

   tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
   tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
   tu_cs_emit(cs, 0);
   if (CHIP >= A7XX) {
      /* We need the predicate for determining whether to enable CB, so set
       * it for both BR and BV.
       */
      if (!cmdbuf->state.pass) {
         tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
         tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
      }
      tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
                            (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
                        (1u << TU_PREDICATE_VTX_STATS_RUNNING));
      if (!cmdbuf->state.pass) {
         tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
         tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
      }
   } else {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
      tu_cs_emit(cs, 0);
   }

   if (need_cond_exec) {
      tu_cond_exec_end(cs);

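tu7_set_pred_mask() itself is outside this diff; the calls above read naturally if it has update-under-mask semantics, i.e. only the bits selected by the mask change and they take the corresponding bits of the value, so setting VTX_STATS_RUNNING here simultaneously clears VTX_STATS_NOT_RUNNING. A sketch of that assumed behaviour:

   /* Sketch only, assuming mask/value semantics for the predicate bits:
    * bits selected by `mask` take the corresponding bit from `value`,
    * all other predicate bits are preserved. */
   static inline uint32_t
   apply_pred_mask(uint32_t pred_bits, uint32_t mask, uint32_t value)
   {
      return (pred_bits & ~mask) | (value & mask);
   }
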
@ -1312,6 +1332,9 @@ emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,

   tu_cs_emit_regs(cs, A6XX_VPC_SO_QUERY_BASE(.qword = begin_iova));
   tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);

   if (!cmdbuf->state.pass)
      cmdbuf->state.xfb_query_running_before_rp = true;
}

template <chip CHIP>

@ -1545,24 +1568,39 @@ emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
   if (!need_cond_exec) {
      tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
   } else {
      tu_cs_reserve(cs, 7 + 2);
      /* Check that the pipeline stats query is not running, only then
       * we can stop the counter.
       */
      tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
      tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
      tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
      if (CHIP >= A7XX) {
         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
                                CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_VTX_STATS_NOT_RUNNING));
         tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
         tu_cond_exec_end(cs);
      } else {
         tu_cs_reserve(cs, 7 + 2);

         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */

         tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
      }

      tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
   }
}

if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
   tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
   tu_cs_emit(cs, 1);
   if (CHIP >= A7XX) {
      tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
                            (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
                        (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING));
   } else {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
      tu_cs_emit(cs, 1);
   }
}

@ -1822,6 +1860,9 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
   uint64_t available_iova = query_available_iova(pool, query);

   if (!cmdbuf->state.pass)
      cmdbuf->state.xfb_query_running_before_rp = false;

   tu_cs_emit_regs(cs, A6XX_VPC_SO_QUERY_BASE(.qword = end_iova));
   tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);

@ -104,7 +104,7 @@ get_vis_stream_patchpoint_cs(struct tu_cmd_buffer *cmd,
   /* See below for the commands emitted to the CS. */
   uint32_t cs_size = 5 *
      util_dynarray_num_elements(&cmd->vis_stream_patchpoints,
                                 struct tu_vis_stream_patchpoint) + 6;
                                 struct tu_vis_stream_patchpoint) + 4 + 6;

   util_dynarray_foreach (&cmd->vis_stream_cs_bos,
                          struct tu_vis_stream_patchpoint_cs,

@ -165,8 +165,11 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
   struct tu_device *dev = queue->device;

   uint32_t max_size = 0;
   for (unsigned i = 0; i < cmdbuf_count; i++)
   uint32_t rp_count = 0;
   for (unsigned i = 0; i < cmdbuf_count; i++) {
      max_size = MAX2(max_size, cmd_buffers[i]->vsc_size);
      rp_count += cmd_buffers[i]->state.tile_render_pass_count;
   }

   if (max_size == 0)
      return VK_SUCCESS;

@ -174,17 +177,32 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
   struct tu_bo *bo = NULL;
   VkResult result = VK_SUCCESS;

   /* Note, we want to make the vis stream count at least 1 because a
    * BV_BR_OFFSET of 0 can lead to hangs even if not using visibility
    * streams and therefore should be avoided.
    */
   uint32_t min_vis_stream_count =
      (TU_DEBUG(NO_CONCURRENT_BINNING) || dev->physical_device->info->chip < 7) ?
      1 : MIN2(MAX2(rp_count, 1), TU_MAX_VIS_STREAMS);
   uint32_t vis_stream_count;

   mtx_lock(&dev->vis_stream_mtx);

   if (!dev->vis_stream_bo || max_size > dev->vis_stream_bo->size) {
   if (!dev->vis_stream_bo || max_size > dev->vis_stream_size ||
       min_vis_stream_count > dev->vis_stream_count) {
      dev->vis_stream_count = MAX2(dev->vis_stream_count,
                                   min_vis_stream_count);
      dev->vis_stream_size = MAX2(dev->vis_stream_size, max_size);
      if (dev->vis_stream_bo)
         tu_bo_finish(dev, dev->vis_stream_bo);
      result = tu_bo_init_new(dev, &dev->vk.base, &dev->vis_stream_bo,
                              max_size, TU_BO_ALLOC_INTERNAL_RESOURCE,
                              dev->vis_stream_size * dev->vis_stream_count,
                              TU_BO_ALLOC_INTERNAL_RESOURCE,
                              "visibility stream");
   }

   bo = dev->vis_stream_bo;
   vis_stream_count = dev->vis_stream_count;

   mtx_unlock(&dev->vis_stream_mtx);

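A side note on the reallocation above: the cached per-stream size and stream count only ever grow, and the BO is sized as their product, so a later submit with a smaller requirement simply reuses the existing BO. A compact, illustrative restatement of that sizing policy (MAX2 comes from mesa's util/macros.h; the helper name is made up):

   #include "util/macros.h"

   /* Sketch only: grow-only sizing for the shared visibility-stream BO.
    * Returns the byte size to allocate after folding in a new request. */
   static inline uint64_t
   vis_stream_bo_bytes(uint32_t *size, uint32_t *count,
                       uint32_t requested_size, uint32_t requested_count)
   {
      *size = MAX2(*size, requested_size);
      *count = MAX2(*count, requested_count);
      return (uint64_t)*size * *count;
   }
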
@ -210,6 +228,8 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
      }
   }

   unsigned render_pass_idx = queue->render_pass_idx;

   for (unsigned i = 0; i < cmdbuf_count; i++) {
      struct tu_cs cs, sub_cs;
      uint64_t fence_iova = 0;

@ -224,7 +244,11 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
      util_dynarray_foreach (&cmd_buffers[i]->vis_stream_patchpoints,
                             struct tu_vis_stream_patchpoint,
                             patchpoint) {
         uint64_t final_iova = bo->iova + patchpoint->offset;
         unsigned vis_stream_idx =
            (render_pass_idx + patchpoint->render_pass_idx) %
            vis_stream_count;
         uint64_t final_iova =
            bo->iova + vis_stream_idx * max_size + patchpoint->offset;

         if (cmd_buffers[i]->usage_flags &
             VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {

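In other words, the shared BO is treated as vis_stream_count slots of max_size bytes each, tiled render passes are assigned slots round-robin across submissions, and every patchpoint is rebased into its pass's slot. An illustrative helper with the same arithmetic as the lines above (the function itself does not exist in the driver):

   /* Sketch only: round-robin slot selection for a patchpoint's final iova. */
   static inline uint64_t
   vis_stream_patch_iova(uint64_t bo_iova, uint32_t slot_size /* max_size */,
                         uint32_t vis_stream_count,
                         unsigned queue_render_pass_idx,
                         unsigned patchpoint_render_pass_idx,
                         uint64_t patchpoint_offset)
   {
      unsigned vis_stream_idx =
         (queue_render_pass_idx + patchpoint_render_pass_idx) % vis_stream_count;
      return bo_iova + (uint64_t)vis_stream_idx * slot_size + patchpoint_offset;
   }
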
@ -237,6 +261,19 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
         }
      }

      struct tu_vis_stream_patchpoint *count_patchpoint =
         &cmd_buffers[i]->vis_stream_count_patchpoint;
      if (count_patchpoint->data) {
         if (cmd_buffers[i]->usage_flags &
             VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
            tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 3);
            tu_cs_emit_qw(&sub_cs, count_patchpoint->iova);
            tu_cs_emit(&sub_cs, vis_stream_count);
         } else {
            count_patchpoint->data[0] = vis_stream_count;
         }
      }

      if (cmd_buffers[i]->usage_flags &
          VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
         tu_cs_emit_pkt7(&sub_cs, CP_WAIT_MEM_WRITES, 0);

@ -250,8 +287,12 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
         struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);
         submit_add_entries(queue->device, submit, dump_cmds, &entry, 1);
      }

      render_pass_idx += cmd_buffers[i]->state.tile_render_pass_count;
   }

   queue->render_pass_idx = render_pass_idx;

   return VK_SUCCESS;
}

@ -33,6 +33,8 @@ struct tu_queue
   uint32_t sparse_syncobj, gfx_syncobj;
   uint64_t sparse_timepoint, gfx_timepoint;

   unsigned render_pass_idx;

   int fence; /* timestamp/fence of the last queue submission */
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

@ -135,6 +135,7 @@ begin_end_tp('draw',
             ], tp_default_enabled=False)

begin_end_tp('binning_ib')
begin_end_tp('concurrent_binning_ib')
begin_end_tp('draw_ib_sysmem')
begin_end_tp('draw_ib_gmem')

@ -54,6 +54,8 @@ static const struct debug_control tu_debug_options[] = {
   { "check_cmd_buffer_status", TU_DEBUG_CHECK_CMD_BUFFER_STATUS },
   { "comm", TU_DEBUG_COMM },
   { "nofdm", TU_DEBUG_NOFDM },
   { "nocb", TU_DEBUG_NO_CONCURRENT_BINNING },
   { "forcecb", TU_DEBUG_FORCE_CONCURRENT_BINNING },
   { NULL, 0 }
};

@ -73,6 +73,8 @@ enum tu_debug_flags : uint64_t
   TU_DEBUG_CHECK_CMD_BUFFER_STATUS = BITFIELD64_BIT(32),
   TU_DEBUG_COMM = BITFIELD64_BIT(33),
   TU_DEBUG_NOFDM = BITFIELD64_BIT(34),
   TU_DEBUG_NO_CONCURRENT_BINNING = BITFIELD64_BIT(35),
   TU_DEBUG_FORCE_CONCURRENT_BINNING = BITFIELD64_BIT(36),
};

struct tu_env {

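Like the existing entries in tu_debug_options, the new nocb and forcecb switches are selected by listing them, comma-separated, in the TU_DEBUG environment variable. A sketch of the usual way such a table is turned into a flag mask with mesa's u_debug helper; whether turnip parses it at exactly this point is not shown in the diff:

   #include <stdlib.h>
   #include "util/u_debug.h"

   /* Sketch only: parse TU_DEBUG into a tu_debug_flags bitmask using the
    * tu_debug_options table above. */
   static uint64_t
   tu_parse_debug_env(void)
   {
      return parse_debug_string(getenv("TU_DEBUG"), tu_debug_options);
   }
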