diff --git a/src/freedreno/ci/deqp-freedreno-a750-vk.toml b/src/freedreno/ci/deqp-freedreno-a750-vk.toml
index 8f86b618bad..3e5e0205be9 100644
--- a/src/freedreno/ci/deqp-freedreno-a750-vk.toml
+++ b/src/freedreno/ci/deqp-freedreno-a750-vk.toml
@@ -7,6 +7,7 @@ tests_per_group = 10000

 # force-gmem testing
 # Autotuner forces sysmem on most CTS tests
+# Also force-enable concurrent binning.
 [[deqp]]
 deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk"
 caselists = ["/deqp-vk/mustpass/vk-main.txt"]
@@ -14,7 +15,7 @@ prefix = "gmem-"
 fraction = 20
 tests_per_group = 10000
 [deqp.env]
-TU_DEBUG = "gmem,forcebin"
+TU_DEBUG = "gmem,forcebin,forcecb"

 # force-gmem with unaligned gmem store testing
 [[deqp]]
diff --git a/src/freedreno/common/freedreno_gpu_event.h b/src/freedreno/common/freedreno_gpu_event.h
index 562a7f33d39..76aff320582 100644
--- a/src/freedreno/common/freedreno_gpu_event.h
+++ b/src/freedreno/common/freedreno_gpu_event.h
@@ -42,6 +42,7 @@ enum fd_gpu_event : uint32_t {
    FD_CCU_CLEAN_DEPTH,
    FD_CCU_CLEAN_COLOR,
    FD_LRZ_CLEAR,
+   FD_LRZ_FLIP,
    FD_LRZ_FLUSH,
    FD_LRZ_INVALIDATE,
    FD_VSC_BINNING_START,
@@ -84,6 +85,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events[FD_GPU_EVENT_MAX]
    {PC_CCU_FLUSH_DEPTH_TS, true}, /* FD_CCU_CLEAN_DEPTH */
    {PC_CCU_FLUSH_COLOR_TS, true}, /* FD_CCU_CLEAN_COLOR */
    {LRZ_CLEAR, false}, /* FD_LRZ_CLEAR */
+   {LRZ_FLUSH, false}, /* FD_LRZ_FLIP */
    {LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
    {LRZ_CACHE_INVALIDATE, false}, /* FD_LRZ_INVALIDATE */
    {VSC_BINNING_START, false}, /* FD_VSC_BINNING_START */
@@ -115,6 +117,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events[FD_GPU_EVENT_MAX]
    {CCU_CLEAN_DEPTH, false}, /* FD_CCU_CLEAN_DEPTH */
    {CCU_CLEAN_COLOR, false}, /* FD_CCU_CLEAN_COLOR */
    {LRZ_CLEAR, false}, /* FD_LRZ_CLEAR */
+   {LRZ_FLIP_BUFFER, false}, /* FD_LRZ_FLIP */
    {LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
    {LRZ_CACHE_INVALIDATE, false}, /* FD_LRZ_INVALIDATE */
    {VSC_BINNING_START, false}, /* FD_VSC_BINNING_START */
diff --git a/src/freedreno/fdl/freedreno_lrz_layout.h b/src/freedreno/fdl/freedreno_lrz_layout.h
index 286e7715637..2eb6e4274be 100644
--- a/src/freedreno/fdl/freedreno_lrz_layout.h
+++ b/src/freedreno/fdl/freedreno_lrz_layout.h
@@ -80,7 +80,10 @@ fdl6_lrz_layout_init(struct fdl_lrz_layout *lrz_layout,
       lrz_layout->lrz_fc_size = 0;
    }

-   uint32_t lrz_size = lrz_layout->lrz_buffer_size;
+   /* Allocate 2 LRZ buffers for double-buffering on a7xx. */
+   uint32_t lrz_size = lrz_layout->lrz_buffer_size *
+      (dev_info->chip >= 7 ? 2 : 1);
+
    if (dev_info->a6xx.enable_lrz_fast_clear ||
        dev_info->a6xx.has_lrz_dir_tracking) {
       lrz_layout->lrz_fc_offset =
diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index 99d3d5dd0c9..cd5f9dc94a2 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -2133,18 +2133,22 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
     */
    tu_emit_event_write(cmd, &cmd->cs, FD_CACHE_CLEAN);

-   ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
-              VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
-              VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT);
-   ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
-   ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
-                   image->iova + image->lrz_layout.lrz_offset,
-                   image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
-   uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
-   ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
-               (VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
-   ops->run(cmd, cs);
-   ops->teardown(cmd, cs);
+   const unsigned lrz_buffers = CHIP >= A7XX ? 2 : 1;
+   for (unsigned i = 0; i < lrz_buffers; i++) {
+      ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
+                 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
+                 VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT);
+      ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
+      ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
+                      image->iova + image->lrz_layout.lrz_offset +
+                         i * image->lrz_layout.lrz_buffer_size,
+                      image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
+      uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
+      ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
+                  (VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
+      ops->run(cmd, cs);
+      ops->teardown(cmd, cs);
+   }

    /* Clearing writes via CCU color in the PS stage, and LRZ is read via
     * UCHE in the earlier GRAS stage.
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index d1cfe8c364d..ab5d75b45bf 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -220,6 +220,7 @@ tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t offset) { struct tu_vis_stream_patchpoint patchpoint = { + .render_pass_idx = cmd->state.tile_render_pass_count, .data = cs->cur, .iova = tu_cs_get_cur_iova(cs), .offset = offset, @@ -339,12 +340,72 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); } +static void +tu7_write_onchip_val(struct tu_cs *cs, enum tu_onchip_addr addr, + uint32_t val) +{ + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) | + CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) | + CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + CP_EVENT_WRITE7_0_WRITE_ENABLED); + tu_cs_emit_qw(cs, addr); + tu_cs_emit(cs, val); +} + /* "Normal" cache flushes outside the renderpass, that don't require any special handling */ template void tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer) { - tu6_emit_flushes(cmd_buffer, &cmd_buffer->cs, &cmd_buffer->state.cache); + struct tu_cs *cs = &cmd_buffer->cs; + struct tu_cache_state *cache = &cmd_buffer->state.cache; + BITMASK_ENUM(tu_cmd_flush_bits) flushes = cache->flush_bits; + + tu6_emit_flushes(cmd_buffer, cs, cache); + + if ((flushes & TU_CMD_FLAG_WAIT_FOR_BR) && CHIP >= A7XX) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH)); + + tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1); + tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) | + CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL)); + + tu7_thread_control(cs, CP_SET_THREAD_BV); + + tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + + tu7_thread_control(cs, CP_SET_THREAD_BR); + + /* Wait for the previous WAIT_FOR_BR to execute on BV and reset the wait + * value. + */ + tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + + /* Signal the wait value. */ + tu7_write_onchip_val(cs, TU_ONCHIP_BARRIER, 1); + + tu7_thread_control(cs, CP_SET_THREAD_BV); + + /* Wait for the value. Note that we must use CP_WAIT_REG_MEM due to a + * firmware bug which makes CP_WAIT_TIMESTAMP on BV deadlock with + * preemption when BV waits for BR. Without this bug the whole thing + * would be much, much simpler. + */ + tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 1); + + /* Reset the wait value. */ + tu7_write_onchip_val(cs, TU_ONCHIP_BARRIER, 0); + + /* Resetting the wait value happens asynchronously (since it's an + * EVENT_WRITE), but waiting for it happens synchronously. We need to + * prevent BV from racing ahead to the next wait before it's reset. 
+ */ + tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 0); + + tu7_thread_control(cs, CP_SET_THREAD_BR); + } } TU_GENX(tu_emit_cache_flush); @@ -356,8 +417,11 @@ tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer) if (!cmd_buffer->state.renderpass_cache.flush_bits && likely(!tu_env.debug)) return; - tu6_emit_flushes(cmd_buffer, &cmd_buffer->draw_cs, - &cmd_buffer->state.renderpass_cache); + + struct tu_cs *cs = &cmd_buffer->draw_cs; + struct tu_cache_state *cache = &cmd_buffer->state.renderpass_cache; + + tu6_emit_flushes(cmd_buffer, cs, cache); if (cmd_buffer->state.renderpass_cache.flush_bits & TU_CMD_FLAG_BLIT_CACHE_CLEAN) { cmd_buffer->state.blit_cache_cleaned = true; @@ -491,7 +555,7 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, (CHIP == A6XX ? TU_CMD_FLAG_WAIT_FOR_IDLE : 0)); } - tu6_emit_flushes(cmd_buffer, cs, &cmd_buffer->state.cache); + tu_emit_cache_flush(cmd_buffer); if (ccu_state != cmd_buffer->state.ccu_state) { emit_rb_ccu_cntl(cs, cmd_buffer->device, @@ -2116,6 +2180,25 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } if (CHIP >= A7XX) { + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, true); + + /* This sets the amount BV is allowed to be ahead of BR when we do + * BV_WAIT_FOR_BR. By setting it based on the vis stream count we + * prevent write-after-read races with the vis stream. + */ + tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 2); + tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_SET_BR_OFFSET)); + + struct tu_vis_stream_patchpoint *patchpoint = + &cmd->vis_stream_count_patchpoint; + patchpoint->data = cs->cur; + patchpoint->iova = tu_cs_get_cur_iova(cs); + tu_cs_emit(cs, 1); + + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, false); + tu7_thread_control(cs, CP_SET_THREAD_BR); } @@ -2137,6 +2220,10 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE)); tu7_thread_control(cs, CP_SET_THREAD_BOTH); + + tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) | + (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING), + (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING)); } tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3); @@ -2238,7 +2325,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) template static void tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - const VkOffset2D *fdm_offsets) + const VkOffset2D *fdm_offsets, bool use_cb) { struct tu_physical_device *phys_dev = cmd->device->physical_device; const struct tu_framebuffer *fb = cmd->state.framebuffer; @@ -2336,12 +2423,18 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_regs(cs, A6XX_TPL1_WINDOW_OFFSET(.x = 0, .y = 0)); - trace_start_binning_ib(&cmd->trace, cs, cmd); + if (use_cb) + trace_start_concurrent_binning_ib(&cmd->trace, cs, cmd); + else + trace_start_binning_ib(&cmd->trace, cs, cmd); /* emit IB to binning drawcmds: */ tu_cs_emit_call(cs, &cmd->draw_cs); - trace_end_binning_ib(&cmd->trace, cs); + if (use_cb) + trace_end_concurrent_binning_ib(&cmd->trace, cs); + else + trace_end_binning_ib(&cmd->trace, cs); /* switching from binning pass to GMEM pass will cause a switch from * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states) @@ -2667,6 +2760,46 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd) cmd->state.fdm_enabled = cmd->state.pass->has_fdm; } +static bool +tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + bool disable_cb) +{ + if 
(disable_cb || + /* LRZ can only be cleared via fast clear in BV. Disable CB if we can't + * use it. + */ + !cmd->state.lrz.fast_clear || + TU_DEBUG(NO_CONCURRENT_BINNING)) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); + tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false); + return false; + } + tu7_thread_control(cs, CP_SET_THREAD_BOTH); + + /* Increment timestamp to make it unique in subsequent commands */ + tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1); + tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) | + CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL)); + + /* We initialize the "is concurrent binning enabled?" predicate to true and + * disable it later if necessary. + */ + tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, true); + + tu7_thread_control(cs, CP_SET_THREAD_BV); + + /* If there was an overflow in the BR resource table the register will be + * set to 1 by CP_RESOURCE_LIST. Wait for it to clear here. + */ + tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0); + + tu_lrz_cb_begin(cmd, cs); + + return true; +} + template static void tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2674,8 +2807,40 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, { const struct tu_framebuffer *fb = cmd->state.framebuffer; + /* It seems that for sysmem render passes we have to use BV to clear LRZ + * before the renderpass. Otherwise the clear doesn't become visible to + * subsequent draws when LRZ has been flipped an odd number of times. + * Presumably this works if concurrent binning is disabled, because the + * blob relies on this, but that requires synchronizing BR and BV + * unnecessarily, and we want BV to skip ahead across sysmem renderpasses. + * + * In the future, we may also support writing LRZ in BV. + */ + bool concurrent_binning = false; + if (CHIP >= A7XX) { + concurrent_binning = tu7_emit_concurrent_binning(cmd, cs, false); + + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY)); + } + tu_lrz_sysmem_begin(cmd, cs); + if (concurrent_binning) { + tu_lrz_after_bv(cmd, cs); + + tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM7_BIN_VISIBILITY_END)); + + tu7_thread_control(cs, CP_SET_THREAD_BR); + + tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + + tu_lrz_before_sysmem_br(cmd, cs); + } + assert(fb->width > 0 && fb->height > 0); tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); tu6_emit_window_offset(cs, 0, 0); @@ -2758,9 +2923,155 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_lrz_sysmem_end(cmd, cs); + /* Clear the resource list for any LRZ resources we emitted at the + * beginning. 
+ */ + if (CHIP >= A7XX) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE | + CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) | + CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) | + CP_EVENT_WRITE7_0_WRITE_ENABLED); + tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW); + tu_cs_emit(cs, 0); /* value */ + } + tu_cs_sanity_check(cs); } +static void +tu7_write_and_wait_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr) +{ + tu7_write_onchip_timestamp(cs, onchip_addr); + tu7_wait_onchip_timestamp(cs, onchip_addr); +} + +static bool +tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + bool use_hw_binning) +{ + /* xfb queries use data from the binning pass. If they are running outside + * of a RP then we may have to deal with a mix of GMEM/sysmem renderpasses + * where the counters increase on different processors. Just disable CB so + * that everything happens on BR and we don't need difficult merging of BV + * and BR results. In addition, RBBM primitive counters seem to not work + * at all with concurrent binning, so disable if they are running before + * the RP. + */ + bool disable_cb = + cmd->state.xfb_query_running_before_rp || + cmd->state.rp.has_prim_generated_query_in_rp || + cmd->state.rp.has_vtx_stats_query_in_rp || + cmd->state.prim_counters_running > 0; + + + if (!tu7_emit_concurrent_binning(cmd, cs, disable_cb || !use_hw_binning)) + return false; + + /* We want to disable concurrent binning if BV isn't far enough ahead of + * BR. The core idea is to write a timestamp in BR and BV, and compare the + * BR and BV timestamps for equality. if BR is fast enough, it will write + * the timestamp ahead of BV and then when BV compares for equality it will + * find them equal. BR cannot race too far ahead of BV because it must wait + * for BV's determination to finish, which we do via another timestamp, so + * either BV is ahead of BR or the timestamps are equal. + * + * We need to communicate the determination from BV to BR so they both + * agree on whether concurrent binning is enabled or not. The easiest way + * to do it is via a "when was concurrent binning last disabled" timestamp, + * because we only have to set it when disabling concurrent binning. + */ + + if (!TU_DEBUG(FORCE_CONCURRENT_BINNING)) { + tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + + tu7_thread_control(cs, CP_SET_THREAD_BR); + tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BR_TIMESTAMP); + + tu7_thread_control(cs, CP_SET_THREAD_BV); + + /* If in a secondary, dynamically disable CB if a vtx stats query is + * running. 
+ */ + if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) | + CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_VTX_STATS_RUNNING)); + } + + const uint32_t bv_cond_dwords = 3 + 4 + 4; + tu_cs_reserve(cs, 4 + bv_cond_dwords); + + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 3); + tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(REG_COMPARE) | + CP_COND_REG_EXEC_0_REG0(TU_ONCHIP_CB_BR_TIMESTAMP) | + CP_COND_REG_EXEC_0_ONCHIP_MEM); + tu_cs_emit(cs, REG_COMPARE_CP_COND_REG_EXEC_1_REG1(TU_ONCHIP_CB_BV_TIMESTAMP) | + REG_COMPARE_CP_COND_REG_EXEC_1_ONCHIP_MEM); + tu_cs_emit(cs, bv_cond_dwords); + if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) + tu_cond_exec_end(cs); + /* if (BR_TIMESTAMP == BV_TIMESTAMP) */ { + tu7_write_and_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP); + tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false); + } + tu7_write_onchip_timestamp(cs, + TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP); + + tu7_thread_control(cs, CP_SET_THREAD_BR); + + tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP); + + const uint32_t br_cond_dwords = 4; + tu_cs_reserve(cs, 4 + br_cond_dwords); + + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 3); + tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(REG_COMPARE) | + CP_COND_REG_EXEC_0_REG0(TU_ONCHIP_CB_BR_TIMESTAMP) | + CP_COND_REG_EXEC_0_ONCHIP_MEM); + tu_cs_emit(cs, REG_COMPARE_CP_COND_REG_EXEC_1_REG1(TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP) | + REG_COMPARE_CP_COND_REG_EXEC_1_ONCHIP_MEM); + tu_cs_emit(cs, br_cond_dwords); + /* if (BR_TIMESTAMP == BV_DISABLED_TIMESTAMP) */ { + tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false); + } + } + + /* At this point BV and BR are agreed on whether CB is enabled. If CB is + * enabled, set the thread to BV for the binning pass, otherwise set BR and + * disable concurrent binning. + */ + tu7_thread_control(cs, CP_SET_THREAD_BOTH); + + const uint32_t if_dwords = 5; + const uint32_t else_dwords = 2; + tu_cs_reserve(cs, 3 + if_dwords + else_dwords); + + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); + tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) | + CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED) | + CP_COND_REG_EXEC_0_SKIP_WAIT_FOR_ME); + tu_cs_emit(cs, if_dwords); + /* if (CB is enabled) */ { + tu7_thread_control(cs, CP_SET_THREAD_BV); + + /* Wait for BR vis stream reads to finish */ + tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 1); + tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_BV_WAIT_FOR_BR)); + + /* This is the NOP-as-else trick. If CB is disabled, this CP_NOP is + * skipped and its body (the else) is executed. + */ + tu_cs_emit_pkt7(cs, CP_NOP, else_dwords); + } /* else */ { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); + } + + return true; +} + template static void tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -2771,31 +3082,38 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_tiling_config *tiling = cmd->state.tiling; const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); const struct tu_render_pass *pass = cmd->state.pass; + bool use_binning = use_hw_binning(cmd); - tu_lrz_tiling_begin(cmd, cs); + /* User flushes should always be executed on BR. 
*/ + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0x0); + bool use_cb = false; if (CHIP >= A7XX) { tu7_emit_tile_render_begin_regs(cs); + use_cb = tu7_emit_concurrent_binning_gmem(cmd, cs, use_binning); } + if (!use_cb) + tu_trace_start_render_pass(cmd); + + tu_lrz_tiling_begin(cmd, cs); + + /* tu_lrz_tiling_begin() can accumulate additional flushes. If that happens + * CB should be disabled, so it's safe to just emit them here. + */ + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); + + tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + tu_cs_emit(cs, 0x0); + /* Reset bin scaling. */ if (phys_dev->info->a7xx.has_hw_bin_scaling) { tu_cs_emit_regs(cs, A7XX_GRAS_BIN_FOVEAT()); tu_cs_emit_regs(cs, A7XX_RB_BIN_FOVEAT()); } - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); - - if (CHIP >= A7XX) { - tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); - tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | - CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); - } - - if (use_hw_binning(cmd)) { + if (use_binning) { if (!cmd->vsc_initialized) { tu6_lazy_init_vsc(cmd); } @@ -2833,7 +3151,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true); - tu6_emit_binning_pass(cmd, cs, fdm_offsets); + tu6_emit_binning_pass(cmd, cs, fdm_offsets, use_cb); if (CHIP == A6XX) { tu_cs_emit_regs(cs, @@ -2897,6 +3215,40 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + if (use_binning) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BV)); + + tu_lrz_after_bv(cmd, cs); + + /* Signal that BV is done for this render pass. This always has to + * be executed, even when CB is dynamically disabled, because we + * need to keep BR and BV counts in sync with which visibility + * streams are in use. + */ + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + CP_EVENT_WRITE7_0_INC_BV_COUNT); + + /* This mode seems to be only used by BV and signals that a + * simpler save/restore procedure can be used in between render + * passes. + */ + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM7_BIN_VISIBILITY_END)); + } + + tu7_thread_control(cs, CP_SET_THREAD_BR); + + if (use_binning) { + /* Wait for the BV to be done for this render pass. */ + tu_cs_emit_pkt7(cs, CP_BV_BR_COUNT_OPS, 1); + tu_cs_emit(cs, CP_BV_BR_COUNT_OPS_0_OP(PIPE_BR_WAIT_FOR_BV)); + + /* Emit vis stream on BR */ + tu_emit_vsc(cmd, cs); + } + tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4); tu_cs_emit(cs, num_vsc_pipes); /* count */ tu_cs_emit(cs, 0); /* offset */ @@ -2906,8 +3258,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (CHIP >= A7XX && (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) tu_cs_set_writeable(cs, false); + } else if (CHIP >= A7XX) { + /* Earlier we disabled concurrent binning to make LRZ fast-clear work + * with no HW binning, now re-enable it while staying on BR. 
+ */ + tu7_thread_control(cs, CP_SET_THREAD_BR); } + tu_lrz_before_tiles(cmd, cs, use_cb); + + if (use_cb) + tu_trace_start_render_pass(cmd); + tu_autotune_begin_renderpass(cmd, cs, autotune_result); tu_cs_sanity_check(cs); @@ -2982,12 +3344,30 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_lrz_tiling_end(cmd, cs); - tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE); - - if (CHIP >= A7XX) { - tu7_thread_control(cs, CP_SET_THREAD_BR); + bool hw_binning = use_hw_binning(cmd); + if (hw_binning) { + cmd->state.tile_render_pass_count++; } + /* If we are using HW binning, signal that we are done with reading the vis + * stream for this render pass by advancing the counter. Also clear render + * resources, currently only used for LRZ, and reset the overflow onchip + * register. + */ + if (CHIP >= A7XX) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + COND(hw_binning, CP_EVENT_WRITE7_0_INC_BR_COUNT) | + CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE | + CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) | + CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) | + CP_EVENT_WRITE7_0_WRITE_ENABLED); + tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW); + tu_cs_emit(cs, 0); /* value */ + } + + tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE); + tu_cs_sanity_check(cs); } @@ -3354,8 +3734,6 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_emit_tile_store_cs(cmd, &cmd->tile_store_cs); tu_cs_end(&cmd->tile_store_cs); - tu_trace_start_render_pass(cmd); - tu6_tile_render_begin(cmd, &cmd->cs, autotune_result, fdm_offsets); /* Note: we reverse the order of walking the pipes and tiles on every @@ -5370,7 +5748,8 @@ sanitize_dst_stage(VkPipelineStageFlags2 stage_mask) } static enum tu_stage -vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst) +vk2tu_single_stage(struct tu_device *dev, + VkPipelineStageFlags2 vk_stage, bool dst) { /* If the destination stage is executed on the CP, then the CP also has to * wait for any WFI's to finish. This is already done for draw calls, @@ -5394,24 +5773,40 @@ vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst) if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT || vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT || vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT) - return TU_STAGE_CP; + return TU_STAGE_BV_CP; if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT || vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT) - return dst ? TU_STAGE_CP : TU_STAGE_GPU; + return dst ? TU_STAGE_BV_CP : TU_STAGE_BR; if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT) - return dst ? TU_STAGE_BOTTOM : TU_STAGE_CP; + return dst ? TU_STAGE_BOTTOM : TU_STAGE_BV_CP; - return TU_STAGE_GPU; + if (dev->physical_device->info->chip >= 7) { + if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT || + vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT || + vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT || + vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT || + vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT || + vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT || + vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT || + vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT || + vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT || + vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) { + return dst ? 
TU_STAGE_BV : TU_STAGE_BR; + } + } + + return TU_STAGE_BR; } static enum tu_stage -vk2tu_src_stage(VkPipelineStageFlags2 vk_stages) +vk2tu_src_stage(struct tu_device *dev, + VkPipelineStageFlags2 vk_stages) { - enum tu_stage stage = TU_STAGE_CP; + enum tu_stage stage = TU_STAGE_BV_CP; u_foreach_bit64 (bit, vk_stages) { - enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); + enum tu_stage new_stage = vk2tu_single_stage(dev, 1ull << bit, false); stage = MAX2(stage, new_stage); } @@ -5419,11 +5814,12 @@ vk2tu_src_stage(VkPipelineStageFlags2 vk_stages) } static enum tu_stage -vk2tu_dst_stage(VkPipelineStageFlags2 vk_stages) +vk2tu_dst_stage(struct tu_device *dev, + VkPipelineStageFlags2 vk_stages) { enum tu_stage stage = TU_STAGE_BOTTOM; u_foreach_bit64 (bit, vk_stages) { - enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); + enum tu_stage new_stage = vk2tu_single_stage(dev, 1ull << bit, true); stage = MIN2(stage, new_stage); } @@ -5437,14 +5833,17 @@ tu_flush_for_stage(struct tu_cache_state *cache, /* Even if the source is the host or CP, the destination access could * generate invalidates that we have to wait to complete. */ - if (src_stage == TU_STAGE_CP && + if (src_stage < TU_STAGE_BR && (cache->flush_bits & TU_CMD_FLAG_ALL_INVALIDATE)) - src_stage = TU_STAGE_GPU; + src_stage = TU_STAGE_BR; if (src_stage >= dst_stage) { cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; - if (dst_stage == TU_STAGE_CP) - cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; + if (dst_stage <= TU_STAGE_BV) { + cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_BR; + if (dst_stage == TU_STAGE_BV_CP) + cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; + } } } @@ -5455,6 +5854,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst, dst->xfb_used |= src->xfb_used; dst->has_tess |= src->has_tess; dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp; + dst->has_vtx_stats_query_in_rp |= src->has_vtx_stats_query_in_rp; dst->has_zpass_done_sample_count_write_in_rp |= src->has_zpass_done_sample_count_write_in_rp; dst->disable_gmem |= src->disable_gmem; dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode; @@ -5653,6 +6053,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, secondary_patchpoint) { struct tu_vis_stream_patchpoint patchpoint = *secondary_patchpoint; + patchpoint.render_pass_idx += cmd->state.tile_render_pass_count; if (simultaneous_use) { tu_cs_reserve_space(cs, 5); @@ -5682,6 +6083,8 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, } } + cmd->state.tile_render_pass_count += + secondary->state.tile_render_pass_count; cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size); switch (secondary->state.suspend_resume) { @@ -5844,8 +6247,8 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, tu_flush_for_access(cache, src_flags, dst_flags); - enum tu_stage src_stage = vk2tu_src_stage(src_stage_vk); - enum tu_stage dst_stage = vk2tu_dst_stage(dst_stage_vk); + enum tu_stage src_stage = vk2tu_src_stage(cmd_buffer->device, src_stage_vk); + enum tu_stage dst_stage = vk2tu_dst_stage(cmd_buffer->device, dst_stage_vk); tu_flush_for_stage(cache, src_stage, dst_stage); } @@ -5975,6 +6378,10 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol struct tu_cs *cs = &cmd->draw_cs; uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses; + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | + CP_COND_REG_EXEC_0_GMEM | + CP_COND_REG_EXEC_0_SYSMEM); + bool emitted_scissor = false; for (uint32_t i 
= 0; i < cmd->state.pass->attachment_count; ++i) { struct tu_render_pass_attachment *att = @@ -5987,6 +6394,8 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol tu7_generic_clear_attachment(cmd, cs, resolve_group, i); } } + + tu_cond_exec_end(cs); } static void @@ -8906,8 +9315,8 @@ tu_barrier(struct tu_cmd_buffer *cmd, tu_flush_for_access(cache, src_flags, dst_flags); - enum tu_stage src_stage = vk2tu_src_stage(srcStage); - enum tu_stage dst_stage = vk2tu_dst_stage(dstStage); + enum tu_stage src_stage = vk2tu_src_stage(cmd->device, srcStage); + enum tu_stage dst_stage = vk2tu_dst_stage(cmd->device, dstStage); tu_flush_for_stage(cache, src_stage, dst_stage); } @@ -8973,9 +9382,6 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; - tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 1); - /* Wait for any writes to the predicate to land */ if (cmd->state.pass) tu_emit_cache_flush_renderpass(cmd); @@ -8989,23 +9395,72 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, * mandates 32-bit comparisons. Our workaround is to copy the the reference * value to the low 32-bits of a location where the high 32 bits are known * to be 0 and then compare that. + * + * BR and BV use separate predicate values so that setting the predicate + * doesn't have to be synchronized between them. */ + if (CHIP >= A7XX) { + if (!cmd->state.pass) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH)); + } + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) | + CP_COND_REG_EXEC_0_BR); + } + tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); tu_cs_emit(cs, 0); tu_cs_emit_qw(cs, global_iova(cmd, predicate)); tu_cs_emit_qw(cs, iova); + if (CHIP >= A7XX) { + tu_cond_exec_end(cs); + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) | + CP_COND_REG_EXEC_0_BV); + tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); + tu_cs_emit(cs, 0); + tu_cs_emit_qw(cs, global_iova(cmd, bv_predicate)); + tu_cs_emit_qw(cs, iova); + tu_cond_exec_end(cs); + } + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1); + tu_cs_emit(cs, 1); + bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + + if (CHIP >= A7XX) { + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) | + CP_COND_REG_EXEC_0_BR); + } tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3); tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) | CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS)); tu_cs_emit_qw(cs, global_iova(cmd, predicate)); + + if (CHIP >= A7XX) { + tu_cond_exec_end(cs); + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) | + CP_COND_REG_EXEC_0_BV); + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3); + tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) | + CP_DRAW_PRED_SET_0_TEST(inv ? 
EQ_0_PASS : NE_0_PASS)); + tu_cs_emit_qw(cs, global_iova(cmd, bv_predicate)); + tu_cond_exec_end(cs); + } + + /* Restore original BR thread after setting BOTH */ + if (CHIP >= A7XX && !cmd->state.pass) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR)); + } } TU_GENX(tu_CmdBeginConditionalRenderingEXT); +template VKAPI_ATTR void VKAPI_CALL tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) { @@ -9015,9 +9470,20 @@ tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; + if (CHIP >= A7XX && !cmd->state.pass) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH)); + } + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0); + + if (CHIP >= A7XX && !cmd->state.pass) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR)); + } } +TU_GENX(tu_CmdEndConditionalRenderingEXT); template void diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 22d1b85dc63..95466c8d4a8 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -193,12 +193,15 @@ enum tu_stage { * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME. * As a source stage, it is for things needing no waits. */ - TU_STAGE_CP, + TU_STAGE_BV_CP, + + /* This is for operations executed on BV. */ + TU_STAGE_BV, /* This is for most operations, which WFI will wait to finish and will not * start until any pending WFIs are finished. */ - TU_STAGE_GPU, + TU_STAGE_BR, /* This is only used as a destination stage and is for things needing no * waits on the GPU (e.g. host operations). 
@@ -223,6 +226,7 @@ enum tu_cmd_flush_bits { */ TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11, TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12, + TU_CMD_FLAG_WAIT_FOR_BR = 1 << 13, TU_CMD_FLAG_ALL_CLEAN = TU_CMD_FLAG_CCU_CLEAN_DEPTH | @@ -268,6 +272,7 @@ struct tu_cache_state { BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits; /* Pending flushes */ BITMASK_ENUM(tu_cmd_flush_bits) flush_bits; + BITMASK_ENUM(tu_cmd_flush_bits) bv_flush_bits; }; struct tu_vs_params { @@ -293,6 +298,7 @@ struct tu_render_pass_state bool xfb_used; bool has_tess; bool has_prim_generated_query_in_rp; + bool has_vtx_stats_query_in_rp; bool has_zpass_done_sample_count_write_in_rp; bool disable_gmem; bool sysmem_single_prim_mode; @@ -578,6 +584,8 @@ struct tu_cmd_state uint32_t prim_counters_running; bool prim_generated_query_running_before_rp; + bool vtx_stats_query_running_before_rp; + bool xfb_query_running_before_rp; bool occlusion_query_may_be_running; @@ -601,6 +609,15 @@ struct tu_cmd_state uint32_t total_renderpasses; uint32_t total_dispatches; + + unsigned tile_render_pass_count; +}; + +struct tu_vis_stream_patchpoint { + unsigned render_pass_idx; + uint32_t *data; + uint64_t iova; + uint32_t offset; }; struct tu_cmd_buffer @@ -618,6 +635,7 @@ struct tu_cmd_buffer void *patchpoints_ctx; struct util_dynarray fdm_bin_patchpoints; + struct tu_vis_stream_patchpoint vis_stream_count_patchpoint; struct util_dynarray vis_stream_patchpoints; struct util_dynarray vis_stream_bos; struct util_dynarray vis_stream_cs_bos; @@ -838,12 +856,6 @@ struct tu_fdm_bin_patchpoint { tu_fdm_bin_apply_t apply; }; -struct tu_vis_stream_patchpoint { - uint32_t *data; - uint64_t iova; - uint32_t offset; -}; - struct tu_vis_stream_patchpoint_cs { struct tu_suballoc_bo cs_bo; struct tu_suballoc_bo fence_bo; diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h index 6b9b123b9da..094f6bca80f 100644 --- a/src/freedreno/vulkan/tu_common.h +++ b/src/freedreno/vulkan/tu_common.h @@ -93,6 +93,8 @@ (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \ A6XX_TEX_CONST_DWORDS +#define TU_MAX_VIS_STREAMS 4 + /* With dynamic rendering, input attachment indices are shifted by 1 and * attachment 0 is used for input attachments without an InputAttachmentIndex * (which can only be depth/stencil). @@ -151,8 +153,31 @@ enum tu_predicate_bit { TU_PREDICATE_LOAD_STORE = 0, TU_PREDICATE_PERFCNTRS = 1, + TU_PREDICATE_CB_ENABLED = 2, + TU_PREDICATE_VTX_STATS_RUNNING = 3, + TU_PREDICATE_VTX_STATS_NOT_RUNNING = 4, + TU_PREDICATE_FIRST_TILE = 5, }; +/* Onchip timestamp register layout. */ +enum tu_onchip_addr { + /* Registers 0-7 are defined by firmware to be shared between BR/BV. + */ + + /* See tu7_emit_concurrent_binning */ + TU_ONCHIP_CB_BR_TIMESTAMP, + TU_ONCHIP_CB_BV_TIMESTAMP, + TU_ONCHIP_CB_BV_DETERMINATION_FINISHED_TIMESTAMP, + TU_ONCHIP_CB_BV_DISABLED_TIMESTAMP, + TU_ONCHIP_BARRIER, + TU_ONCHIP_CB_RESLIST_OVERFLOW, + + /* Registers 8-15 are defined by firmware to be split between BR and BV. + * Each has their own copy. 
+ */ +}; + + #define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME) #define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing) diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h index 320ddec5a32..960c54037df 100644 --- a/src/freedreno/vulkan/tu_cs.h +++ b/src/freedreno/vulkan/tu_cs.h @@ -493,6 +493,55 @@ tu7_thread_control(struct tu_cs *cs, enum cp_thread thread) tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(thread)); } +static inline void +tu7_set_pred_mask(struct tu_cs *cs, uint32_t mask, uint32_t val) +{ + tu_cs_emit_pkt7(cs, CP_REG_TEST, 3); + tu_cs_emit(cs, A6XX_CP_REG_TEST_0_PRED_UPDATE | + A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME); + tu_cs_emit(cs, mask); + tu_cs_emit(cs, val); +} + +static inline void +tu7_set_pred_bit(struct tu_cs *cs, enum tu_predicate_bit bit, bool val) +{ + tu7_set_pred_mask(cs, 1u << bit, val ? (1u << bit) : 0); +} + +static inline void +tu7_write_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr) +{ + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 2); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) | + CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_TIMESTAMP_SUM) | + CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + CP_EVENT_WRITE7_0_WRITE_ENABLED); + tu_cs_emit(cs, onchip_addr); +} + +static inline void +tu7_wait_onchip_timestamp(struct tu_cs *cs, enum tu_onchip_addr onchip_addr) +{ + tu_cs_emit_pkt7(cs, CP_WAIT_TIMESTAMP, 3); + tu_cs_emit(cs, CP_WAIT_TIMESTAMP_0_WAIT_DST(TS_WAIT_ONCHIP) | + CP_WAIT_TIMESTAMP_0_WAIT_VALUE_SRC(TS_WAIT_GE_TIMESTAMP_SUM)); + tu_cs_emit_qw(cs, onchip_addr); +} + +static inline void +tu7_wait_onchip_val(struct tu_cs *cs, enum tu_onchip_addr onchip_addr, + uint32_t val) +{ + tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); + tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL(POLL_ON_CHIP)); + tu_cs_emit_qw(cs, onchip_addr); + tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(val)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0)); +} + uint64_t tu_cs_emit_data_nop(struct tu_cs *cs, const uint32_t *data, diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index c99cc535cf2..b65cea64896 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -3046,6 +3046,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->vk.flush_buffer_write_cp = tu_flush_buffer_write_cp; device->vk.cmd_fill_buffer_addr = tu_cmd_fill_buffer_addr; + device->vis_stream_count = 0; + *pDevice = tu_device_to_handle(device); return VK_SUCCESS; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 6d555a0be2a..b1c3c7b3c55 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -255,6 +255,8 @@ struct tu6_global uint32_t vsc_state[32]; + uint64_t bv_predicate; + volatile uint32_t vtx_stats_query_not_running; /* To know when renderpass stats for autotune are valid */ @@ -487,6 +489,9 @@ struct tu_device /* This is an internal queue for mapping/unmapping non-sparse BOs */ uint32_t vm_bind_queue_id; + + uint32_t vis_stream_count; + uint32_t vis_stream_size; }; VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) diff --git a/src/freedreno/vulkan/tu_lrz.cc b/src/freedreno/vulkan/tu_lrz.cc index 060d5940a6b..a6204d4cc4b 100644 --- a/src/freedreno/vulkan/tu_lrz.cc +++ b/src/freedreno/vulkan/tu_lrz.cc @@ -234,6 +234,7 @@ tu_lrz_init_state(struct tu_cmd_buffer *cmd, * enabled and there will be a NULL/garbage LRZ buffer. 
*/ cmd->state.lrz.image_view = view; + cmd->state.lrz.store = att->store; if (!clears_depth && !att->load) return; @@ -412,6 +413,51 @@ tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd) } } +void +tu_lrz_cb_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + /* The LRZ double-buffering guarantees that passes that clear or discard + * depth don't have to worry about LRZ dependencies. However we do have to + * worry about renderpasses that load depth, because we cannot flip LRZ + * then and have to reuse what the previous pass wrote. There is then a + * write-after-read dependency from an earlier subpass reading LRZ. We + * solve this using CP_RESOURCE_LIST, because the Vulkan user doesn't have + * to track render-and-clear dependencies vs. render-and-render depdencies + * (LOAD_OP_CLEAR happens in the same stage as rendering). + */ + if (!cmd->state.lrz.image_view) + return; + + uint64_t iova = + cmd->state.lrz.image_view->image->iova + + cmd->state.lrz.image_view->image->lrz_layout.lrz_offset; + uint64_t fc_iova = + cmd->state.lrz.image_view->image->iova + + cmd->state.lrz.image_view->image->lrz_layout.lrz_fc_offset; + + if (cmd->state.lrz.reuse_previous_state) { + tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4); + tu_cs_emit(cs, 1); /* BV count */ + tu_cs_emit_qw(cs, iova | CP_BV_RESOURCE_0_WRITE); + tu_cs_emit(cs, 0); /* BR count */ + } + + if (cmd->state.lrz.store) { + tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4); + tu_cs_emit(cs, 0); /* BV count */ + tu_cs_emit(cs, CP_RESOURCE_LIST_BR_0_BR_COUNT(1) | + CP_RESOURCE_LIST_BR_0_OVERFLOW | + CP_RESOURCE_LIST_BR_0_OVERFLOW_ONCHIP_ADDR(TU_ONCHIP_CB_RESLIST_OVERFLOW)); + tu_cs_emit_qw(cs, iova); + } + + /* See tu_lrz_before_tiles() */ + tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4); + tu_cs_emit(cs, 1); /* BV count */ + tu_cs_emit_qw(cs, CP_BV_RESOURCE_0_ENCODING(BV_RES_LRZ) | fc_iova); + tu_cs_emit(cs, 0); /* BR count */ +} + template void tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) @@ -439,6 +485,16 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) return; } + /* If CB is dynamically enabled, then this is executed on BV. Flip the + * buffer BV is using. + */ + if (CHIP >= A7XX) { + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) | + CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED)); + tu_emit_event_write(cmd, cs, FD_LRZ_FLIP); + tu_cond_exec_end(cs); + } + if (!lrz->valid_at_start) { /* If LRZ was never valid so disable it manually here. * This is accomplished by making later GRAS_LRZ_CNTL (in binning pass) @@ -488,6 +544,98 @@ tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } TU_GENX(tu_lrz_tiling_begin); +template +void +tu_lrz_after_bv(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (CHIP < A7XX) + return; + + /* BV and BR have different LRZ caches, so flush LRZ cache to be read by + * BR. 
+ */ + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) | + CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED)); + tu_emit_event_write(cmd, cs, FD_LRZ_FLUSH); + tu_cond_exec_end(cs); +} +TU_GENX(tu_lrz_after_bv); + +static void +tu_lrz_clear_resource(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + uint64_t fc_iova = + cmd->state.lrz.image_view->image->iova + + cmd->state.lrz.image_view->image->lrz_layout.lrz_fc_offset; + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + CP_EVENT_WRITE7_0_CLEAR_LRZ_RESOURCE); + tu_cs_emit_qw(cs, fc_iova); /* resource to clear */ +} + +template +void +tu_lrz_before_tiles(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool use_cb) +{ + if (CHIP < A7XX) + return; + + tu7_set_pred_bit(cs, TU_PREDICATE_FIRST_TILE, true); + + if (!cmd->state.lrz.image_view) + return; + + /* By clearing the LRZ resource before rendering, we make any future + * binning pass writing to the same LRZ image wait for all renderpasses + * before this one. Crucially this includes any earlier renderpass reading + * from the same LRZ buffer. Because LRZ is only double-buffered but it's + * possible to have more than two visibility streams, this is necessary to + * prevent write-after-read hazards if BV writes the same LRZ image more + * than once before BR reads it. + * + * For example, consider the sequence: + * + * RP 1 clears + writes depth image A + * - BV: Clear + write LRZ image A + * - BR: Read LRZ image A + * RP 2 clears + writes depth image A + * - BV: Clear + write LRZ image A + * - BR: Read LRZ image A + * RP 3 clears + writes depth image A + * - BV: Clear + write LRZ image A + * - BR: Read LRZ image A + * + * RP 1 BV will write to one LRZ image, RP 2 BV will write to the other, + * and then RP 3 BV must stall until RP 1 BR is done reading/writing the + * first LRZ image. Specifiying the LRZ resource before BV starts and + * clearing it before BR starts will cause RP 2 BV to stall until RP 1 BR + * starts, which technically isn't necessary, but it will also cause RP 3 + * BV to stall until RP 2 BR has started and RP 1 BR has finished. + * + * This pairs with the last CP_RESOURCE_LIST in tu_lrz_cb_begin(). + */ + if (use_cb) + tu_lrz_clear_resource(cmd, cs); +} +TU_GENX(tu_lrz_before_tiles); + +static void +tu_lrz_emit_view_info(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + struct tu_lrz_state *lrz = &cmd->state.lrz; + + if (lrz->gpu_dir_tracking) { + if (!lrz->valid_at_start) { + /* Make sure we fail the comparison of depth views */ + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_VIEW_INFO(.dword = 0)); + } else { + tu6_write_lrz_reg(cmd, cs, + A6XX_GRAS_LRZ_VIEW_INFO(.dword = lrz->image_view->view.GRAS_LRZ_VIEW_INFO)); + } + } +} + /* We need to re-emit LRZ state before each tile due to skipsaverestore. */ template @@ -501,19 +649,104 @@ tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } else { tu6_emit_lrz_buffer(cs, lrz->image_view->image); - if (lrz->gpu_dir_tracking) { - if (!lrz->valid_at_start) { - /* Make sure we fail the comparison of depth views */ - tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_VIEW_INFO(.dword = 0)); - } else { - tu6_write_lrz_reg(cmd, cs, - A6XX_GRAS_LRZ_VIEW_INFO(.dword = lrz->image_view->view.GRAS_LRZ_VIEW_INFO)); + if (CHIP >= A7XX) { + /* If CB is dynamically enabled, then flip the buffer BR is using. + * This pairs with the LRZ flip in tu_lrz_tiling_begin. FIRST_TILE is + * cleared in tu_lrz_before_tiles(). 
+ */ + if (!lrz->reuse_previous_state) { + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) | + CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_CB_ENABLED)); + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) | + CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_FIRST_TILE)); + tu_emit_event_write(cmd, cs, FD_LRZ_FLIP); + tu_cond_exec_end(cs); + tu_cond_exec_end(cs); } + + tu7_set_pred_bit(cs, TU_PREDICATE_FIRST_TILE, false); } + + tu_lrz_emit_view_info(cmd, cs); } } TU_GENX(tu_lrz_before_tile); +template +void +tu_lrz_before_sysmem_br(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + struct tu_lrz_state *lrz = &cmd->state.lrz; + + if (!lrz->image_view) { + tu6_emit_lrz_buffer(cs, NULL); + } else { + tu_lrz_clear_resource(cmd, cs); + + tu6_emit_lrz_buffer(cs, lrz->image_view->image); + + tu_lrz_emit_view_info(cmd, cs); + + /* If CB is dynamically enabled, then flip the buffer BR is using. + * This pairs with the LRZ flip in tu_lrz_sysmem_begin. + */ + if (!lrz->reuse_previous_state) { + tu_emit_event_write(cmd, cs, FD_LRZ_FLIP); + + /* This shouldn't be necessary, because we should be able to clear + * LRZ on BV and then BR should use the clear value written by BV, + * but there seems to be a HW errata where the value from the + * register instead of the clear value is sometimes used when LRZ + * writes are disabled. This doesn't seem to be a problem in GMEM + * mode, however. + * + * This is seen with + * dEQP-VK.pipeline.monolithic.color_write_enable.alpha_channel.static.* + */ + if (lrz->fast_clear) + tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_CLEAR(lrz->depth_clear_value.depthStencil.depth)); + } else { + /* To workaround the same HW errata as above, but where we don't know + * the clear value, copy the clear value from memory to the register. + * This is tricky because there are two and we have to select the + * right one using CP_COND_EXEC. + */ + const unsigned if_dwords = 4, else_dwords = if_dwords; + uint64_t lrz_fc_iova = + lrz->image_view->image->iova + lrz->image_view->image->lrz_layout.lrz_fc_offset; + uint64_t br_cur_buffer_iova = + lrz_fc_iova + offsetof(fd_lrzfc_layout, br_cur_buffer); + + /* Make sure the value is written to memory. 
*/ + tu_emit_event_write(cmd, cs, FD_CACHE_CLEAN); + tu_cs_emit_wfi(cs); + tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + + /* if (br_cur_buffer != 0) { */ + tu_cs_reserve(cs, 7 + if_dwords + 1 + else_dwords); + tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); + tu_cs_emit_qw(cs, br_cur_buffer_iova); + tu_cs_emit_qw(cs, br_cur_buffer_iova); + tu_cs_emit(cs, 2); /* REF */ + tu_cs_emit(cs, if_dwords + 1); + /* GRAS_LRZ_DEPTH_CLEAR = lrz_fc->buffer[1].depth_clear_val */ + tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); + tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_GRAS_LRZ_DEPTH_CLEAR)); + tu_cs_emit_qw(cs, lrz_fc_iova + offsetof(fd_lrzfc_layout, + buffer[1].depth_clear_val)); + /* } else { */ + tu_cs_emit_pkt7(cs, CP_NOP, else_dwords); + /* GRAS_LRZ_DEPTH_CLEAR = lrz_fc->buffer[0].depth_clear_val */ + tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); + tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A7XX_GRAS_LRZ_DEPTH_CLEAR)); + tu_cs_emit_qw(cs, lrz_fc_iova + offsetof(fd_lrzfc_layout, + buffer[0].depth_clear_val)); + /* } */ + } + } +} +TU_GENX(tu_lrz_before_sysmem_br); + template void tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) @@ -635,8 +868,51 @@ tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (!image->lrz_layout.lrz_total_size) return; + uint64_t lrz_iova = image->iova + image->lrz_layout.lrz_offset; + + /* Synchronize writes in BV with subsequent render passes against this + * write in BR. + */ + if (CHIP >= A7XX) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH)); + + tu_cs_emit_pkt7(cs, CP_MODIFY_TIMESTAMP, 1); + tu_cs_emit(cs, CP_MODIFY_TIMESTAMP_0_ADD(1) | + CP_MODIFY_TIMESTAMP_0_OP(MODIFY_TIMESTAMP_ADD_LOCAL)); + + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BV)); + + tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0); + + tu_cs_emit_pkt7(cs, CP_RESOURCE_LIST, 4); + tu_cs_emit(cs, 0); /* BV count */ + tu_cs_emit(cs, CP_RESOURCE_LIST_BR_0_BR_COUNT(1) | + CP_RESOURCE_LIST_BR_0_OVERFLOW | + CP_RESOURCE_LIST_BR_0_OVERFLOW_ONCHIP_ADDR(TU_ONCHIP_CB_RESLIST_OVERFLOW)); + tu_cs_emit_qw(cs, lrz_iova); + + tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + + tu7_thread_control(cs, CP_SET_THREAD_BR); + + tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); + } + tu6_emit_lrz_buffer(cs, image); tu6_disable_lrz_via_depth_view(cmd, cs); + + if (CHIP >= A7XX) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); + tu_cs_emit(cs, CP_EVENT_WRITE7_0_EVENT(DUMMY_EVENT) | + CP_EVENT_WRITE7_0_CLEAR_RENDER_RESOURCE | + CP_EVENT_WRITE7_0_WRITE_DST(EV_DST_ONCHIP) | + CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) | + CP_EVENT_WRITE7_0_WRITE_ENABLED); + tu_cs_emit_qw(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW); + tu_cs_emit(cs, 0); /* value */ + } } TU_GENX(tu_disable_lrz); diff --git a/src/freedreno/vulkan/tu_lrz.h b/src/freedreno/vulkan/tu_lrz.h index a74808fd944..5a16ee4c11f 100644 --- a/src/freedreno/vulkan/tu_lrz.h +++ b/src/freedreno/vulkan/tu_lrz.h @@ -51,6 +51,8 @@ struct tu_lrz_state bool color_written_with_z_test : 1; bool has_lrz_write_with_skipped_color_writes : 1; + bool store : 1; + enum tu_lrz_direction prev_direction; }; @@ -86,14 +88,29 @@ tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd); void tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd); +void +tu_lrz_cb_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + template void tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs); +template +void +tu_lrz_after_bv(struct tu_cmd_buffer *cmd, struct 
tu_cs *cs);
+
+template <chip CHIP>
+void
+tu_lrz_before_tiles(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool use_cb);
+
 template <chip CHIP>
 void
 tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
 
+template <chip CHIP>
+void
+tu_lrz_before_sysmem_br(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
+
 template <chip CHIP>
 void
 tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
diff --git a/src/freedreno/vulkan/tu_perfetto.cc b/src/freedreno/vulkan/tu_perfetto.cc
index 511d747ebe8..c38102d9401 100644
--- a/src/freedreno/vulkan/tu_perfetto.cc
+++ b/src/freedreno/vulkan/tu_perfetto.cc
@@ -38,8 +38,10 @@ tu_device_get_u_trace(struct tu_device *device);
 /**
  * Queue-id's
  */
-enum {
-   DEFAULT_HW_QUEUE_ID,
+enum tu_queue_id {
+   BR_HW_QUEUE_ID,
+   BV_HW_QUEUE_ID,
+
    /* Labels set via VK_EXT_debug_utils are in a separate track due to the
     * following part of the spec:
     * "An application may open a debug label region in one command buffer and
@@ -67,6 +69,7 @@ enum tu_stage_id {
    SECONDARY_CMD_BUFFER_STAGE_ID,
    CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
    BINNING_STAGE_ID,
+   CONCURRENT_BINNING_STAGE_ID,
    GMEM_STAGE_ID,
    BYPASS_STAGE_ID,
    BLIT_STAGE_ID,
@@ -85,7 +88,8 @@ static const struct {
    const char *name;
    const char *desc;
 } queues[] = {
-   [DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
+   [BR_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
+   [BV_HW_QUEUE_ID] = {"GPU Queue 1", "Adreno Bin Visibility Queue"},
    [ANNOTATIONS_QUEUE_ID] = {"Annotations", "Annotations Queue"},
 };
 
@@ -99,6 +103,7 @@ static const struct {
    [SECONDARY_CMD_BUFFER_STAGE_ID] = { "Secondary Command Buffer" },
    [CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID] = { "Annotation", "Render Pass Command Buffer Annotation" },
    [BINNING_STAGE_ID] = { "Binning", "Perform Visibility pass and determine target bins" },
+   [CONCURRENT_BINNING_STAGE_ID] = { "Concurrent Binning", "Perform concurrent Visibility pass and determine target bins" },
    [GMEM_STAGE_ID] = { "GMEM", "Rendering to GMEM" },
    [BYPASS_STAGE_ID] = { "Bypass", "Rendering to system memory" },
    [BLIT_STAGE_ID] = { "Blit", "Performing a Blit operation" },
@@ -323,12 +328,17 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
       emit_sync_timestamp(clocks);
    }
 
-   uint32_t queue_id = DEFAULT_HW_QUEUE_ID;
+   uint32_t queue_id = BR_HW_QUEUE_ID;
    switch (stage->stage_id) {
    case CMD_BUFFER_ANNOTATION_STAGE_ID:
    case CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID:
       queue_id = ANNOTATIONS_QUEUE_ID;
       break;
+   /* We only know dynamically whether concurrent binning was enabled. Just
+    * assume it is and always make binning appear on the BV timeline.
+    */
+   case CONCURRENT_BINNING_STAGE_ID:
+      queue_id = BV_HW_QUEUE_ID;
    default:
       break;
    }
@@ -577,6 +587,7 @@ CREATE_EVENT_CALLBACK(cmd_buffer, CMD_BUFFER_STAGE_ID)
 CREATE_EVENT_CALLBACK(secondary_cmd_buffer, SECONDARY_CMD_BUFFER_STAGE_ID)
 CREATE_EVENT_CALLBACK(render_pass, RENDER_PASS_STAGE_ID)
 CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
+CREATE_EVENT_CALLBACK(concurrent_binning_ib, CONCURRENT_BINNING_STAGE_ID)
 CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
 CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
 CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc
index 73ebf55dc8b..f1dbbcb0c6f 100644
--- a/src/freedreno/vulkan/tu_query_pool.cc
+++ b/src/freedreno/vulkan/tu_query_pool.cc
@@ -1099,6 +1099,9 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
    bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
    cmdbuf->state.prim_counters_running++;
 
+   if (cmdbuf->state.pass)
+      cmdbuf->state.rp.has_vtx_stats_query_in_rp = true;
+
    /* Prevent starting primitive counters when it is supposed to be stopped
    * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
    */
@@ -1110,9 +1113,26 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
 
    tu_emit_event_write(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
 
-   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
-   tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
-   tu_cs_emit(cs, 0);
+   if (CHIP >= A7XX) {
+      /* We need the predicate for determining whether to enable CB, so set
+       * it for both BR and BV.
+       */
+      if (!cmdbuf->state.pass) {
+         tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
+         tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
+      }
+      tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
+                            (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
+                        (1u << TU_PREDICATE_VTX_STATS_RUNNING));
+      if (!cmdbuf->state.pass) {
+         tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
+         tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR));
+      }
+   } else {
+      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
+      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+      tu_cs_emit(cs, 0);
+   }
 
    if (need_cond_exec) {
       tu_cond_exec_end(cs);
@@ -1312,6 +1332,9 @@ emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
    tu_cs_emit_regs(cs, A6XX_VPC_SO_QUERY_BASE(.qword = begin_iova));
    tu_emit_event_write(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
+
+   if (!cmdbuf->state.pass)
+      cmdbuf->state.xfb_query_running_before_rp = true;
 }
 
 template <chip CHIP>
 void
@@ -1545,24 +1568,39 @@ emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
       if (!need_cond_exec) {
         tu_emit_event_write(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
      } else {
-        tu_cs_reserve(cs, 7 + 2);
         /* Check that pipeline stats query is not running, only then
          * we count stop the counter.
          */
-        tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
-        tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
-        tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
-        tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
-        tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
+        if (CHIP >= A7XX) {
+           tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
+              CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_VTX_STATS_NOT_RUNNING));
+           tu_emit_event_write(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
+           tu_cond_exec_end(cs);
+        } else {
+           tu_cs_reserve(cs, 7 + 2);
+
+           tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
+           tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+           tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+           tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
+           tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
+
+           tu_emit_event_write(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
+        }
 
-        tu_emit_event_write(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
      }
    }
 
    if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
-      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
-      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
-      tu_cs_emit(cs, 1);
+      if (CHIP >= A7XX) {
+         tu7_set_pred_mask(cs, (1u << TU_PREDICATE_VTX_STATS_RUNNING) |
+                               (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING),
+                           (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING));
+      } else {
+         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
+         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+         tu_cs_emit(cs, 1);
+      }
    }
 }
@@ -1822,6 +1860,9 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
    uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
    uint64_t available_iova = query_available_iova(pool, query);
 
+   if (!cmdbuf->state.pass)
+      cmdbuf->state.xfb_query_running_before_rp = false;
+
    tu_cs_emit_regs(cs, A6XX_VPC_SO_QUERY_BASE(.qword = end_iova));
    tu_emit_event_write(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc
index 7394e18c39c..8584b7e8a96 100644
--- a/src/freedreno/vulkan/tu_queue.cc
+++ b/src/freedreno/vulkan/tu_queue.cc
@@ -104,7 +104,7 @@ get_vis_stream_patchpoint_cs(struct tu_cmd_buffer *cmd,
 
    /* See below for the commands emitted to the CS. */
    uint32_t cs_size = 5 * util_dynarray_num_elements(&cmd->vis_stream_patchpoints,
-                                                     struct tu_vis_stream_patchpoint) + 6;
+                                                     struct tu_vis_stream_patchpoint) + 4 + 6;
 
    util_dynarray_foreach (&cmd->vis_stream_cs_bos,
                           struct tu_vis_stream_patchpoint_cs,
@@ -165,8 +165,11 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
    struct tu_device *dev = queue->device;
 
    uint32_t max_size = 0;
-   for (unsigned i = 0; i < cmdbuf_count; i++)
+   uint32_t rp_count = 0;
+   for (unsigned i = 0; i < cmdbuf_count; i++) {
       max_size = MAX2(max_size, cmd_buffers[i]->vsc_size);
+      rp_count += cmd_buffers[i]->state.tile_render_pass_count;
+   }
 
    if (max_size == 0)
       return VK_SUCCESS;
@@ -174,17 +177,32 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
    struct tu_bo *bo = NULL;
    VkResult result = VK_SUCCESS;
 
+   /* Note, we want to make the vis stream count at least 1 because a
+    * BV_BR_OFFSET of 0 can lead to hangs even if not using visibility
+    * streams and therefore should be avoided.
+    */
+   uint32_t min_vis_stream_count =
+      (TU_DEBUG(NO_CONCURRENT_BINNING) || dev->physical_device->info->chip < 7) ?
+      1 : MIN2(MAX2(rp_count, 1), TU_MAX_VIS_STREAMS);
+   uint32_t vis_stream_count;
+
    mtx_lock(&dev->vis_stream_mtx);
 
-   if (!dev->vis_stream_bo || max_size > dev->vis_stream_bo->size) {
+   if (!dev->vis_stream_bo || max_size > dev->vis_stream_size ||
+       min_vis_stream_count > dev->vis_stream_count) {
+      dev->vis_stream_count = MAX2(dev->vis_stream_count,
+                                   min_vis_stream_count);
+      dev->vis_stream_size = MAX2(dev->vis_stream_size, max_size);
      if (dev->vis_stream_bo)
         tu_bo_finish(dev, dev->vis_stream_bo);
 
      result = tu_bo_init_new(dev, &dev->vk.base, &dev->vis_stream_bo,
-                             max_size, TU_BO_ALLOC_INTERNAL_RESOURCE,
+                             dev->vis_stream_size * dev->vis_stream_count,
+                             TU_BO_ALLOC_INTERNAL_RESOURCE,
                              "visibility stream");
    }
 
    bo = dev->vis_stream_bo;
+   vis_stream_count = dev->vis_stream_count;
 
    mtx_unlock(&dev->vis_stream_mtx);
@@ -210,6 +228,8 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
       }
    }
 
+   unsigned render_pass_idx = queue->render_pass_idx;
+
    for (unsigned i = 0; i < cmdbuf_count; i++) {
       struct tu_cs cs, sub_cs;
       uint64_t fence_iova = 0;
@@ -224,7 +244,11 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
       util_dynarray_foreach (&cmd_buffers[i]->vis_stream_patchpoints,
                              struct tu_vis_stream_patchpoint,
                              patchpoint) {
-         uint64_t final_iova = bo->iova + patchpoint->offset;
+         unsigned vis_stream_idx =
+            (render_pass_idx + patchpoint->render_pass_idx) %
+            vis_stream_count;
+         uint64_t final_iova =
+            bo->iova + vis_stream_idx * max_size + patchpoint->offset;
 
          if (cmd_buffers[i]->usage_flags &
             VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
@@ -237,6 +261,19 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
         }
      }
 
+      struct tu_vis_stream_patchpoint *count_patchpoint =
+         &cmd_buffers[i]->vis_stream_count_patchpoint;
+      if (count_patchpoint->data) {
+         if (cmd_buffers[i]->usage_flags &
+             VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
+            tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 3);
+            tu_cs_emit_qw(&sub_cs, count_patchpoint->iova);
+            tu_cs_emit(&sub_cs, vis_stream_count);
+         } else {
+            count_patchpoint->data[0] = vis_stream_count;
+         }
+      }
+
      if (cmd_buffers[i]->usage_flags &
         VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
         tu_cs_emit_pkt7(&sub_cs, CP_WAIT_MEM_WRITES, 0);
@@ -250,8 +287,12 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue,
         struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);
         submit_add_entries(queue->device, submit, dump_cmds, &entry, 1);
      }
+
+      render_pass_idx += cmd_buffers[i]->state.tile_render_pass_count;
    }
 
+   queue->render_pass_idx = render_pass_idx;
+
    return VK_SUCCESS;
 }
diff --git a/src/freedreno/vulkan/tu_queue.h b/src/freedreno/vulkan/tu_queue.h
index 3f7f78a92ba..28925bfcb50 100644
--- a/src/freedreno/vulkan/tu_queue.h
+++ b/src/freedreno/vulkan/tu_queue.h
@@ -33,6 +33,8 @@ struct tu_queue
    uint32_t sparse_syncobj, gfx_syncobj;
    uint64_t sparse_timepoint, gfx_timepoint;
 
+   unsigned render_pass_idx;
+
    int fence; /* timestamp/fence of the last queue submission */
 };
 VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
diff --git a/src/freedreno/vulkan/tu_tracepoints.py b/src/freedreno/vulkan/tu_tracepoints.py
index 0c583ee3f5c..3d102950b2b 100644
--- a/src/freedreno/vulkan/tu_tracepoints.py
+++ b/src/freedreno/vulkan/tu_tracepoints.py
@@ -135,6 +135,7 @@ begin_end_tp('draw',
              ], tp_default_enabled=False)
 
 begin_end_tp('binning_ib')
+begin_end_tp('concurrent_binning_ib')
 begin_end_tp('draw_ib_sysmem')
 begin_end_tp('draw_ib_gmem')
diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc
index 81c0176d865..f61d8d2eaf9 100644
--- a/src/freedreno/vulkan/tu_util.cc
+++ b/src/freedreno/vulkan/tu_util.cc
@@ -54,6 +54,8 @@ static const struct debug_control tu_debug_options[] = {
    { "check_cmd_buffer_status", TU_DEBUG_CHECK_CMD_BUFFER_STATUS },
    { "comm", TU_DEBUG_COMM },
    { "nofdm", TU_DEBUG_NOFDM },
+   { "nocb", TU_DEBUG_NO_CONCURRENT_BINNING },
+   { "forcecb", TU_DEBUG_FORCE_CONCURRENT_BINNING },
    { NULL, 0 }
 };
 
diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h
index 5a79cbe0354..7ce6d3e053a 100644
--- a/src/freedreno/vulkan/tu_util.h
+++ b/src/freedreno/vulkan/tu_util.h
@@ -73,6 +73,8 @@ enum tu_debug_flags : uint64_t
    TU_DEBUG_CHECK_CMD_BUFFER_STATUS = BITFIELD64_BIT(32),
    TU_DEBUG_COMM = BITFIELD64_BIT(33),
    TU_DEBUG_NOFDM = BITFIELD64_BIT(34),
+   TU_DEBUG_NO_CONCURRENT_BINNING = BITFIELD64_BIT(35),
+   TU_DEBUG_FORCE_CONCURRENT_BINNING = BITFIELD64_BIT(36),
 };
 
 struct tu_env {