diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 486178a548a..564c267bd4d 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -353,6 +353,52 @@ tu7_write_onchip_val(struct tu_cs *cs, enum tu_onchip_addr addr, tu_cs_emit(cs, val); } +static void +tu_add_cb_barrier_info(struct tu_cmd_buffer *cmd_buffer) /* Records a TU_CB_CONTROL_TYPE_BARRIER entry in cmd_buffer->cb_control_points, first dropping every trailing entry that follows the most recent TU_CB_CONTROL_TYPE_CB_ENABLED entry (or all entries if there is none). */ +{ + /* Future concurrent binning cannot happen earlier than the barrier, + * so we won't need to patch previous patchpoints. Pop them now. + */ + uint32_t size = util_dynarray_num_elements(&cmd_buffer->cb_control_points, + struct tu_cb_control_point); + for (int32_t idx = size - 1; idx >= 0; idx--) { /* newest to oldest; the signed index lets the loop stop once idx drops below zero */ + struct tu_cb_control_point *info = util_dynarray_element( + &cmd_buffer->cb_control_points, struct tu_cb_control_point, idx); + if (info->type == TU_CB_CONTROL_TYPE_CB_ENABLED) { + break; /* keep this marker and everything recorded before it */ + } + (void) util_dynarray_pop(&cmd_buffer->cb_control_points, + struct tu_cb_control_point); /* popped value is not needed, only the removal */ + } + + struct tu_cb_control_point barrier_info = { + .type = TU_CB_CONTROL_TYPE_BARRIER, /* remaining fields are left zero-initialised */ + }; + util_dynarray_append(&cmd_buffer->cb_control_points, barrier_info); +} + +void +tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + bool force_disable_cb) /* Emits a one-dword CP_THREAD_CONTROL packet selecting BR with CONCURRENT_BIN_DISABLE set. Unless force_disable_cb is true, also records a TU_CB_CONTROL_TYPE_PATCHPOINT in cmd->cb_control_points whose patch_value is the same payload without CONCURRENT_BIN_DISABLE. */ +{ + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + + if (!force_disable_cb) { + struct tu_cb_control_point info = { + .type = TU_CB_CONTROL_TYPE_PATCHPOINT, + .patchpoint = cs->cur, /* presumably the address of the payload dword written by the tu_cs_emit() below - TODO confirm tu_cs_emit writes at cs->cur */ + .patch_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR), /* payload without the disable bit */ + .original_value = CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE, /* identical to the value emitted below */ + }; + util_dynarray_append(&cmd->cb_control_points, info); + } + + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); /* recorded default keeps concurrent binning disabled */ +} + /* "Normal" cache flushes outside the renderpass, that don't require any special handling */ template void @@ -407,7 +453,8 @@ tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer) */ 
tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 0); - tu7_thread_control(cs, CP_SET_THREAD_BR); + tu_add_cb_barrier_info(cmd_buffer); /* record the barrier first so the patchpoint added on the next line is kept after it */ + tu7_set_thread_br_patchpoint(cmd_buffer, cs, false); trace_end_concurrent_binning_barrier(&cmd_buffer->trace, cs); } @@ -571,7 +618,7 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, emit_vpc_attr_buf(cs, cmd_buffer->device, ccu_state == TU_CMD_CCU_GMEM); - tu7_thread_control(cs, CP_SET_THREAD_BR); + tu7_set_thread_br_patchpoint(cmd_buffer, cs, false); } cmd_buffer->state.ccu_state = ccu_state; } @@ -2208,7 +2255,7 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) tu_cs_set_writeable(cs, false); - tu7_thread_control(cs, CP_SET_THREAD_BR); + tu7_set_thread_br_patchpoint(cmd, cs, false); } tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3); @@ -2244,7 +2291,7 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, CP_SET_AMBLE_2_TYPE(POSTAMBLE_AMBLE_TYPE)); if (CHIP >= A7XX) { - tu7_thread_control(cs, CP_SET_THREAD_BR); + tu7_set_thread_br_patchpoint(cmd, cs, false); } tu_cs_sanity_check(cs); @@ -2785,8 +2832,9 @@ tu7_cb_disable_reason(bool disable_cb, } static bool -tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - bool disable_cb) +tu7_emit_concurrent_binning_start(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + bool disable_cb) /* Decides whether concurrent binning (CB) is used for this renderpass. When the disable condition below holds it emits the CB-disabled CP_THREAD_CONTROL, clears TU_PREDICATE_CB_ENABLED, sets cmd->state.renderpass_cb_disabled, records a barrier via tu_add_cb_barrier_info() and returns false; otherwise it returns true. */ { if (tu7_cb_disable_reason(disable_cb, cmd, "disable_cb") || /* LRZ can only be cleared via fast clear in BV. 
Disable CB if we can't @@ -2797,13 +2845,25 @@ tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs, "LRZ fast clear disabled") || tu7_cb_disable_reason(TU_DEBUG(NO_CONCURRENT_BINNING), cmd, "TU_DEBUG(NO_CONCURRENT_BINNING)")) { - tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); - tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | - CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); - tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false); - cmd->state.renderpass_cb_disabled = true; - return false; - } + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); + tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false); + cmd->state.renderpass_cb_disabled = true; + + tu_add_cb_barrier_info(cmd); + + return false; + } + + return true; +} + +static void +tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + assert(!cmd->state.renderpass_cb_disabled); + tu7_thread_control(cs, CP_SET_THREAD_BOTH); /* Increment timestamp to make it unique in subsequent commands */ @@ -2824,16 +2884,22 @@ tu7_emit_concurrent_binning(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu7_wait_onchip_val(cs, TU_ONCHIP_CB_RESLIST_OVERFLOW, 0); tu_lrz_cb_begin(cmd, cs); - - return true; } template static void -tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd, + struct tu_cs *cs) /* Emits two alternative sysmem prologues, each wrapped in tu_cs_patchable_start()/tu_cs_patchable_end(): one that runs the CB sequence and one that disables CB. A TU_CB_CONTROL_TYPE_PATCHPOINT is recorded for each region so the choice can be flipped later through cmd->cb_control_points. */ { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + /* Why all the complexity? + * The logic necessary to support concurrent binning running in parallel to + * sysmem has enough overhead to reduce performance for a workload with + * high number of renderpasses, so we have to patch out the CB logic if + * CB cannot run in parallel to this renderpass. + * It does everything in IB1 because from testing the CB logic hangs in IB2. 
+ */ + + struct tu_cs_patchable_state cb_state = tu_cs_patchable_start(cs, 128); /* NOTE(review): 128 (and 64 for the second region) is presumably a size budget for the patchable region - confirm against tu_cs_patchable_start */ /* It seems that for sysmem render passes we have to use BV to clear LRZ * before the renderpass. Otherwise the clear doesn't become visible to @@ -2844,17 +2910,14 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, * * In the future, we may also support writing LRZ in BV. */ - bool concurrent_binning = false; - if (CHIP >= A7XX) { - concurrent_binning = tu7_emit_concurrent_binning(cmd, cs, false); + { + tu7_emit_concurrent_binning(cmd, cs); tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY)); - } - tu_lrz_sysmem_begin(cmd, cs); + tu_lrz_sysmem_begin(cmd, cs); /* also called in the no-CB region further down, so both variants emit it */ - if (concurrent_binning) { tu_lrz_after_bv(cmd, cs); tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); @@ -2868,6 +2931,51 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_lrz_before_sysmem_br(cmd, cs); } + tu_cs_patchable_end(cs, false, &cb_state); /* second argument is false here and true for the no-CB region; presumably it selects which region is active as recorded - TODO confirm */ + + struct tu_cs_patchable_state no_cb_state = tu_cs_patchable_start(cs, 64); + { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); /* NOTE(review): same packet that tu7_set_thread_br_patchpoint(cmd, cs, true) emits */ + tu7_set_pred_bit(cs, TU_PREDICATE_CB_ENABLED, false); + tu_lrz_sysmem_begin(cmd, cs); + } + tu_cs_patchable_end(cs, true, &no_cb_state); + + struct tu_cb_control_point enable_cb_patch = { + .type = TU_CB_CONTROL_TYPE_PATCHPOINT, + .patchpoint = cb_state.nop_header, + .patch_value = cb_state.enable_patch, + .original_value = cb_state.disable_patch, /* CB region: original is its disable value, patching writes its enable value */ + }; + util_dynarray_append(&cmd->cb_control_points, enable_cb_patch); + + struct tu_cb_control_point disable_no_cb_patch = { + .type = TU_CB_CONTROL_TYPE_PATCHPOINT, + .patchpoint = no_cb_state.nop_header, + .patch_value = no_cb_state.disable_patch, + .original_value = no_cb_state.enable_patch, /* no-CB region: the inverse, patching writes its disable value */ + }; + util_dynarray_append(&cmd->cb_control_points, disable_no_cb_patch); +} + +template +static void 
+tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + struct tu_renderpass_result *autotune_result) +{ + const struct tu_framebuffer *fb = cmd->state.framebuffer; + + if (CHIP == A6XX) { /* A6XX takes no concurrent-binning path here */ + tu_lrz_sysmem_begin(cmd, cs); + } else { + if (tu7_emit_concurrent_binning_start(cmd, cs, false)) { + tu7_emit_concurrent_binning_sysmem(cmd, cs); /* records both the CB and the no-CB variant as patchable regions */ + } else { /* CB disabled: the _start() call above already emitted the disabling packet and recorded a barrier */ + tu_lrz_sysmem_begin(cmd, cs); + } + } assert(fb->width > 0 && fb->height > 0); tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); @@ -2997,9 +3105,16 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, "xfb/prim-gen/prim-counters/vtx-stats query is running"); tu7_cb_disable_reason(!use_hw_binning, cmd, "hw binning disabled"); - if (!tu7_emit_concurrent_binning(cmd, cs, disable_cb || !use_hw_binning)) + if (!tu7_emit_concurrent_binning_start(cmd, cs, disable_cb || !use_hw_binning)) return false; + tu7_emit_concurrent_binning(cmd, cs); + + struct tu_cb_control_point cb_enabled_info = { + .type = TU_CB_CONTROL_TYPE_CB_ENABLED, + }; + util_dynarray_append(&cmd->cb_control_points, cb_enabled_info); /* marker read by resolve_cb_control_patchpoints(): patchpoints recorded before it, back to the previous barrier, receive their patch_value */ + /* We want to disable concurrent binning if BV isn't far enough ahead of * BR. The core idea is to write a timestamp in BR and BV, and compare the * BR and BV timestamps for equality. if BR is fast enough, it will write @@ -3293,7 +3408,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, /* Earlier we disabled concurrent binning to make LRZ fast-clear work * with no HW binning, now re-enable it while staying on BR. 
*/ - tu7_thread_control(cs, CP_SET_THREAD_BR); + tu7_set_thread_br_patchpoint(cmd, cs, false); } tu_lrz_before_tiles(cmd, cs, use_cb); @@ -4014,6 +4129,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints); util_dynarray_fini(&cmd_buffer->pre_chain.fdm_bin_patchpoints); util_dynarray_fini(&cmd_buffer->vis_stream_patchpoints); + util_dynarray_fini(&cmd_buffer->cb_control_points); /* NOTE(review): no matching util_dynarray_init is visible in this patch - confirm the array is initialised or zero-allocated when the command buffer is created */ util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *, bo) { @@ -4110,6 +4226,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints); util_dynarray_clear(&cmd_buffer->pre_chain.fdm_bin_patchpoints); util_dynarray_clear(&cmd_buffer->vis_stream_patchpoints); + util_dynarray_clear(&cmd_buffer->cb_control_points); /* recorded control points point into the old command stream, so they are dropped on reset */ util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *, bo) { @@ -9484,8 +9601,7 @@ tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, /* Restore original BR thread after setting BOTH */ if (CHIP >= A7XX && !cmd->state.pass) { - tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); - tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR)); + tu7_set_thread_br_patchpoint(cmd, cs, false); /* unlike the removed lines, the recorded payload now carries CONCURRENT_BIN_DISABLE until it is patched at submit */ } } TU_GENX(tu_CmdBeginConditionalRenderingEXT); @@ -9509,8 +9625,7 @@ tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) tu_cs_emit(cs, 0); if (CHIP >= A7XX && !cmd->state.pass) { - tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); - tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR)); + tu7_set_thread_br_patchpoint(cmd, cs, false); } } TU_GENX(tu_CmdEndConditionalRenderingEXT); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index d203e3cd854..52facaf615f 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -622,6 +622,19 @@ struct tu_vis_stream_patchpoint { uint32_t offset; }; +enum tu_cb_control_type { /* kinds of entries stored in tu_cmd_buffer::cb_control_points */ + TU_CB_CONTROL_TYPE_PATCHPOINT, /* a command-stream dword that may be rewritten at submit time */ + 
TU_CB_CONTROL_TYPE_BARRIER, /* patchpoints recorded before this entry are not switched to patch_value by a later CB_ENABLED */ + TU_CB_CONTROL_TYPE_CB_ENABLED, /* appended where a renderpass actually emits the concurrent binning sequence */ +}; + +struct tu_cb_control_point { + enum tu_cb_control_type type; + uint32_t *patchpoint; /* CPU pointer written through at submit time; only read for TU_CB_CONTROL_TYPE_PATCHPOINT entries */ + uint32_t patch_value; /* written when a later CB_ENABLED entry is reached with no barrier in between */ + uint32_t original_value; /* written back otherwise, except for one-time-submit command buffers */ +}; + struct tu_cmd_buffer { struct vk_command_buffer vk; @@ -642,6 +655,8 @@ struct tu_cmd_buffer struct util_dynarray vis_stream_bos; struct util_dynarray vis_stream_cs_bos; + struct util_dynarray cb_control_points; /* struct tu_cb_control_point entries, appended in recording order */ + VkCommandBufferUsageFlags usage_flags; VkQueryPipelineStatisticFlags inherited_pipeline_statistics; @@ -920,4 +935,9 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd, VkResult tu_init_bin_preamble(struct tu_device *device); +void +tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + bool force_disable_cb); /* when force_disable_cb is true no patchpoint is recorded, so the emitted packet keeps CB disabled */ + #endif /* TU_CMD_BUFFER_H */ diff --git a/src/freedreno/vulkan/tu_lrz.cc b/src/freedreno/vulkan/tu_lrz.cc index a6204d4cc4b..3be6f2e8604 100644 --- a/src/freedreno/vulkan/tu_lrz.cc +++ b/src/freedreno/vulkan/tu_lrz.cc @@ -895,7 +895,7 @@ tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu7_write_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); - tu7_thread_control(cs, CP_SET_THREAD_BR); + tu7_set_thread_br_patchpoint(cmd, cs, false); tu7_wait_onchip_timestamp(cs, TU_ONCHIP_CB_BV_TIMESTAMP); } diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc index 4f12bb6dce9..41c80e87a5d 100644 --- a/src/freedreno/vulkan/tu_query_pool.cc +++ b/src/freedreno/vulkan/tu_query_pool.cc @@ -1123,8 +1123,7 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, (1u << TU_PREDICATE_VTX_STATS_NOT_RUNNING), (1u << TU_PREDICATE_VTX_STATS_RUNNING)); if (!cmdbuf->state.pass) { - tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); - tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR)); + tu7_set_thread_br_patchpoint(cmdbuf, cs, false); } } else { tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index 
5ef98324098..a87a73f0cd4 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -296,6 +296,46 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue, return VK_SUCCESS; } +static VkResult +resolve_cb_control_patchpoints(struct tu_queue *queue, + void *submit, + struct util_dynarray *dump_cmds, + struct tu_cmd_buffer **cmd_buffers, + uint32_t cmdbuf_count) /* Walks the submitted command buffers, and each buffer's cb_control_points, from last to first and writes either patch_value or original_value through every recorded patchpoint. queue, submit and dump_cmds are not read by this function, and it always returns VK_SUCCESS. */ +{ + bool enable_cb = false; /* true while a CB_ENABLED entry has been seen later in submission order with no barrier in between */ + for (int32_t i = cmdbuf_count - 1; i >= 0; i--) { + struct tu_cmd_buffer *cmd = cmd_buffers[i]; + + /* Simultaneous cmdbufs are not expected to be used for workloads that + * benefit from CB, so instead of on-GPU patching, just treat them as CB + * barriers. + */ + if (cmd_buffers[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) { + enable_cb = false; + continue; /* none of this buffer's patchpoints are written */ + } + + bool one_time_submit = !!(cmd_buffers[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); + util_dynarray_foreach_reverse (&cmd->cb_control_points, + struct tu_cb_control_point, info) { + if (info->type == TU_CB_CONTROL_TYPE_CB_ENABLED) { + enable_cb = true; + } else if (info->type == TU_CB_CONTROL_TYPE_BARRIER) { + enable_cb = false; + } else if (enable_cb) { /* remaining type is TU_CB_CONTROL_TYPE_PATCHPOINT */ + *info->patchpoint = info->patch_value; + } else if (!one_time_submit) { /* NOTE(review): presumably a one-time-submit buffer cannot have been patched by an earlier submission, so the restore is skipped - confirm */ + *info->patchpoint = info->original_value; + } + } + } + + return VK_SUCCESS; +} + static VkResult queue_submit_sparse(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) { @@ -422,6 +462,12 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) if (result != VK_SUCCESS) goto out; + result = resolve_cb_control_patchpoints(queue, submit, &dump_cmds, + cmd_buffers, cmdbuf_count); + + if (result != VK_SUCCESS) + goto out; + if (has_trace_points) { tu_u_trace_submission_data_create( device, cmd_buffers, cmdbuf_count, &u_trace_submission_data);