diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index 07185a93b93..abd34c2e965 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -31,6 +31,7 @@ tu_entrypoints = custom_target( libtu_files = files( + 'tu_autotune.c', 'tu_clear_blit.c', 'tu_cmd_buffer.c', 'tu_cs.c', diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c new file mode 100644 index 00000000000..7ed78e1727e --- /dev/null +++ b/src/freedreno/vulkan/tu_autotune.c @@ -0,0 +1,547 @@ +/* + * Copyright © 2021 Igalia S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <vulkan/vulkan_core.h> + +#include "tu_autotune.h" +#include "tu_private.h" +#include "tu_cs.h" + +/* In Vulkan, an application may fill command buffers from many threads + * and expect no locking to occur.
We do introduce the possibility of + * locking on renderpass end; however, assuming that the application + * doesn't have a huge amount of slightly different renderpasses, + * there would be little to no contention. + * + * Other assumptions are: + * - Application does submit command buffers soon after their creation. + * + * Breaking the above may lead to some decrease in performance or + * autotuner turning itself off. + */ + +#define TU_AUTOTUNE_DEBUG_LOG 0 +/* Dump history entries on autotuner finish, + * could be used to gather data from traces. + */ +#define TU_AUTOTUNE_LOG_AT_FINISH 0 + +/* Maximum number of results kept in one renderpass history. */ +#define MAX_HISTORY_RESULTS 5 +/* A history entry is evicted once more than this many submission fences + * have been issued since it was last used. + */ +#define MAX_HISTORY_LIFETIME 128 + +/** + * Tracks results for a given renderpass key + */ +struct tu_renderpass_history { + /* 64-bit renderpass hash; also used as the hash table key */ + uint64_t key; + + /* Fence of the last submission that used this history; entries that + * have not been used for a while are deleted based on it. + */ + uint32_t last_fence; + + /** + * List of recent tu_renderpass_result's, newest first + */ + struct list_head results; + /* Number of entries in results, at most MAX_HISTORY_RESULTS */ + uint32_t num_results; + + /* Average samples_passed over results; written with p_atomic_set() + * and read with p_atomic_read(). + */ + uint32_t avg_samples; +}; + +/* Holds per-submission cs which writes the fence.
*/ +struct tu_submission_fence_cs { + struct list_head node; + struct tu_cs cs; + uint32_t fence; +}; + +/* Feed one lvalue into the XXH64 state. Expands to a single expression + * (no trailing semicolon) so that it behaves like a function call. + */ +#define APPEND_TO_HASH(state, field) \ + XXH64_update(state, &(field), sizeof(field)) + +/* Hash the properties of a renderpass instance (framebuffer dimensions, + * attachment descriptions and views, per-subpass sample/attachment counts) + * into the 64-bit key used to look up its history. + */ +static uint64_t +hash_renderpass_instance(const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_cmd_buffer *cmd) { + XXH64_state_t hash_state; + XXH64_reset(&hash_state, 0); + + APPEND_TO_HASH(&hash_state, framebuffer->width); + APPEND_TO_HASH(&hash_state, framebuffer->height); + APPEND_TO_HASH(&hash_state, framebuffer->layers); + + APPEND_TO_HASH(&hash_state, pass->attachment_count); + /* NOTE(review): this hashes the raw bytes of the attachment structs, + * padding included - confirm they are zero-initialized so that equal + * passes always produce equal keys. + */ + XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0])); + + for (unsigned i = 0; i < pass->attachment_count; i++) { + APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width); + APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height); + APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk_format); + APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->layer_count); + APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->level_count); + } + + APPEND_TO_HASH(&hash_state, pass->subpass_count); + for (unsigned i = 0; i < pass->subpass_count; i++) { + APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples); + APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count); + APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count); + APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count); + } + + return XXH64_digest(&hash_state); +} + +/* ralloc destructor of a history: frees every result still linked on it. */ +static void +history_destructor(void *h) +{ + struct tu_renderpass_history *history = h; + + list_for_each_entry_safe(struct tu_renderpass_result, result, + &history->results, node) { + ralloc_free(result); + } +} + +/* ralloc destructor of a result: unlinks it from the list it is on. */ +static void +result_destructor(void *r) +{ + struct tu_renderpass_result *result = r; + + /* Just in case we manage to somehow still be on the pending_results list: */ + list_del(&result->node); +} + +static
bool +get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples) +{ + /* Returns true and stores the averaged sample count in *avg_samples when + * rp_key has a history holding at least one result. + */ + bool has_history = false; + + /* If lock contention is ever found in the wild, + * we could use try_lock here. + */ + u_rwlock_rdlock(&at->ht_lock); + struct hash_entry *entry = + _mesa_hash_table_search(at->ht, &rp_key); + if (entry) { + struct tu_renderpass_history *history = entry->data; + if (history->num_results > 0) { + *avg_samples = p_atomic_read(&history->avg_samples); + has_history = true; + } + } + u_rwlock_rdunlock(&at->ht_lock); + + return has_history; +} + +/* Allocate a result for rp_key and give it the next result index. */ +static struct tu_renderpass_result * +create_history_result(struct tu_autotune *at, uint64_t rp_key) +{ + struct tu_renderpass_result *result = rzalloc_size(NULL, sizeof(*result)); + + result->idx = p_atomic_inc_return(&at->idx_counter); + result->rp_key = rp_key; + + ralloc_set_destructor(result, result_destructor); + + return result; +} + +/* Move result to the head of history->results, drop the oldest result once + * the history is full, and refresh the cached average. + */ +static void +history_add_result(struct tu_renderpass_history *history, + struct tu_renderpass_result *result) +{ + list_delinit(&result->node); + list_add(&result->node, &history->results); + + if (history->num_results < MAX_HISTORY_RESULTS) { + history->num_results++; + } else { + /* Once above the limit, start popping old results off the + * tail of the list: + */ + struct tu_renderpass_result *old_result = + list_last_entry(&history->results, struct tu_renderpass_result, node); + list_delinit(&old_result->node); + ralloc_free(old_result); + } + + /* Do calculations here to avoid locking history in tu_autotune_use_bypass. + * Accumulate in 64 bits: samples_passed is 64-bit and a 32-bit sum of + * several large renderpasses can wrap. + */ + uint64_t total_samples = 0; + list_for_each_entry(struct tu_renderpass_result, entry, + &history->results, node) { + total_samples += entry->samples_passed; + } + + /* num_results is at least 1 here since a result has just been added. + * Clamp instead of truncating when the average does not fit in 32 bits. + */ + uint64_t avg_samples = total_samples / history->num_results; + p_atomic_set(&history->avg_samples, + (uint32_t)MIN2(avg_samples, (uint64_t)UINT32_MAX)); +} + +static void +process_results(struct tu_autotune *at) +{ + uint32_t current_fence = at->results->fence; + + uint32_t min_idx = ~0; + uint32_t
max_idx = 0; + + list_for_each_entry_safe(struct tu_renderpass_result, result, + &at->pending_results, node) { + /* Wrap-safe form of "result->fence > current_fence": the fence is a + * free-running 32-bit counter, so compare the signed difference. + */ + if ((int32_t)(current_fence - result->fence) < 0) + break; + + struct tu_renderpass_history *history = result->history; + + min_idx = MIN2(min_idx, result->idx); + max_idx = MAX2(max_idx, result->idx); + uint32_t idx = result->idx % ARRAY_SIZE(at->results->result); + + result->samples_passed = at->results->result[idx].samples_end - + at->results->result[idx].samples_start; + + history_add_result(history, result); + } + + list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs, + &at->pending_submission_cs, node) { + /* Same wrap-safe comparison as above */ + if ((int32_t)(current_fence - submission_cs->fence) < 0) + break; + + list_del(&submission_cs->node); + tu_cs_finish(&submission_cs->cs); + free(submission_cs); + } + + if (max_idx - min_idx > TU_AUTOTUNE_MAX_RESULTS) { + /* If results start to trample each other it's better to bail out */ + at->enabled = false; + mesa_logw("disabling sysmem vs gmem autotuner because results " + "are trampling each other: min_idx=%u, max_idx=%u", + min_idx, max_idx); + } +} + +/* Build the per-submission CS that makes the GPU write at->fence_counter to + * the results buffer, and track it on pending_submission_cs so that + * process_results() can destroy it once that fence has been reached. + * The returned CS stays owned by the autotuner. + */ +static struct tu_cs * +create_fence_cs(struct tu_device *dev, struct tu_autotune *at) +{ + /* NOTE(review): the calloc() result is not checked before use. */ + struct tu_submission_fence_cs *submission_cs = + calloc(1, sizeof(struct tu_submission_fence_cs)); + submission_cs->fence = at->fence_counter; + + tu_cs_init(&submission_cs->cs, dev, TU_CS_MODE_GROW, 5); + tu_cs_begin(&submission_cs->cs); + + tu_cs_emit_pkt7(&submission_cs->cs, CP_EVENT_WRITE, 4); + tu_cs_emit(&submission_cs->cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); + tu_cs_emit_qw(&submission_cs->cs, autotune_results_ptr(at, fence)); + tu_cs_emit(&submission_cs->cs, at->fence_counter); + + tu_cs_end(&submission_cs->cs); + + list_addtail(&submission_cs->node, &at->pending_submission_cs); + + return &submission_cs->cs; +} + +struct tu_cs * +tu_autotune_on_submit(struct tu_device *dev, + struct tu_autotune *at, + struct tu_cmd_buffer **cmd_buffers, + uint32_t cmd_buffer_count) +{ + /* We are single-threaded
here */ + + /* Snapshot the GPU fence before processing results: process_results() + * re-reads it and consumes every pending result up to that value, so a + * history whose last_fence is not after gpu_fence has no pending + * results pointing at it once process_results() returns. + */ + uint32_t gpu_fence = at->results->fence; + + process_results(at); + + /* pre-increment so zero isn't valid fence */ + uint32_t new_fence = ++at->fence_counter; + + /* Create history entries here to minimize work and locking being + * done on renderpass end. + */ + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; + list_for_each_entry_safe(struct tu_renderpass_result, result, + &cmdbuf->renderpass_autotune_results, node) { + struct tu_renderpass_history *history; + struct hash_entry *entry = + _mesa_hash_table_search(at->ht, &result->rp_key); + if (!entry) { + history = rzalloc_size(NULL, sizeof(*history)); + ralloc_set_destructor(history, history_destructor); + history->key = result->rp_key; + list_inithead(&history->results); + + u_rwlock_wrlock(&at->ht_lock); + _mesa_hash_table_insert(at->ht, &history->key, history); + u_rwlock_wrunlock(&at->ht_lock); + } else { + history = (struct tu_renderpass_history *) entry->data; + } + + history->last_fence = new_fence; + + result->fence = new_fence; + result->history = history; + } + + if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) { + list_splicetail(&cmdbuf->renderpass_autotune_results, + &at->pending_results); + list_inithead(&cmdbuf->renderpass_autotune_results); + } + } + +#if TU_AUTOTUNE_DEBUG_LOG != 0 + mesa_logi("Total history entries: %u", at->ht->entries); +#endif + + /* Cleanup old entries from history table. The assumption + * here is that application doesn't hold many old unsubmitted + * command buffers, otherwise this table may grow big.
+ * + * Never evict a history whose last submission the GPU has not finished + * yet: results still on pending_results keep a pointer to it and would + * otherwise be processed against freed memory. + */ + hash_table_foreach(at->ht, entry) { + struct tu_renderpass_history *history = entry->data; + if (history->last_fence == 0 || + (new_fence - history->last_fence) <= MAX_HISTORY_LIFETIME || + (int32_t)(gpu_fence - history->last_fence) < 0) + continue; + +#if TU_AUTOTUNE_DEBUG_LOG != 0 + mesa_logi("Removed old history entry %016"PRIx64"", history->key); +#endif + + u_rwlock_wrlock(&at->ht_lock); + _mesa_hash_table_remove_key(at->ht, &history->key); + u_rwlock_wrunlock(&at->ht_lock); + + ralloc_free(history); + } + + return create_fence_cs(dev, at); +} + +static bool +renderpass_key_equals(const void *_a, const void *_b) +{ + return *(uint64_t *)_a == *(uint64_t *)_b; +} + +static uint32_t +renderpass_key_hash(const void *_a) +{ + return *((uint64_t *) _a) & 0xffffffff; +} + +/* Set up the history table, its lock and the mapped results BO. On failure + * everything created here is released again and the error is returned. + */ +VkResult +tu_autotune_init(struct tu_autotune *at, struct tu_device *dev) +{ + VkResult result; + + at->enabled = true; + at->ht = _mesa_hash_table_create(NULL, + renderpass_key_hash, + renderpass_key_equals); + u_rwlock_init(&at->ht_lock); + + at->results_bo = malloc(sizeof(struct tu_bo)); + if (!at->results_bo) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + vk_startup_errorf(dev->instance, result, "Autotune BO alloc"); + /* free(NULL) in the fail_bo path is a no-op */ + goto fail_bo; + } + result = tu_bo_init_new(dev, at->results_bo, + sizeof(struct tu_autotune_results), + TU_BO_ALLOC_NO_FLAGS); + if (result != VK_SUCCESS) { + vk_startup_errorf(dev->instance, result, "Autotune BO init"); + goto fail_bo; + } + + result = tu_bo_map(dev, at->results_bo); + + if (result != VK_SUCCESS) { + vk_startup_errorf(dev->instance, result, "Autotune BO map"); + goto fail_map_bo; + } + + at->results = at->results_bo->map; + + list_inithead(&at->pending_results); + list_inithead(&at->pending_submission_cs); + + return VK_SUCCESS; + +fail_map_bo: + tu_bo_finish(dev, at->results_bo); + +fail_bo: + free(at->results_bo); + u_rwlock_destroy(&at->ht_lock); + _mesa_hash_table_destroy(at->ht, NULL); + + return result; +} + +void +tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev) +{ +#if TU_AUTOTUNE_LOG_AT_FINISH != 0 + while (!list_is_empty(&at->pending_results)) { + process_results(at); + } + + hash_table_foreach(at->ht, entry) { + struct
tu_renderpass_history *history = entry->data; + + mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u", + history->key, history->avg_samples, history->num_results); + } +#endif + + tu_autotune_free_results(&at->pending_results); + + hash_table_foreach(at->ht, entry) { + struct tu_renderpass_history *history = entry->data; + ralloc_free(history); + } + + list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs, + &at->pending_submission_cs, node) { + tu_cs_finish(&submission_cs->cs); + free(submission_cs); + } + + _mesa_hash_table_destroy(at->ht, NULL); + u_rwlock_destroy(&at->ht_lock); + tu_bo_finish(dev, at->results_bo); + free(at->results_bo); +} + +bool +tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, + uint32_t cmd_buffer_count) +{ + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; + if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) + return true; + } + + return false; +} + +void +tu_autotune_free_results(struct list_head *results) +{ + list_for_each_entry_safe(struct tu_renderpass_result, result, + results, node) { + ralloc_free(result); + } +} + +static bool +fallback_use_bypass(const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_cmd_buffer *cmd_buffer) +{ + if (cmd_buffer->state.drawcall_count > 5) + return false; + + for (unsigned i = 0; i < pass->subpass_count; i++) { + if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT) + return false; + } + + return true; +} + +bool +tu_autotune_use_bypass(struct tu_autotune *at, + struct tu_cmd_buffer *cmd_buffer, + struct tu_renderpass_result **autotune_result) +{ + const struct tu_render_pass *pass = cmd_buffer->state.pass; + const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer; + + /* If we would want to support buffers that could be submitted + * several times we would have to copy the sample counts of renderpasses + * after each submission of such buffer 
(like with u_trace support). + * This is rather messy and since almost all apps use ONE_TIME_SUBMIT + * we choose to unconditionally use fallback. + */ + bool one_time_submit = cmd_buffer->usage_flags & + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + if (!at->enabled || !one_time_submit) + return fallback_use_bypass(pass, framebuffer, cmd_buffer); + + /* We use 64bit hash as a key since we don't fear rare hash collision, + * the worst that would happen is sysmem being selected when it should + * have not, and with 64bit it would be extremely rare. + * + * Q: Why not make the key from framebuffer + renderpass pointers? + * A: At least DXVK creates new framebuffers each frame while keeping + * renderpasses the same. Also we want to support replaying a single + * frame in a loop for testing. + */ + uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer); + + *autotune_result = create_history_result(at, renderpass_key); + + uint32_t avg_samples = 0; + if (get_history(at, renderpass_key, &avg_samples)) { + /* TODO we should account for load/stores/clears/resolves especially + * with low drawcall count and ~fb_size samples passed, in D3D11 games + * we are seeing many renderpasses like: + * - color attachment load + * - single fullscreen draw + * - color attachment store + */ + + /* Low sample count could mean there was only a clear.. or there was + * a clear plus draws that touch no or few samples + */ + if (avg_samples < 500) { +#if TU_AUTOTUNE_DEBUG_LOG != 0 + mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem", + renderpass_key, cmd_buffer->state.drawcall_count, avg_samples); +#endif + return true; + } + + /* Cost-per-sample is an estimate for the average number of reads+ + * writes for a given passed sample. 
+ */ + float sample_cost = cmd_buffer->state.total_drawcalls_cost; + + /* A renderpass without draws has no per-draw cost to average; dividing + * by zero would turn both estimates into NaN/inf and make the + * comparison below silently pick gmem. Use the draw-count heuristic + * instead. + */ + if (cmd_buffer->state.drawcall_count == 0) + return fallback_use_bypass(pass, framebuffer, cmd_buffer); + + sample_cost /= cmd_buffer->state.drawcall_count; + + float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count; + + bool select_sysmem = single_draw_cost < 6000.0; + +#if TU_AUTOTUNE_DEBUG_LOG != 0 + mesa_logi("%016"PRIx64":%u\t avg_samples=%u, " + "sample_cost=%f, single_draw_cost=%f selecting %s", + renderpass_key, cmd_buffer->state.drawcall_count, avg_samples, + sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem"); +#endif + + return select_sysmem; + } + + return fallback_use_bypass(pass, framebuffer, cmd_buffer); +} diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h new file mode 100644 index 00000000000..3344b3e854d --- /dev/null +++ b/src/freedreno/vulkan/tu_autotune.h @@ -0,0 +1,187 @@ +/* + * Copyright © 2021 Igalia S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TU_AUTOTUNE_H +#define TU_AUTOTUNE_H + +#include "util/hash_table.h" +#include "util/list.h" +#include "util/rwlock.h" + +/* Byte offset of ptr from base. */ +#define autotune_offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) +/* GPU address (iova) of a member of the tu_autotune_results buffer of at. + * Every use of the at argument is parenthesized so that any pointer + * expression can be passed. + */ +#define autotune_results_ptr(at, member) \ + ((at)->results_bo->iova + \ + autotune_offset((at)->results, &(at)->results->member)) + +struct tu_device; +struct tu_cmd_buffer; + +struct tu_autotune_results; +struct tu_renderpass_history; + +/** + * "autotune" our decisions about bypass vs GMEM rendering, based on historical + * data about a given render target. + * + * In deciding which path to take there are tradeoffs, including some that + * are not reasonably estimateable without having some additional information: + * + * (1) If you know you are touching every pixel (ie. there is a clear), + * then the GMEM path will at least not cost more memory bandwidth than + * sysmem[1] + * + * (2) If there is no clear, GMEM could potentially cost *more* bandwidth + * if there is sysmem->GMEM restore pass. + * + * (3) If you see a high draw count, that is an indication that there will be + * enough pixels accessed multiple times to benefit from the reduced + * memory bandwidth that GMEM brings + * + * (4) But high draw count where there is not much overdraw can actually be + * faster in bypass mode if it is pushing a lot of state change, due to + * not having to go thru the state changes per-tile[1] + * + * The approach taken is to measure the samples-passed for the batch to estimate + * the amount of overdraw to detect cases where the number of pixels touched is + * low.
+ * + * [1] ignoring early-tile-exit optimizations, but any draw that touches all/ + * most of the tiles late in the tile-pass can defeat that + */ +struct tu_autotune { + + /* We may have to disable autotuner if there are too many + * renderpasses in-flight. + */ + bool enabled; + + /** + * Cache to map renderpass key to historical information about + * rendering to that particular render target. + */ + struct hash_table *ht; + /* Taken for reading when looking up a history on renderpass end and + * for writing when the submit path inserts or removes entries of ht. + */ + struct u_rwlock ht_lock; + + /** + * GPU buffer used to communicate back results to the CPU + */ + struct tu_bo *results_bo; + /* CPU mapping of results_bo */ + struct tu_autotune_results *results; + + /** + * List of per-renderpass results that we are waiting for the GPU + * to finish with before reading back the results. + */ + struct list_head pending_results; + + /** + * List of per-submission CS that we are waiting for the GPU + * to finish using. + */ + struct list_head pending_submission_cs; + + /* Last fence value handed out; pre-incremented for every submission + * that carries autotune results. + */ + uint32_t fence_counter; + /* Atomically incremented source of tu_renderpass_result::idx */ + uint32_t idx_counter; +}; + +/* Number of slots in tu_autotune_results::result[] */ +#define TU_AUTOTUNE_MAX_RESULTS 256 + +/** + * The layout of the memory used to read back per-batch results from the + * GPU + * + * The hw requires the sample start/stop locations to be 128-bit (16-byte) + * aligned, which is what the __pad fields below provide. + * + * NOTE(review): the original note also said that this struct is + * intentionally aligned to 4k; nothing in this declaration enforces that - + * confirm whether it relies on the alignment of the BO allocation. + */ +struct tu_autotune_results { + + /** + * The GPU writes back a "fence" seqno value from the cmdstream after + * it finishes the submission, so that the CPU knows when + * results are valid. + */ + uint32_t fence; + + uint32_t __pad0; + uint64_t __pad1; + + /** + * From the cmdstream, the captured samples-passed values are recorded + * at the start and end of the batch. + * + * Note that we do the math on the CPU to avoid a WFI. But pre-emption + * may force us to revisit that. + */ + struct { + uint64_t samples_start; + uint64_t __pad0; + uint64_t samples_end; + uint64_t __pad1; + } result[TU_AUTOTUNE_MAX_RESULTS]; +}; + +/** + * Tracks the results from an individual renderpass. Initially created + * per renderpass and queued on the command buffer; on submit it is moved to + * the tail of at->pending_results.
At a later + * time, when the GPU has finished writing the results, we fill samples_passed. + */ +struct tu_renderpass_result { + + /** + * Index assigned at creation from tu_autotune::idx_counter. The slot in + * tu_autotune_results::result[] that the start/end counters are written + * to is idx % TU_AUTOTUNE_MAX_RESULTS. + */ + unsigned idx; + + /* + * Below here, only used internally within autotune + */ + /* Renderpass key (hash) this result was recorded for */ + uint64_t rp_key; + /* History the result is added to; assigned in tu_autotune_on_submit() */ + struct tu_renderpass_history *history; + /* Links the result into the list that currently holds it */ + struct list_head node; + /* Fence of the submission; assigned in tu_autotune_on_submit() */ + uint32_t fence; + /* samples_end - samples_start, filled in once the fence is reached */ + uint64_t samples_passed; +}; + +VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev); +void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev); + +/** + * A magic 8-ball that tells the gmem code whether we should do bypass + * (sysmem) mode for this renderpass: returns true for sysmem. + * + * When the autotuner is used for the renderpass, *autotune_result receives a + * newly created result; otherwise *autotune_result is left untouched. + */ +bool tu_autotune_use_bypass(struct tu_autotune *at, + struct tu_cmd_buffer *cmd_buffer, + struct tu_renderpass_result **autotune_result); +/* Frees every result on the given list. */ +void tu_autotune_free_results(struct list_head *results); + +/* Returns true if any of the command buffers has recorded autotune results, + * i.e. the submission needs the fence CS from tu_autotune_on_submit(). + */ +bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, + uint32_t cmd_buffer_count); + +/** + * To be called on queue submission: consumes the results the GPU has + * finished, assigns a new fence and a history to every result recorded in + * cmd_buffers and moves them to at->pending_results, removes old history + * entries, and returns the CS which writes the new fence. The returned CS + * remains tracked by the autotuner on at->pending_submission_cs. + */ +struct tu_cs *tu_autotune_on_submit(struct tu_device *dev, + struct tu_autotune *at, + struct tu_cmd_buffer **cmd_buffers, + uint32_t cmd_buffer_count); + + +#endif /* TU_AUTOTUNE_H */ diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index bd067d3af16..d9394bcf872 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -592,7 +592,8 @@ use_hw_binning(struct tu_cmd_buffer *cmd) } static bool -use_sysmem_rendering(struct tu_cmd_buffer *cmd) +use_sysmem_rendering(struct tu_cmd_buffer *cmd, + struct tu_renderpass_result **autotune_result) { if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM)) return true; @@ -615,7 +616,13 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd) if (cmd->state.disable_gmem) return true; - return false; + bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune, + cmd, autotune_result); + if (*autotune_result) { +
list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results); + } + + return use_sysmem; } static void @@ -1210,7 +1217,50 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd, } static void -tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_autotune_begin(struct tu_cs *cs, struct tu_autotune *at, + const struct tu_renderpass_result *autotune_result) +{ + if (!autotune_result) + return; + + uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS; + uint64_t begin_iova = autotune_results_ptr(at, result[result_idx].samples_start); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); +} + +static void +tu6_autotune_end(struct tu_cs *cs, struct tu_autotune *at, + const struct tu_renderpass_result *autotune_result) +{ + if (!autotune_result) + return; + + uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS; + uint64_t end_iova = autotune_results_ptr(at, result[result_idx].samples_end); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + + /* A fence would be emitted at the submission time */ +} + +static void +tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + const struct tu_renderpass_result *autotune_result) { const struct tu_framebuffer *fb = cmd->state.framebuffer; @@ -1240,12 +1290,17 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); tu_cs_emit(cs, 0x0); + tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result); + tu_cs_sanity_check(cs); } static void -tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs 
*cs, + const struct tu_renderpass_result *autotune_result) { + tu6_autotune_end(cs, &cmd->device->autotune, autotune_result); + /* Do any resolves of the last subpass. These are handled in the * tile_store_cs in the gmem path. */ @@ -1262,7 +1317,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } static void -tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + const struct tu_renderpass_result *autotune_result) { struct tu_physical_device *phys_dev = cmd->device->physical_device; @@ -1312,6 +1368,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); } + tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result); + tu_cs_sanity_check(cs); } @@ -1340,8 +1398,11 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } static void -tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + const struct tu_renderpass_result *autotune_result) { + tu6_autotune_end(cs, &cmd->device->autotune, autotune_result); + tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); tu_cs_emit_regs(cs, @@ -1355,11 +1416,12 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } static void -tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) +tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, + const struct tu_renderpass_result *autotune_result) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - tu6_tile_render_begin(cmd, &cmd->cs); + tu6_tile_render_begin(cmd, &cmd->cs, autotune_result); uint32_t pipe = 0; for (uint32_t py = 0; py < fb->pipe_count.height; py++) { @@ -1381,7 +1443,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) } } - tu6_tile_render_end(cmd, &cmd->cs); + tu6_tile_render_end(cmd, &cmd->cs, autotune_result); trace_end_render_pass(&cmd->trace, &cmd->cs, fb); @@ -1391,9 +1453,10 @@ 
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) } static void -tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd) +tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, + const struct tu_renderpass_result *autotune_result) { - tu6_sysmem_render_begin(cmd, &cmd->cs); + tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs); @@ -1401,7 +1464,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd) trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); - tu6_sysmem_render_end(cmd, &cmd->cs); + tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer); } @@ -1442,7 +1505,9 @@ tu_create_cmd_buffer(struct tu_device *device, cmd_buffer->queue_family_index = TU_QUEUE_GENERAL; } + u_trace_init(&cmd_buffer->trace, &device->trace_context); + list_inithead(&cmd_buffer->renderpass_autotune_results); tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096); @@ -1468,6 +1533,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) u_trace_fini(&cmd_buffer->trace); + tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results); + for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { if (cmd_buffer->descriptors[i].push_set.layout) tu_descriptor_set_layout_unref(cmd_buffer->device, @@ -1492,6 +1559,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) tu_cs_reset(&cmd_buffer->draw_epilogue_cs); tu_cs_reset(&cmd_buffer->sub_cs); + tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results); + for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); if (cmd_buffer->descriptors[i].push_set.layout) @@ -3818,6 +3887,15 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, { const struct tu_pipeline *pipeline = cmd->state.pipeline; + /* Fill draw stats for autotuner */ + cmd->state.drawcall_count++; + + cmd->state.total_drawcalls_cost += 
cmd->state.pipeline->drawcall_base_cost; + if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE) + cmd->state.total_drawcalls_cost++; + if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) + cmd->state.total_drawcalls_cost++; + tu_emit_cache_flush_renderpass(cmd, cs); bool primitive_restart_enabled = pipeline->ia.primitive_restart; @@ -4584,10 +4662,11 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); - if (use_sysmem_rendering(cmd_buffer)) - tu_cmd_render_sysmem(cmd_buffer); + struct tu_renderpass_result *autotune_result = NULL; + if (use_sysmem_rendering(cmd_buffer, &autotune_result)) + tu_cmd_render_sysmem(cmd_buffer, autotune_result); else - tu_cmd_render_tiles(cmd_buffer); + tu_cmd_render_tiles(cmd_buffer, autotune_result); /* Outside of renderpasses we assume all draw states are disabled. We do * this outside the draw CS for the normal case where 3d gmem stores aren't @@ -4617,6 +4696,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, cmd_buffer->state.has_tess = false; cmd_buffer->state.has_subpass_predication = false; cmd_buffer->state.disable_gmem = false; + cmd_buffer->state.drawcall_count = 0; + cmd_buffer->state.total_drawcalls_cost = 0; /* LRZ is not valid next time we use it */ cmd_buffer->state.lrz.valid = false; diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 7f7656fa58b..5f60a4a65b1 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1810,6 +1810,11 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->mem_cache = tu_pipeline_cache_from_handle(pc); + result = tu_autotune_init(&device->autotune, device); + if (result != VK_SUCCESS) { + goto fail_timeline_cond; + } + for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain); @@ -1891,6 +1896,8 @@ tu_DestroyDevice(VkDevice _device, const 
VkAllocationCallbacks *pAllocator) free(device->perfcntrs_pass_cs); } + tu_autotune_fini(&device->autotune, device); + pthread_cond_destroy(&device->timeline_cond); vk_free(&device->vk.alloc, device->bo_list); vk_free(&device->vk.alloc, device->bo_idx); diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c index 93307300a3c..150cbe97612 100644 --- a/src/freedreno/vulkan/tu_drm.c +++ b/src/freedreno/vulkan/tu_drm.c @@ -53,6 +53,8 @@ struct tu_queue_submit uint32_t nr_out_syncobjs; uint32_t entry_count; uint32_t perf_pass_index; + + bool autotune_fence; }; struct tu_u_trace_syncobj @@ -746,8 +748,14 @@ tu_queue_submit_create_locked(struct tu_queue *queue, } } + memset(new_submit, 0, sizeof(struct tu_queue_submit)); + new_submit->autotune_fence = + tu_autotune_submit_requires_fence(cmd_buffers, vk_submit->command_buffer_count); + if (new_submit->autotune_fence) + entry_count++; + new_submit->cmds = vk_zalloc(&queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); @@ -818,9 +826,26 @@ tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit) } static void -tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, - struct tu_queue_submit *submit) +tu_fill_msm_gem_submit(struct tu_device *dev, + struct drm_msm_gem_submit_cmd *cmd, + struct tu_cs_entry *cs_entry) { + cmd->type = MSM_SUBMIT_CMD_BUF; + cmd->submit_idx = + dev->bo_idx[cs_entry->bo->gem_handle]; + cmd->submit_offset = cs_entry->offset; + cmd->size = cs_entry->size; + cmd->pad = 0; + cmd->nr_relocs = 0; + cmd->relocs = 0; +} + +static void +tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, + struct tu_queue_submit *submit, + struct tu_cs *autotune_cs) +{ + struct tu_device *dev = queue->device; struct drm_msm_gem_submit_cmd *cmds = submit->cmds; struct vk_command_buffer **vk_cmd_buffers = submit->vk_submit->command_buffers; @@ -836,45 +861,27 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, 
struct tu_cs_entry *perf_cs_entry = &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index]; - cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; - cmds[entry_idx].submit_idx = - dev->bo_idx[perf_cs_entry->bo->gem_handle]; - cmds[entry_idx].submit_offset = perf_cs_entry->offset; - cmds[entry_idx].size = perf_cs_entry->size; - cmds[entry_idx].pad = 0; - cmds[entry_idx].nr_relocs = 0; - cmds[entry_idx++].relocs = 0; + tu_fill_msm_gem_submit(dev, &cmds[entry_idx++], perf_cs_entry); } for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) { - cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; - cmds[entry_idx].submit_idx = - dev->bo_idx[cs->entries[i].bo->gem_handle]; - cmds[entry_idx].submit_offset = cs->entries[i].offset; - cmds[entry_idx].size = cs->entries[i].size; - cmds[entry_idx].pad = 0; - cmds[entry_idx].nr_relocs = 0; - cmds[entry_idx].relocs = 0; + tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]); } if (submit->u_trace_submission_data) { struct tu_cs *ts_cs = submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs; if (ts_cs) { - cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; - cmds[entry_idx].submit_idx = - queue->device->bo_idx[ts_cs->entries[0].bo->gem_handle]; - - assert(cmds[entry_idx].submit_idx < queue->device->bo_count); - - cmds[entry_idx].submit_offset = ts_cs->entries[0].offset; - cmds[entry_idx].size = ts_cs->entries[0].size; - cmds[entry_idx].pad = 0; - cmds[entry_idx].nr_relocs = 0; - cmds[entry_idx++].relocs = 0; + tu_fill_msm_gem_submit(dev, &cmds[entry_idx++], &ts_cs->entries[0]); } } } + + if (autotune_cs) { + assert(autotune_cs->entry_count == 1); + tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]); + entry_idx++; + } } static VkResult @@ -882,6 +889,15 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) { queue->device->submit_count++; + struct tu_cs *autotune_cs = NULL; + if (submit->autotune_fence) { + struct tu_cmd_buffer **cmd_buffers = (void 
*)submit->vk_submit->command_buffers; + autotune_cs = tu_autotune_on_submit(queue->device, + &queue->device->autotune, + cmd_buffers, + submit->vk_submit->command_buffer_count); + } + uint32_t flags = MSM_PIPE_3D0; if (submit->vk_submit->wait_count) @@ -896,7 +912,7 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) * time when bo_mutex is not locked. So we build submit cmds here the real * place to submit. */ - tu_queue_build_msm_gem_submit_cmds(queue, submit); + tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs); struct drm_msm_gem_submit req = { .flags = flags, diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c index e93e04d8a6a..861c55d6e02 100644 --- a/src/freedreno/vulkan/tu_kgsl.c +++ b/src/freedreno/vulkan/tu_kgsl.c @@ -358,6 +358,10 @@ tu_QueueSubmit(VkQueue _queue, entry_count++; } + struct tu_cmd_buffer **cmd_buffers = (void *)submit->pCommandBuffers; + if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferCount)) + entry_count++; + max_entry_count = MAX2(max_entry_count, entry_count); } @@ -404,6 +408,22 @@ tu_QueueSubmit(VkQueue _queue, } } + struct tu_cmd_buffer **cmd_buffers = (void *)submit->pCommandBuffers; + if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferCount)) { + struct tu_cs *autotune_cs = + tu_autotune_on_submit(queue->device, + &queue->device->autotune, + cmd_buffers, + submit->commandBufferCount); + cmds[entry_idx++] = (struct kgsl_command_object) { + .offset = autotune_cs->entries[0].offset, + .gpuaddr = autotune_cs->entries[0].bo->iova, + .size = autotune_cs->entries[0].size, + .flags = KGSL_CMDLIST_IB, + .id = autotune_cs->entries[0].bo->gem_handle, + }; + } + struct tu_syncobj s = sync_merge(submit->pWaitSemaphores, submit->waitSemaphoreCount, true, true); diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index a0e18991596..b52ba8cdbd7 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ 
b/src/freedreno/vulkan/tu_pipeline.c @@ -1576,6 +1576,9 @@ tu6_emit_fs_outputs(struct tu_cs *cs, (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) { pipeline->lrz.force_late_z = true; } + + pipeline->drawcall_base_cost += + util_bitcount(fs_render_components) / util_bitcount(0xf); } } @@ -3121,6 +3124,10 @@ tu_pipeline_builder_parse_multisample_and_color_blend( if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) { pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; } + + if (blendAttachment.blendEnable) { + pipeline->drawcall_base_cost++; + } } } diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 70d74066d37..d0bf7072a36 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -77,6 +77,7 @@ #include "perfcntrs/freedreno_perfcntr.h" #include "tu_descriptor_set.h" +#include "tu_autotune.h" #include "tu_util.h" #include "tu_perfetto.h" @@ -462,6 +463,8 @@ struct tu_device pthread_cond_t timeline_cond; pthread_mutex_t submit_mutex; + struct tu_autotune autotune; + #ifdef ANDROID const void *gralloc; enum { @@ -1063,6 +1066,35 @@ struct tu_cmd_state bool disable_gmem; enum a5xx_line_mode line_mode; + uint32_t drawcall_count; + + /* A calculated "draw cost" value for renderpass, which tries to + * estimate the bandwidth-per-sample of all the draws according + * to: + * + * foreach_draw (...) { + * cost += num_frag_outputs; + * if (blend_enabled) + * cost += num_blend_enabled; + * if (depth_test_enabled) + * cost++; + * if (depth_write_enabled) + * cost++; + * } + * + * The idea is that each sample-passed minimally does one write + * per MRT. If blend is enabled, the hw will additionally do + * a framebuffer read per sample-passed (for each MRT with blend + * enabled). If depth-test is enabled, the hw will additionally + * do a depth buffer read. 
If depth-write is enabled, the hw will + * additionally do a depth buffer write. + * + * This does ignore depth buffer traffic for samples which do not + * pass due to depth-test failure, and some other details. But it is + * just intended to be a rough estimate that is easy to calculate. + */ + uint32_t total_drawcalls_cost; + struct tu_lrz_state lrz; struct tu_draw_state depth_plane_state; @@ -1102,6 +1134,8 @@ struct tu_cmd_buffer struct u_trace_iterator trace_renderpass_start; struct u_trace_iterator trace_renderpass_end; + struct list_head renderpass_autotune_results; + VkCommandBufferUsageFlags usage_flags; VkCommandBufferLevel level; enum tu_cmd_buffer_status status; @@ -1300,6 +1334,9 @@ struct tu_pipeline struct tu_lrz_pipeline lrz; + /* Base drawcall cost for sysmem vs gmem autotuner */ + uint8_t drawcall_base_cost; + void *executables_mem_ctx; /* tu_pipeline_executable */ struct util_dynarray executables;