diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index f57db5ff18d..3bfbb348bbb 100644
--- a/docs/drivers/freedreno.rst
+++ b/docs/drivers/freedreno.rst
@@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
 Some of these options will behave differently when toggled at runtime, for
 example: ``nolrz`` will still result in LRZ allocation which would not happen
 if the option was set in the environment variable.
+
+Autotune
+^^^^^^^^
+
+Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
+autotune system, the behavior of which can be controlled with the following
+environment variables:
+
+.. envvar:: TU_AUTOTUNE_ALGO
+
+  Selects the algorithm used for autotuning. Supported values are:
+
+  ``bandwidth``
+    Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
+    the one with the lower estimated bandwidth.
+
+  ``profiled``
+    Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that to
+    move a probability distribution towards the optimal choice over time. This
+    algorithm tends to be far more accurate than the bandwidth algorithm at choosing
+    the optimal rendering mode, but may result in larger FPS variance due to being
+    based on a probability distribution with random sampling. This is the default
+    algorithm.
+
+  ``profiled_imm``
+    Similar to ``profiled``, but only profiles the first few instances of an RP
+    and then sticks to the chosen mode for subsequent instances. This is meant
+    for single-frame traces run multiple times in CI, where this algorithm can
+    immediately choose the optimal rendering mode for each RP.
+
+  ``prefer_sysmem``
+    Always chooses SYSMEM rendering. This is useful for games that don't benefit
+    from GMEM rendering due to their rendering patterns; when set for performance
+    reasons, this is preferable to using ``TU_DEBUG=sysmem``.
+
+  The algorithm can also be set via the driconf option ``tu_autotune_algorithm``.
+
+.. envvar:: TU_AUTOTUNE_FLAGS
+
+  Modifies the behavior of the selected algorithm. Supported flags are:
+
+  ``big_gmem``
+    Always chooses GMEM rendering if the number of draw calls in the render pass
+    is greater than a certain threshold. Larger RPs generally benefit more from
+    GMEM rendering due to less overhead from tiling.
+
+  ``small_sysmem``
+    Always chooses SYSMEM rendering if the number of draw calls in the render pass
+    is lower than a certain threshold. The benefits of GMEM rendering are less
+    pronounced in these smaller RPs and SYSMEM rendering tends to win more often.
+
+  ``preempt_optimize``
+    Tries to keep the non-preemptible time in the render pass below a certain
+    threshold. This is useful for systems with GPU-based compositors, where long
+    non-preemptible times can lead to missed frame deadlines and noticeable
+    stuttering. This flag will reduce the performance of the render pass in order
+    to improve overall system responsiveness; it should not be used unless the
+    rest of the system is affected by preemption delays.
+
+  Multiple flags can be combined by separating them with commas, e.g.
+  ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.
+
+  If no flags are specified, the default behavior is used.
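+
+As an illustrative example (the application name below is only a placeholder,
+not part of the driver), an algorithm and a set of flags documented above can
+be combined when launching an application:
+
+.. code-block:: sh
+
+   TU_AUTOTUNE_ALGO=profiled TU_AUTOTUNE_FLAGS=big_gmem,preempt_optimize ./my_vulkan_app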
\ No newline at end of file diff --git a/src/freedreno/vulkan/.clang-format b/src/freedreno/vulkan/.clang-format index f7f9e5755db..256e3ff892f 100644 --- a/src/freedreno/vulkan/.clang-format +++ b/src/freedreno/vulkan/.clang-format @@ -4,7 +4,7 @@ DisableFormat: false AlwaysBreakAfterReturnType: TopLevel BinPackParameters: false -ColumnLimit: 78 +ColumnLimit: 120 Cpp11BracedListStyle: false IncludeBlocks: Regroup diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index df969834a37..0b3dbc5b4f7 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -5,113 +5,358 @@ #include "tu_autotune.h" +#include +#include +#include +#include +#include +#include + +#include "util/rand_xor.h" + +#define XXH_INLINE_ALL +#include "util/xxhash.h" + #include "tu_cmd_buffer.h" #include "tu_cs.h" #include "tu_device.h" #include "tu_image.h" #include "tu_pass.h" -#define XXH_INLINE_ALL -#include "util/xxhash.h" +/** Compile-time debug options **/ -/* How does it work? - * - * - For each renderpass we calculate the number of samples passed - * by storing the number before and after in GPU memory. - * - To store the values each command buffer holds GPU memory which - * expands with more renderpasses being written. - * - For each renderpass we create tu_renderpass_result entry which - * points to the results in GPU memory. - * - Later on tu_renderpass_result would be added to the - * tu_renderpass_history entry which aggregate results for a - * given renderpass. - * - On submission: - * - Process results which fence was signalled. - * - Free per-submission data which we now don't need. - * - * - Create a command stream to write a fence value. This way we would - * know when we could safely read the results. - * - We cannot rely on the command buffer's lifetime when referencing - * its resources since the buffer could be destroyed before we process - * the results. - * - For each command buffer: - * - Reference its GPU memory. - * - Move if ONE_TIME_SUBMIT or copy all tu_renderpass_result to the queue. - * - * Since the command buffers could be recorded on different threads - * we have to maintaining some amount of locking history table, - * however we change the table only in a single thread at the submission - * time, so in most cases there will be no locking. - */ +#define TU_AUTOTUNE_DEBUG_LOG_BASE 0 +#define TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH 0 +#define TU_AUTOTUNE_DEBUG_LOG_PROFILED 0 +#define TU_AUTOTUNE_DEBUG_LOG_PREEMPT 0 -void -tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results); +#if TU_AUTOTUNE_DEBUG_LOG_BASE +#define at_log_base(fmt, ...) mesa_logi("autotune: " fmt, ##__VA_ARGS__) +#define at_log_base_h(fmt, hash, ...) mesa_logi("autotune %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_base(fmt, ...) +#define at_log_base_h(fmt, hash, ...) +#endif -#define TU_AUTOTUNE_DEBUG_LOG 0 -/* Dump history entries on autotuner finish, - * could be used to gather data from traces. - */ -#define TU_AUTOTUNE_LOG_AT_FINISH 0 +#if TU_AUTOTUNE_DEBUG_LOG_BANDWIDTH +#define at_log_bandwidth_h(fmt, hash, ...) mesa_logi("autotune-bw %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_bandwidth_h(fmt, hash, ...) +#endif -/* How many last renderpass stats are taken into account. */ -#define MAX_HISTORY_RESULTS 5 -/* For how many submissions we store renderpass stats. */ -#define MAX_HISTORY_LIFETIME 128 +#if TU_AUTOTUNE_DEBUG_LOG_PROFILED +#define at_log_profiled_h(fmt, hash, ...) 
mesa_logi("autotune-prof %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_profiled_h(fmt, hash, ...) +#endif +#if TU_AUTOTUNE_DEBUG_LOG_PREEMPT +#define at_log_preempt_h(fmt, hash, ...) mesa_logi("autotune-preempt %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_preempt_h(fmt, hash, ...) +#endif -/** - * Tracks results for a given renderpass key - */ -struct tu_renderpass_history { - uint64_t key; +#if TU_AUTOTUNE_DEBUG_PERFCTR +#define at_log_perfctr_h(fmt, hash, ...) mesa_logi("autotune-perfctr %016" PRIx64 ": " fmt, hash, ##__VA_ARGS__) +#else +#define at_log_perfctr_h(fmt, hash, ...) +#endif - /* We would delete old history entries */ - uint32_t last_fence; +/* Process any pending entries on autotuner finish, could be used to gather data from traces. */ +#define TU_AUTOTUNE_FLUSH_AT_FINISH 0 - /** - * List of recent fd_renderpass_result's - */ - struct list_head results; - uint32_t num_results; +/** Global constants and helpers **/ - uint32_t avg_samples; -}; +/* GPU always-on timer constants */ +constexpr uint64_t ALWAYS_ON_FREQUENCY_HZ = 19'200'000; +constexpr uint64_t GPU_TICKS_PER_US = ALWAYS_ON_FREQUENCY_HZ / 1'000'000; -/* Holds per-submission cs which writes the fence. */ -struct tu_submission_data { - struct list_head node; - uint32_t fence; - - struct tu_cs fence_cs; -}; - -static bool -fence_before(uint32_t a, uint32_t b) +constexpr uint64_t +ticks_to_us(uint64_t ticks) { - /* essentially a < b, but handle wrapped values */ - return (int32_t)(a - b) < 0; + return ticks / GPU_TICKS_PER_US; } -static uint32_t -get_autotune_fence(struct tu_autotune *at) +constexpr bool +fence_before(uint32_t a, uint32_t b) { - return at->device->global_bo_map->autotune_fence; + /* Essentially a < b, but handles wrapped values. */ + return (int32_t) (a - b) < 0; +} + +constexpr const char * +render_mode_str(tu_autotune::render_mode mode) +{ + switch (mode) { + case tu_autotune::render_mode::SYSMEM: + return "SYSMEM"; + case tu_autotune::render_mode::GMEM: + return "GMEM"; + default: + return "UNKNOWN"; + } +} + +/** Configuration **/ + +enum class tu_autotune::algorithm : uint8_t { + BANDWIDTH = 0, /* Uses estimated BW for determining rendering mode. */ + PROFILED = 1, /* Uses dynamically profiled results for determining rendering mode. */ + PROFILED_IMM = 2, /* Same as PROFILED but immediately resolves the SYSMEM/GMEM probability. */ + PREFER_SYSMEM = 3, /* Always use SYSMEM unless we have strong evidence that GMEM is better. */ + + DEFAULT = PROFILED, /* Default algorithm, used if no other is specified. */ +}; + +/* Modifier flags, these modify the behavior of the autotuner in a user-defined way. */ +enum class tu_autotune::mod_flag : uint8_t { + BIG_GMEM = BIT(1), /* All RPs with >= 10 draws use GMEM. */ + TUNE_SMALL = BIT(2), /* Try tuning all RPs with <= 5 draws, ignored by default. */ + PREEMPT_OPTIMIZE = BIT(3), /* Attempts to minimize the preemption latency. */ +}; + +/* Metric flags, for internal tracking of enabled metrics. */ +enum class tu_autotune::metric_flag : uint8_t { + SAMPLES = BIT(1), /* Enable tracking samples passed metric. */ + TS = BIT(2), /* Enable tracking per-RP timestamp metric. */ + TS_TILE = BIT(3), /* Enable tracking per-tile timestamp metric. */ +}; + +struct PACKED tu_autotune::config_t { + private: + algorithm algo = algorithm::DEFAULT; + uint8_t mod_flags = 0; /* See mod_flag enum. */ + uint8_t metric_flags = 0; /* See metric_flag enum. 
*/ + + constexpr void update_metric_flags() + { + /* Note: Always keep in sync with rp_history to prevent UB. */ + if (algo == algorithm::BANDWIDTH) { + metric_flags |= (uint8_t) metric_flag::SAMPLES; + } else if (algo == algorithm::PROFILED || algo == algorithm::PROFILED_IMM) { + metric_flags |= (uint8_t) metric_flag::TS; + } + + if (mod_flags & (uint8_t) mod_flag::PREEMPT_OPTIMIZE) { + metric_flags |= (uint8_t) metric_flag::TS | (uint8_t) metric_flag::TS_TILE; + } + } + + public: + constexpr config_t() = default; + + constexpr config_t(algorithm algo, uint8_t mod_flags): algo(algo), mod_flags(mod_flags) + { + update_metric_flags(); + } + + constexpr bool is_enabled(algorithm a) const + { + return algo == a; + } + + constexpr bool test(mod_flag f) const + { + return mod_flags & (uint32_t) f; + } + + constexpr bool test(metric_flag f) const + { + return metric_flags & (uint32_t) f; + } + + constexpr bool set_algo(algorithm a) + { + if (algo == a) + return false; + + algo = a; + update_metric_flags(); + return true; + } + + constexpr bool disable(mod_flag f) + { + if (!(mod_flags & (uint8_t) f)) + return false; + + mod_flags &= ~(uint8_t) f; + update_metric_flags(); + return true; + } + + constexpr bool enable(mod_flag f) + { + if (mod_flags & (uint8_t) f) + return false; + + mod_flags |= (uint8_t) f; + update_metric_flags(); + return true; + } + + std::string to_string() const + { +#define ALGO_STR(algo_name) \ + if (algo == algorithm::algo_name) \ + str += #algo_name; +#define MODF_STR(flag) \ + if (mod_flags & (uint8_t) mod_flag::flag) { \ + str += #flag " "; \ + } +#define METRICF_STR(flag) \ + if (metric_flags & (uint8_t) metric_flag::flag) { \ + str += #flag " "; \ + } + + std::string str = "Algorithm: "; + + ALGO_STR(BANDWIDTH); + ALGO_STR(PROFILED); + ALGO_STR(PROFILED_IMM); + ALGO_STR(PREFER_SYSMEM); + + str += ", Mod Flags: 0x" + std::to_string(mod_flags) + " ("; + MODF_STR(BIG_GMEM); + MODF_STR(TUNE_SMALL); + MODF_STR(PREEMPT_OPTIMIZE); + str += ")"; + + str += ", Metric Flags: 0x" + std::to_string(metric_flags) + " ("; + METRICF_STR(SAMPLES); + METRICF_STR(TS); + METRICF_STR(TS_TILE); + str += ")"; + + return str; + +#undef ALGO_STR +#undef MODF_STR +#undef METRICF_STR + } +}; + +union tu_autotune::packed_config_t { + config_t config; + uint32_t bits = 0; + static_assert(sizeof(bits) >= sizeof(config)); + static_assert(std::is_trivially_copyable::value, + "config_t must be trivially copyable to be automatically packed"); + + constexpr packed_config_t(config_t config): config(config) + { + } + + constexpr packed_config_t(uint32_t bits): bits(bits) + { + } +}; + +tu_autotune::atomic_config_t::atomic_config_t(config_t initial): config_bits(packed_config_t { initial }.bits) +{ +} + +tu_autotune::config_t +tu_autotune::atomic_config_t::load() const +{ + return config_t(packed_config_t { config_bits.load(std::memory_order_relaxed) }.config); +} + +bool +tu_autotune::atomic_config_t::compare_and_store(config_t updated, config_t expected) +{ + uint32_t expected_bits = packed_config_t { expected }.bits; + return config_bits.compare_exchange_strong(expected_bits, packed_config_t { updated }.bits, + std::memory_order_acquire, std::memory_order_relaxed); +} + +tu_autotune::config_t +tu_autotune::get_env_config() +{ + static std::once_flag once; + static config_t at_config; + std::call_once(once, [&] { + algorithm algo = algorithm::DEFAULT; + const char *algo_str = os_get_option("TU_AUTOTUNE_ALGO"); + std::string_view algo_strv; + + if (algo_str) + algo_strv = algo_str; + else if 
(device->instance->autotune_algo) + algo_strv = device->instance->autotune_algo; + + if (!algo_strv.empty()) { + if (algo_strv == "bandwidth") { + algo = algorithm::BANDWIDTH; + } else if (algo_strv == "profiled") { + algo = algorithm::PROFILED; + } else if (algo_strv == "profiled_imm") { + algo = algorithm::PROFILED_IMM; + } else if (algo_strv == "prefer_sysmem") { + algo = algorithm::PREFER_SYSMEM; + } else { + mesa_logw("Unknown TU_AUTOTUNE_ALGO '%s', using default", algo_strv.data()); + } + + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE_ALGO=%u (%s)", (uint8_t) algo, algo_strv.data()); + } + + /* Parse the flags from the environment variable. */ + const char *flags_env_str = os_get_option("TU_AUTOTUNE_FLAGS"); + uint32_t mod_flags = 0; + if (flags_env_str) { + static const struct debug_control tu_at_flags_control[] = { + { "big_gmem", (uint32_t) mod_flag::BIG_GMEM }, + { "tune_small", (uint32_t) mod_flag::TUNE_SMALL }, + { "preempt_optimize", (uint32_t) mod_flag::PREEMPT_OPTIMIZE }, + { NULL, 0 } + }; + + mod_flags = parse_debug_string(flags_env_str, tu_at_flags_control); + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE_FLAGS=0x%x (%s)", mod_flags, flags_env_str); + } + + assert((uint8_t) mod_flags == mod_flags); + at_config = config_t(algo, (uint8_t) mod_flags); + }); + + if (TU_DEBUG(STARTUP)) + mesa_logi("TU_AUTOTUNE: %s", at_config.to_string().c_str()); + + return at_config; +} + +/** Global Fence and Internal CS Management **/ + +tu_autotune::submission_entry::submission_entry(tu_device *device): fence(0) +{ + tu_cs_init(&fence_cs, device, TU_CS_MODE_GROW, 5, "autotune fence cs"); +} + +tu_autotune::submission_entry::~submission_entry() +{ + assert(!is_active()); + tu_cs_finish(&fence_cs); +} + +bool +tu_autotune::submission_entry::is_active() const +{ + return fence_cs.device->global_bo_map->autotune_fence < fence; } template static void -create_submission_fence(struct tu_device *dev, - struct tu_cs *cs, - uint32_t fence) +write_fence_cs(struct tu_device *dev, struct tu_cs *cs, uint32_t fence) { uint64_t dst_iova = dev->global_bo->iova + gb_offset(autotune_fence); if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4); - tu_cs_emit(cs, - CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS, - .write_src = EV_WRITE_USER_32B, - .write_dst = EV_DST_RAM, - .write_enabled = true).value); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS, .write_src = EV_WRITE_USER_32B, .write_dst = EV_DST_RAM, + .write_enabled = true) + .value); } else { tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); @@ -121,626 +366,1518 @@ create_submission_fence(struct tu_device *dev, tu_cs_emit(cs, fence); } -static struct tu_submission_data * -create_submission_data(struct tu_device *dev, struct tu_autotune *at, - uint32_t fence) +struct tu_cs * +tu_autotune::submission_entry::try_get_cs(uint32_t new_fence) { - struct tu_submission_data *submission_data = NULL; - if (!list_is_empty(&at->submission_data_pool)) { - submission_data = list_first_entry(&at->submission_data_pool, - struct tu_submission_data, node); - list_del(&submission_data->node); - } else { - submission_data = (struct tu_submission_data *) calloc( - 1, sizeof(struct tu_submission_data)); - tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs"); - } - submission_data->fence = fence; - - struct tu_cs* fence_cs = &submission_data->fence_cs; - tu_cs_begin(fence_cs); - TU_CALLX(dev, create_submission_fence)(dev, fence_cs, fence); - tu_cs_end(fence_cs); - - 
list_addtail(&submission_data->node, &at->pending_submission_data); - - return submission_data; -} - -static void -finish_submission_data(struct tu_autotune *at, - struct tu_submission_data *data) -{ - list_del(&data->node); - list_addtail(&data->node, &at->submission_data_pool); - tu_cs_reset(&data->fence_cs); -} - -static void -free_submission_data(struct tu_submission_data *data) -{ - list_del(&data->node); - tu_cs_finish(&data->fence_cs); - - free(data); -} - -static uint64_t -hash_renderpass_instance(const struct tu_render_pass *pass, - const struct tu_framebuffer *framebuffer, - const struct tu_cmd_buffer *cmd) { - uint32_t data[3 + pass->attachment_count * 5]; - uint32_t* ptr = data; - - *ptr++ = framebuffer->width; - *ptr++ = framebuffer->height; - *ptr++ = framebuffer->layers; - - for (unsigned i = 0; i < pass->attachment_count; i++) { - *ptr++ = cmd->state.attachments[i]->view.width; - *ptr++ = cmd->state.attachments[i]->view.height; - *ptr++ = cmd->state.attachments[i]->image->vk.format; - *ptr++ = cmd->state.attachments[i]->image->vk.array_layers; - *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels; + if (is_active()) { + /* If the CS is already active, we cannot write to it. */ + return nullptr; } - return XXH64(data, sizeof(data), pass->autotune_hash); + struct tu_device *device = fence_cs.device; + tu_cs_reset(&fence_cs); + tu_cs_begin(&fence_cs); + TU_CALLX(device, write_fence_cs)(device, &fence_cs, new_fence); + tu_cs_end(&fence_cs); + assert(fence_cs.entry_count == 1); /* We expect the initial allocation to be large enough. */ + fence = new_fence; + + return &fence_cs; } -static void -free_result(struct tu_device *dev, struct tu_renderpass_result *result) +struct tu_cs * +tu_autotune::get_cs_for_fence(uint32_t fence) { - tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo); - list_del(&result->node); - free(result); + for (submission_entry &entry : submission_entries) { + struct tu_cs *cs = entry.try_get_cs(fence); + if (cs) + return cs; + } + + /* If we reach here, we have to allocate a new entry. */ + submission_entry &entry = submission_entries.emplace_back(device); + struct tu_cs *cs = entry.try_get_cs(fence); + assert(cs); /* We just allocated it, so it should be available. */ + return cs; } -static void -free_history(struct tu_device *dev, struct tu_renderpass_history *history) -{ - tu_autotune_free_results_locked(dev, &history->results); - free(history); -} +/** RP Entry Management **/ -static bool -get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples) -{ - bool has_history = false; +#if TU_AUTOTUNE_DEBUG_PERFCTR +struct PACKED tu_perf_ctr_sample { + uint64_t begin; + uint64_t end; + /* The selector value at the beginning/end, used to validate that the countable wasn't changed during a preemption. */ + uint32_t selector_begin; + uint32_t selector_end; +}; +#endif - /* If the lock contantion would be found in the wild - - * we could use try_lock here. +/* The part of the per-RP entry which is written by the GPU. */ +struct PACKED tu_autotune::rp_gpu_data { + /* HW requires the sample start/stop locations to be 128b aligned. 
*/ + alignas(16) uint64_t samples_start; + alignas(16) uint64_t samples_end; + uint64_t ts_start; + uint64_t ts_end; + +#if TU_AUTOTUNE_DEBUG_PERFCTR + struct tu_perf_ctr_sample preemption_reaction_delay, num_preemptions, always_count; + uint64_t cntrs_ready; + constexpr static uint64_t CNTRS_READY_MAGIC = 0xABCDEFEFE; +#endif +}; + +/* Per-tile values for GMEM rendering, this structure is appended to the end of rp_gpu_data for each tile. */ +struct PACKED tu_autotune::tile_gpu_data { + uint64_t ts_start; + uint64_t ts_end; + + /* A helper for the offset of this relative to BO start. */ + static constexpr uint64_t offset(uint32_t tile_index) + { + return sizeof(rp_gpu_data) + (tile_index * sizeof(tile_gpu_data)); + } +}; + +/* A small wrapper around rp_history to provide ref-counting and usage timestamps. */ +struct tu_autotune::rp_history_handle { + rp_history *history; + + /* Note: Must be called with rp_mutex held. */ + rp_history_handle(rp_history &history); + + constexpr rp_history_handle(std::nullptr_t): history(nullptr) + { + } + + rp_history_handle(const rp_history_handle &) = delete; + rp_history_handle &operator=(const rp_history_handle &) = delete; + + constexpr rp_history_handle(rp_history_handle &&other): history(other.history) + { + other.history = nullptr; + } + + constexpr rp_history_handle &operator=(rp_history_handle &&other) + { + if (this != &other) { + history = other.history; + other.history = nullptr; + } + return *this; + } + + constexpr operator bool() const + { + return history != nullptr; + } + + constexpr rp_history &operator*() const + { + assert(history); + return *history; + } + + constexpr operator rp_history *() const + { + assert(history); + return history; + } + + constexpr rp_history *operator->() const + { + assert(history); + return history; + } + + ~rp_history_handle(); +}; + +/* An "entry" of renderpass autotune results, which is used to store the results of a renderpass autotune run for a + * given command buffer. */ +struct tu_autotune::rp_entry { + private: + struct tu_device *device; + + struct tu_suballoc_bo bo; + uint8_t *map; /* A direct pointer to the BO's CPU mapping. */ + + static_assert(alignof(rp_gpu_data) == 16); + static_assert(offsetof(rp_gpu_data, samples_start) == 0); + static_assert(offsetof(rp_gpu_data, samples_end) == 16); + static_assert(sizeof(rp_gpu_data) % alignof(tile_gpu_data) == 0); + + public: + rp_history_handle history; + config_t config; /* Configuration at the time of entry creation. */ + bool sysmem; + uint32_t tile_count; + uint32_t draw_count; + + /* Amount of repeated RPs so far, used for uniquely identifying instances of the same RPs. */ + uint32_t duplicates = 0; + + rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count) + : device(device), map(nullptr), history(std::move(history)), config(config), draw_count(draw_count) + { + } + + ~rp_entry() + { + if (map) { + std::scoped_lock lock(device->autotune->suballoc_mutex); + tu_suballoc_bo_free(&device->autotune->suballoc, &bo); + } + } + + /* Disable the copy/move operators as that shouldn't be done. 
*/ + rp_entry(const rp_entry &) = delete; + rp_entry &operator=(const rp_entry &) = delete; + rp_entry(rp_entry &&) = delete; + rp_entry &operator=(rp_entry &&) = delete; + + void allocate(bool sysmem, uint32_t tile_count) + { + this->sysmem = sysmem; + this->tile_count = tile_count; + size_t total_size = sizeof(rp_gpu_data) + (tile_count * sizeof(tile_gpu_data)); + + std::scoped_lock lock(device->autotune->suballoc_mutex); + VkResult result = tu_suballoc_bo_alloc(&bo, &device->autotune->suballoc, total_size, alignof(rp_gpu_data)); + if (result != VK_SUCCESS) { + mesa_loge("Failed to allocate BO for autotune rp_entry: %u", result); + return; + } + + map = (uint8_t *) tu_suballoc_bo_map(&bo); + memset(map, 0, total_size); + } + + rp_gpu_data &get_gpu_data() + { + assert(map); + return *(rp_gpu_data *) map; + } + + tile_gpu_data &get_tile_gpu_data(uint32_t tile_index) + { + assert(map); + assert(tile_index < tile_count); + uint64_t offset = tile_gpu_data::offset(tile_index); + return *(tile_gpu_data *) (map + offset); + } + + /** Samples-Passed Metric **/ + + uint64_t get_samples_passed() + { + assert(config.test(metric_flag::SAMPLES)); + rp_gpu_data &gpu = get_gpu_data(); + return gpu.samples_end - gpu.samples_start; + } + + void emit_metric_samples_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint64_t start_iova) + { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); + if (cmd->device->physical_device->info->props.has_event_write_sample_count) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true).value); + tu_cs_emit_qw(cs, start_iova); + + /* If the renderpass contains an occlusion query with its own ZPASS_DONE, we have to provide a fake ZPASS_DONE + * event here to logically close the previous one, preventing firmware from misbehaving due to nested events. + * This writes into the samples_end field, which will be overwritten in tu_autotune_end_renderpass. + */ + if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true, + .sample_count_end_offset = true, .write_accum_sample_count_diff = true) + .value); + tu_cs_emit_qw(cs, start_iova); + } + } else { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_BASE(.qword = start_iova)); + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + } + } + + void emit_metric_samples_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint64_t start_iova, uint64_t end_iova) + { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); + if (cmd->device->physical_device->info->props.has_event_write_sample_count) { + /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE event here, composing a pair of these + * events that firmware handles without issue. This first event writes into the samples_end field and the + * second event overwrites it. The second event also enables the accumulation flag even when we don't use that + * result because the blob always sets it. 
+ */ + if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true).value); + tu_cs_emit_qw(cs, end_iova); + } + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); + tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, .write_sample_count = true, + .sample_count_end_offset = true, .write_accum_sample_count_diff = true) + .value); + tu_cs_emit_qw(cs, start_iova); + } else { + tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_BASE(.qword = end_iova)); + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + } + } + + /** RP/Tile Timestamp Metric **/ + + uint64_t get_rp_duration() + { + assert(config.test(metric_flag::TS)); + rp_gpu_data &gpu = get_gpu_data(); + return gpu.ts_end - gpu.ts_start; + } + + /* The amount of cycles spent in the longest tile. This is used to calculate the average draw duration for + * determining the largest non-preemptible duration for GMEM rendering. */ - u_rwlock_rdlock(&at->ht_lock); - struct hash_entry *entry = - _mesa_hash_table_search(at->ht, &rp_key); - if (entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - if (history->num_results > 0) { - *avg_samples = p_atomic_read(&history->avg_samples); - has_history = true; + uint64_t get_max_tile_duration() + { + assert(config.test(metric_flag::TS_TILE)); + uint64_t max_duration = 0; + for (uint32_t i = 0; i < tile_count; i++) { + tile_gpu_data &tile = get_tile_gpu_data(i); + max_duration = MAX2(max_duration, tile.ts_end - tile.ts_start); } + return max_duration; } - u_rwlock_rdunlock(&at->ht_lock); - return has_history; + void emit_metric_timestamp(struct tu_cs *cs, uint64_t timestamp_iova) + { + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) | CP_REG_TO_MEM_0_CNT(2) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, timestamp_iova); + } + + /** Debug Performance Counters **/ + +#if TU_AUTOTUNE_DEBUG_PERFCTR + uint64_t get_preemption_reaction_delay(tu_autotune &at, uint64_t rp_hash) + { + rp_gpu_data &gpu = get_gpu_data(); + + while (p_atomic_read(&gpu.cntrs_ready) != rp_gpu_data::CNTRS_READY_MAGIC) { + /* Just spin until the counter values are written out. */ + } + + auto read_counter = [&](const struct tu_perf_ctr_sample &sample, const struct fd_perfcntr_countable *ctbl, + uint64_t &outValue, const char *name) { + if (sample.selector_begin != sample.selector_end || sample.selector_begin != ctbl->selector) { + mesa_loge( + "autotune %016" PRIx64 ": %s: selector mismatch %" PRIu32 " != %" PRIu32 " (%" PRIu32 " - %" PRIu32 ")", + rp_hash, ctbl->name, sample.selector_begin, sample.selector_end, sample.selector_begin, ctbl->selector); + } + + outValue = sample.end - sample.begin; + if (sample.end < sample.begin) { + mesa_loge("autotune %016" PRIx64 ": %s: end < begin %" PRIu64 " < %" PRIu64, rp_hash, name, sample.end, + sample.begin); + outValue = 0; + } + }; + + /* We read all counters for logging, even though we only need to return the preemption reaction delay. 
*/ + uint64_t preemption_reaction_delay; + uint64_t num_preemptions; + uint64_t always_count; + read_counter(gpu.preemption_reaction_delay, at.preemption_reaction_delay, preemption_reaction_delay, + "preemption_reaction_delay"); + read_counter(gpu.num_preemptions, at.num_preemptions, num_preemptions, "num_preemptions"); + read_counter(gpu.always_count, at.always_count, always_count, "always_count"); + + if (preemption_reaction_delay || num_preemptions) { + at_log_perfctr_h("preemption_reaction_delay: %" PRIu64 ", always_count: %" PRIu64 + ", num_preemptions: %" PRIu64, + rp_hash, preemption_reaction_delay, always_count, num_preemptions); + } + + return preemption_reaction_delay; + } + + void emit_debug_perfcntr_start(struct tu_cs *cs, tu_autotune &at, uint64_t bo_iova) + { + auto countable_begin = [&](const struct fd_perfcntr_countable *ctbl, uint32_t cntr_idx, uint32_t offset) { + const struct fd_perfcntr_counter *ctr = &at.cp_group->counters[cntr_idx]; + uint64_t offset_iova = bo_iova + offset; + assert(!ctr->enable); /* CP counters shouldn't use it. */ + + tu_cs_emit_pkt4(cs, ctr->select_reg, 1); + tu_cs_emit(cs, ctbl->selector); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->select_reg) | CP_REG_TO_MEM_0_CNT(1)); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, selector_begin)); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, begin)); + }; + + countable_begin(at.preemption_reaction_delay, 10, offsetof(rp_gpu_data, preemption_reaction_delay)); + countable_begin(at.num_preemptions, 11, offsetof(rp_gpu_data, num_preemptions)); + countable_begin(at.always_count, 12, offsetof(rp_gpu_data, always_count)); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_wfi(cs); + } + + void emit_debug_perfcntr_end(struct tu_cs *cs, tu_autotune &at, uint64_t bo_iova) + { + tu_cs_emit_wfi(cs); + + auto countable_end = [&](uint32_t cntr_idx, uint64_t offset) { + const struct fd_perfcntr_counter *ctr = &at.cp_group->counters[cntr_idx]; + uint64_t offset_iova = bo_iova + offset; + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->select_reg) | CP_REG_TO_MEM_0_CNT(1)); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, selector_end)); + + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(ctr->counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit_qw(cs, offset_iova + offsetof(struct tu_perf_ctr_sample, end)); + }; + + countable_end(10, offsetof(rp_gpu_data, preemption_reaction_delay)); + countable_end(11, offsetof(rp_gpu_data, num_preemptions)); + countable_end(12, offsetof(rp_gpu_data, always_count)); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, bo_iova + offsetof(rp_gpu_data, cntrs_ready)); + tu_cs_emit_qw(cs, rp_gpu_data::CNTRS_READY_MAGIC); + } +#endif + + /** CS Emission **/ + + void emit_rp_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs) + { + assert(map && bo.iova); + uint64_t bo_iova = bo.iova; + if (config.test(metric_flag::SAMPLES)) + emit_metric_samples_start(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start)); + + if (config.test(metric_flag::TS)) + emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_start)); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + emit_debug_perfcntr_start(cs, 
*cmd->device->autotune, bo_iova); +#endif + } + + void emit_rp_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) + { + assert(map && bo.iova); + uint64_t bo_iova = bo.iova; + if (config.test(metric_flag::SAMPLES)) + emit_metric_samples_end(cmd, cs, bo_iova + offsetof(rp_gpu_data, samples_start), + bo_iova + offsetof(rp_gpu_data, samples_end)); + + if (config.test(metric_flag::TS)) + emit_metric_timestamp(cs, bo_iova + offsetof(rp_gpu_data, ts_end)); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + emit_debug_perfcntr_end(cs, *cmd->device->autotune, bo_iova); +#endif + } + + void emit_tile_start(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) + { + assert(map && bo.iova); + assert(!sysmem); + assert(tile_index < tile_count); + if (config.test(metric_flag::TS_TILE)) + emit_metric_timestamp(cs, bo.iova + tile_gpu_data::offset(tile_index) + offsetof(tile_gpu_data, ts_start)); + } + + void emit_tile_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tile_index) + { + assert(map && bo.iova); + assert(!sysmem); + assert(tile_index < tile_count); + if (config.test(metric_flag::TS_TILE)) + emit_metric_timestamp(cs, bo.iova + tile_gpu_data::offset(tile_index) + offsetof(tile_gpu_data, ts_end)); + } +}; + +tu_autotune::rp_entry_batch::rp_entry_batch(): active(false), fence(0), entries() +{ } -static struct tu_renderpass_result * -create_history_result(struct tu_autotune *at, uint64_t rp_key) +void +tu_autotune::rp_entry_batch::assign_fence(uint32_t new_fence) { - struct tu_renderpass_result *result = - (struct tu_renderpass_result *) calloc(1, sizeof(*result)); - result->rp_key = rp_key; - - return result; + assert(!active); /* Cannot assign a fence to an active entry batch. */ + fence = new_fence; + active = true; } -static void -history_add_result(struct tu_device *dev, struct tu_renderpass_history *history, - struct tu_renderpass_result *result) -{ - list_delinit(&result->node); - list_add(&result->node, &history->results); +/** Renderpass state tracking. **/ - if (history->num_results < MAX_HISTORY_RESULTS) { - history->num_results++; +tu_autotune::rp_key::rp_key(const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_cmd_buffer *cmd) +{ + /* It may be hard to match the same renderpass between frames, or rather it's hard to strike a + * balance between being too lax with identifying different renderpasses as the same one, and + * not recognizing the same renderpass between frames when only a small thing changed. + * + * This is mainly an issue with translation layers (particularly DXVK), because a layer may + * break a "renderpass" into smaller ones due to some heuristic that isn't consistent between + * frames. + * + * Note: Not using image IOVA leads to too many false matches. 
+ */ + + struct PACKED packed_att_properties { + uint64_t iova; + bool load; + bool store; + bool load_stencil; + bool store_stencil; + }; + + auto get_hash = [&](uint32_t *data, size_t size) { + uint32_t *ptr = data; + *ptr++ = framebuffer->width; + *ptr++ = framebuffer->height; + *ptr++ = framebuffer->layers; + + for (unsigned i = 0; i < pass->attachment_count; i++) { + packed_att_properties props = { + .iova = cmd->state.attachments[i]->image->iova + cmd->state.attachments[i]->view.offset, + .load = pass->attachments[i].load, + .store = pass->attachments[i].store, + .load_stencil = pass->attachments[i].load_stencil, + .store_stencil = pass->attachments[i].store_stencil, + }; + + memcpy(ptr, &props, sizeof(packed_att_properties)); + ptr += sizeof(packed_att_properties) / sizeof(uint32_t); + } + assert(ptr == data + size); + + return XXH64(data, size * sizeof(uint32_t), 0); + }; + + /* We do a manual Boost-style "small vector" optimization here where the stack is used for the vast majority of + * cases, while only extreme cases need to allocate on the heap. + */ + size_t data_count = 3 + (pass->attachment_count * sizeof(packed_att_properties) / sizeof(uint32_t)); + constexpr size_t STACK_MAX_DATA_COUNT = 3 + (5 * 3); /* in u32 units. */ + + if (data_count <= STACK_MAX_DATA_COUNT) { + /* If the data is small enough, we can use the stack. */ + std::array arr; + hash = get_hash(arr.data(), data_count); } else { - /* Once above the limit, start popping old results off the - * tail of the list: + /* If the data is too large, we have to allocate it on the heap. */ + std::vector vec(data_count); + hash = get_hash(vec.data(), vec.size()); + } +} + +tu_autotune::rp_key::rp_key(const rp_key &key, uint32_t duplicates) +{ + hash = XXH64(&key.hash, sizeof(key.hash), duplicates); +} + +/* Exponential moving average (EMA) calculator for smoothing successive values of any metric. An alpha (smoothing + * factor) of 0.1 means 10% weight to new values (slow adaptation), while 0.9 means 90% weight (fast adaptation). + */ +template class exponential_average { + private: + std::atomic average = std::numeric_limits::quiet_NaN(); + double alpha; + + public: + explicit exponential_average(double alpha = 0.1) noexcept: alpha(alpha) + { + } + + bool empty() const noexcept + { + double current = average.load(std::memory_order_relaxed); + return std::isnan(current); + } + + void add(T value) noexcept + { + double v = static_cast(value); + double current = average.load(std::memory_order_relaxed); + double new_avg; + do { + new_avg = std::isnan(current) ? v : (1.0 - alpha) * current + alpha * v; + } while (!average.compare_exchange_weak(current, new_avg, std::memory_order_relaxed, std::memory_order_relaxed)); + } + + void clear() noexcept + { + average.store(std::numeric_limits::quiet_NaN(), std::memory_order_relaxed); + } + + T get() const noexcept + { + double current = average.load(std::memory_order_relaxed); + return std::isnan(current) ? T {} : static_cast(current); + } +}; + +/* An improvement over pure EMA to filter out spikes by using two EMAs: + * - A "slow" EMA with a low alpha to track the long-term average. + * - A "fast" EMA with a high alpha to track short-term changes. + * When retrieving the average, if the fast EMA deviates significantly from the slow EMA, it indicates a spike, and we + * fall back to the slow EMA. 
+ */ +template class adaptive_average { + private: + static constexpr double DEFAULT_SLOW_ALPHA = 0.1, DEFAULT_FAST_ALPHA = 0.5, DEFAULT_DEVIATION_THRESHOLD = 0.3; + exponential_average slow; + exponential_average fast; + double deviationThreshold; + + public: + size_t count = 0; + + explicit adaptive_average(double slow_alpha = DEFAULT_SLOW_ALPHA, + double fast_alpha = DEFAULT_FAST_ALPHA, + double deviation_threshold = DEFAULT_DEVIATION_THRESHOLD) noexcept + : slow(slow_alpha), fast(fast_alpha), deviationThreshold(deviation_threshold) + { + } + + void add(T value) noexcept + { + slow.add(value); + fast.add(value); + count++; + } + + T get() const noexcept + { + double s = slow.get(); + double f = fast.get(); + /* Use fast if it's close to slow (normal variation). + * Use slow if fast deviates too much (likely a spike). */ - struct tu_renderpass_result *old_result = - list_last_entry(&history->results, struct tu_renderpass_result, node); - mtx_lock(&dev->autotune_mutex); - free_result(dev, old_result); - mtx_unlock(&dev->autotune_mutex); + double deviation = std::abs(f - s) / s; + return (deviation < deviationThreshold) ? f : s + (f - s) * deviationThreshold; } - /* Do calculations here to avoid locking history in tu_autotune_use_bypass */ - uint32_t total_samples = 0; - list_for_each_entry(struct tu_renderpass_result, result, - &history->results, node) { - total_samples += result->samples_passed; + void clear() noexcept + { + slow.clear(); + fast.clear(); + count = 0; + } +}; + +/* All historical state pertaining to a uniquely identified RP. This integrates data from RP entries, accumulating + * metrics over the long-term and providing autotune algorithms using the data. + */ +struct tu_autotune::rp_history { + private: + /* Amount of duration samples for profiling before we start averaging. */ + static constexpr uint32_t MIN_PROFILE_DURATION_COUNT = 5; + + adaptive_average sysmem_rp_average; + adaptive_average gmem_rp_average; + + public: + uint64_t hash; /* The hash of the renderpass, just for debug output. */ + + std::atomic refcount = 0; /* Reference count to prevent deletion when active. */ + std::atomic last_use_ts; /* Last time the reference count was updated, in monotonic nanoseconds. 
*/ + + rp_history(uint64_t hash): hash(hash), last_use_ts(os_time_get_nano()), profiled(hash) + { } - float avg_samples = (float)total_samples / (float)history->num_results; - p_atomic_set(&history->avg_samples, (uint32_t)avg_samples); -} + /** Bandwidth Estimation Algorithm **/ + struct bandwidth_algo { + private: + exponential_average mean_samples_passed; -static void -process_results(struct tu_autotune *at, uint32_t current_fence) -{ - struct tu_device *dev = at->device; - - list_for_each_entry_safe(struct tu_renderpass_result, result, - &at->pending_results, node) { - if (fence_before(current_fence, result->fence)) - break; - - struct tu_renderpass_history *history = result->history; - result->samples_passed = - result->samples->samples_end - result->samples->samples_start; - - history_add_result(dev, history, result); - } - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->pending_submission_data, node) { - if (fence_before(current_fence, submission_data->fence)) - break; - - finish_submission_data(at, submission_data); - } -} - -static void -queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf) -{ - bool one_time_submit = cmdbuf->usage_flags & - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - - if (one_time_submit) { - /* We can just steal the list since it won't be resubmitted again */ - list_splicetail(&cmdbuf->renderpass_autotune_results, - &at->pending_results); - list_inithead(&cmdbuf->renderpass_autotune_results); - } else { - list_for_each_entry_safe(struct tu_renderpass_result, result, - &cmdbuf->renderpass_autotune_results, node) { - /* TODO: copying each result isn't nice */ - struct tu_renderpass_result *copy = - (struct tu_renderpass_result *) malloc(sizeof(*result)); - *copy = *result; - tu_bo_get_ref(copy->bo.bo); - list_addtail(©->node, &at->pending_results); + public: + void update(uint32_t samples) + { + mean_samples_passed.add(samples); } + + render_mode get_optimal_mode(rp_history &history, + const struct tu_cmd_state *cmd_state, + const struct tu_render_pass *pass, + const struct tu_framebuffer *framebuffer, + const struct tu_render_pass_state *rp_state) + { + const VkExtent2D &extent = cmd_state->render_area.extent; + const uint32_t pass_pixel_count = extent.width * extent.height; + uint64_t sysmem_bandwidth = (uint64_t) pass->sysmem_bandwidth_per_pixel * pass_pixel_count; + uint64_t gmem_bandwidth = (uint64_t) pass->gmem_bandwidth_per_pixel * pass_pixel_count; + + uint64_t total_draw_call_bandwidth = 0; + uint64_t mean_samples = mean_samples_passed.get(); + if (rp_state->drawcall_count && mean_samples > 0.0) { + /* The total draw call bandwidth is estimated as the average samples (collected via tracking samples passed + * within the CS) multiplied by the drawcall bandwidth per sample, divided by the amount of draw calls. + * + * This is a rough estimate of the bandwidth used by the draw calls in the renderpass for FB writes which + * is used to determine whether to use SYSMEM or GMEM. + */ + total_draw_call_bandwidth = + (mean_samples * rp_state->drawcall_bandwidth_per_sample_sum) / rp_state->drawcall_count; + } + + /* Drawcalls access the memory in SYSMEM rendering (ignoring CCU). */ + sysmem_bandwidth += total_draw_call_bandwidth; + + /* Drawcalls access GMEM in GMEM rendering, but we do not want to ignore them completely. The state changes + * between tiles also have an overhead. The magic numbers of 11 and 10 are randomly chosen. 
+ */ + gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; + + bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; + render_mode mode = select_sysmem ? render_mode::SYSMEM : render_mode::GMEM; + + at_log_bandwidth_h( + "%" PRIu32 " selecting %s\n" + " mean_samples=%" PRIu64 ", draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64 + ", render_area=%" PRIu32 "x%" PRIu32 ", sysmem_bandwidth_per_pixel=%" PRIu32 + ", gmem_bandwidth_per_pixel=%" PRIu32 ", sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, + history.hash, rp_state->drawcall_count, render_mode_str(mode), mean_samples, + (float) rp_state->drawcall_bandwidth_per_sample_sum / rp_state->drawcall_count, total_draw_call_bandwidth, + extent.width, extent.height, pass->sysmem_bandwidth_per_pixel, pass->gmem_bandwidth_per_pixel, + sysmem_bandwidth, gmem_bandwidth); + + return mode; + } + } bandwidth; + + /** Profiled Algorithms **/ + struct profiled_algo { + private: + /* Range [0 (GMEM), 100 (SYSMEM)], where 50 means no preference. */ + constexpr static uint32_t PROBABILITY_MAX = 100, PROBABILITY_MID = 50; + constexpr static uint32_t PROBABILITY_PREFER_SYSMEM = 80, PROBABILITY_PREFER_GMEM = 20; + + std::atomic sysmem_probability = PROBABILITY_MID; + bool should_reset = false; /* If true, will reset sysmem_probability before next update. */ + bool locked = false; /* If true, the probability will no longer be updated. */ + uint64_t seed[2] { 0x3bffb83978e24f88, 0x9238d5d56c71cd35 }; + + bool is_sysmem_winning = false; + uint64_t winning_since_ts = 0; + + public: + profiled_algo(uint64_t hash) + { + seed[1] = hash; + } + + void update(rp_history &history, bool immediate) + { + if (locked) + return; + + auto &sysmem_ema = history.sysmem_rp_average; + auto &gmem_ema = history.gmem_rp_average; + uint32_t sysmem_prob = sysmem_probability.load(std::memory_order_relaxed); + if (immediate) { + /* Try to immediately resolve the probability, this is useful for CI running a single trace of frames where + * the probabilites aren't expected to change from run to run. This environment also gives us a best case + * scenario for autotune performance, since we know the optimal decisions. + */ + + if (sysmem_ema.count < 1) { + sysmem_prob = PROBABILITY_MAX; + } else if (gmem_ema.count < 1) { + sysmem_prob = 0; + } else { + sysmem_prob = gmem_ema.get() < sysmem_ema.get() ? 0 : PROBABILITY_MAX; + locked = true; + } + } else { + if (sysmem_ema.count < MIN_PROFILE_DURATION_COUNT || gmem_ema.count < MIN_PROFILE_DURATION_COUNT) { + /* Not enough data to make a decision, bias towards least used. */ + sysmem_prob = sysmem_ema.count < gmem_ema.count ? PROBABILITY_PREFER_SYSMEM : PROBABILITY_PREFER_GMEM; + should_reset = true; + } else { + if (should_reset) { + sysmem_prob = PROBABILITY_MID; + should_reset = false; + } + + /* Adjust probability based on timing results. 
*/ + constexpr uint32_t FAST_STEP_DELTA = 5, FAST_MIN_PROBABILITY = 5, FAST_MAX_PROBABILITY = 95; + constexpr uint32_t SLOW_STEP_DELTA = 1, SLOW_MIN_PROBABILITY = 1, SLOW_MAX_PROBABILITY = 99; + + uint64_t avg_sysmem = sysmem_ema.get(); + uint64_t avg_gmem = gmem_ema.get(); + + if (avg_gmem < avg_sysmem) { + if (sysmem_prob > FAST_MIN_PROBABILITY && sysmem_prob <= FAST_MAX_PROBABILITY) + sysmem_prob = MAX2(sysmem_prob - FAST_STEP_DELTA, FAST_MIN_PROBABILITY); + else if (sysmem_prob > SLOW_MIN_PROBABILITY) + sysmem_prob = MAX2(sysmem_prob - SLOW_STEP_DELTA, SLOW_MIN_PROBABILITY); + } else if (avg_sysmem < avg_gmem) { + if (sysmem_prob >= FAST_MIN_PROBABILITY && sysmem_prob < FAST_MAX_PROBABILITY) + sysmem_prob = MIN2(sysmem_prob + FAST_STEP_DELTA, FAST_MAX_PROBABILITY); + else if (sysmem_prob < SLOW_MAX_PROBABILITY) + sysmem_prob = MIN2(sysmem_prob + SLOW_STEP_DELTA, SLOW_MAX_PROBABILITY); + } + + /* If the RP duration exceeds a certain minimum duration threshold (i.e. has a large impact on frametime) + * and the percentage difference between the modes is large enough, we lock into the optimal mode. This + * avoids performance hazards from switching to an extremely suboptimal mode even if done very rarely. + * Note: Due to the potentially huge negative impact of a bad lock, this is a very conservative check. + */ + constexpr uint32_t MIN_LOCK_DURATION_COUNT = 15; + constexpr uint64_t MIN_LOCK_THRESHOLD = GPU_TICKS_PER_US * 1'000; /* 1ms */ + constexpr uint32_t LOCK_PERCENT_DIFF = 30; + constexpr uint64_t LOCK_TIME_WINDOW_NS = 30'000'000'000; /* 30s */ + + uint64_t now = os_time_get_nano(); + bool current_sysmem_winning = avg_sysmem < avg_gmem; + + if (winning_since_ts == 0 || current_sysmem_winning != is_sysmem_winning) { + winning_since_ts = now; + is_sysmem_winning = current_sysmem_winning; + } + + bool has_resolved = sysmem_prob == SLOW_MAX_PROBABILITY || sysmem_prob == SLOW_MIN_PROBABILITY; + bool enough_samples = + sysmem_ema.count >= MIN_LOCK_DURATION_COUNT && gmem_ema.count >= MIN_LOCK_DURATION_COUNT; + uint64_t min_avg = MIN2(avg_sysmem, avg_gmem); + uint64_t max_avg = MAX2(avg_sysmem, avg_gmem); + uint64_t percent_diff = (100 * (max_avg - min_avg)) / min_avg; + + if (has_resolved && enough_samples && max_avg >= MIN_LOCK_THRESHOLD && + percent_diff >= LOCK_PERCENT_DIFF && (now - winning_since_ts) >= LOCK_TIME_WINDOW_NS) { + if (avg_gmem < avg_sysmem) + sysmem_prob = 0; + else + sysmem_prob = 100; + locked = true; + } + } + } + + sysmem_probability.store(sysmem_prob, std::memory_order_relaxed); + + at_log_profiled_h("update%s avg_gmem: %" PRIu64 " us (%" PRIu64 " samples) avg_sysmem: %" PRIu64 + " us (%" PRIu64 " samples) = sysmem_probability: %" PRIu32 " locked: %u", + history.hash, immediate ? "-imm" : "", ticks_to_us(gmem_ema.get()), gmem_ema.count, + ticks_to_us(sysmem_ema.get()), sysmem_ema.count, sysmem_prob, locked); + } + + public: + render_mode get_optimal_mode(rp_history &history) + { + uint32_t l_sysmem_probability = sysmem_probability.load(std::memory_order_relaxed); + bool select_sysmem = (rand_xorshift128plus(seed) % PROBABILITY_MAX) < l_sysmem_probability; + render_mode mode = select_sysmem ? 
render_mode::SYSMEM : render_mode::GMEM; + + at_log_profiled_h("%" PRIu32 "%% sysmem chance, using %s", history.hash, l_sysmem_probability, + render_mode_str(mode)); + + return mode; + } + } profiled; + + /** Preemption Latency Optimization Mode **/ + struct preempt_optimize_mode { + private: + adaptive_average sysmem_draw_average; + adaptive_average gmem_tile_average; + + /* If the renderpass has long draws which are at risk of causing high preemptible latency. */ + std::atomic latency_risk = false; + /* The factor by which the tile size should be divided to reduce preemption latency. */ + std::atomic tile_size_divisor = 1; + + /* The next timestamp to update the latency sensitivity parameters at. */ + uint64_t latency_update_ts = 0; + /* The next timestamp where it's allowed to decrement the divisor. */ + uint64_t divisor_decrement_ts = 0; + /* The next timestamp where it's allowed to mark the RP as no longer latency sensitive. */ + uint64_t latency_switch_ts = 0; + + /* Threshold of longest non-preemptible duration before activating latency optimization: 1.5ms */ + static constexpr uint64_t TARGET_THRESHOLD = GPU_TICKS_PER_US * 1500; + +#if TU_AUTOTUNE_DEBUG_PERFCTR + /* The highest preemption reaction delay recorded for the RP since the last update. */ + uint64_t max_preemption_latency = 0; + + public: + void update_preemption_latency(uint64_t preemption_latency) + { + max_preemption_latency = MAX2(max_preemption_latency, preemption_latency); + } +#endif + + public: + void update_sysmem(rp_history &history, uint64_t draw_duration) + { + bool l_latency_risk = latency_risk.load(std::memory_order_relaxed); + + if (!l_latency_risk) { + /* Try to estimate the minimum non-preemptible duration for draw-level preemption, by dividing the total + * time by the RP by the amount of draws. This isn't very accurate as it's skewed by the time taken by + * commands other than draws (e.g. clears or blits), but it's a good enough estimate to catch the worst + * offenders. + * + * If the average draw duration is above a certain threshold, we mark the RP as latency sensitive which + * should bias the decision towards GMEM. + */ + + sysmem_draw_average.add(draw_duration); + uint64_t avg_sysmem_draw = sysmem_draw_average.get(); + uint64_t sysmem_draw_count = sysmem_draw_average.count; + + at_log_preempt_h("avg_sysmem_draw: %" PRIu64 " us (%u), latency_risk: %u" +#if TU_AUTOTUNE_DEBUG_PERFCTR + ", preemption_latency: %" PRIu64 +#endif + , + history.hash, ticks_to_us(avg_sysmem_draw), avg_sysmem_draw > TARGET_THRESHOLD, + l_latency_risk +#if TU_AUTOTUNE_DEBUG_PERFCTR + , + max_preemption_latency +#endif + ); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + max_preemption_latency = 0; +#endif + + if (sysmem_draw_count >= MIN_PROFILE_DURATION_COUNT && avg_sysmem_draw > TARGET_THRESHOLD) { + latency_risk.store(true, std::memory_order_relaxed); + at_log_preempt_h("high sysmem draw duration %" PRIu64 " us, marking as latency sensitive", history.hash, + ticks_to_us(avg_sysmem_draw)); + } + } + } + + void update_gmem(rp_history &history, uint64_t tile_duration) + { + constexpr uint64_t default_update_duration_ns = 100'000'000; /* 100ms */ + constexpr uint64_t change_update_duration_ns = 500'000'000; /* 500ms */ + constexpr uint64_t downward_update_duration_ns = 10'000'000'000; /* 10s */ + constexpr uint64_t latency_insensitive_duration_ns = 30'000'000'000; /* 30s */ + + gmem_tile_average.add(tile_duration); + + uint64_t now = os_time_get_nano(); + if (latency_update_ts > now) + return; /* No need to update yet. 
*/ + + /* If the RP is latency sensitive and we're using GMEM, we should check if it's worth reducing the tile size to + * reduce the latency risk further or if it's already low enough that it's not worth the performance hit. + */ + + uint64_t update_duration_ns = default_update_duration_ns; + if (gmem_tile_average.count > MIN_PROFILE_DURATION_COUNT) { + uint64_t avg_gmem_tile = gmem_tile_average.get(); + bool l_latency_risk = latency_risk.load(std::memory_order_relaxed); + if (!l_latency_risk) { + if (avg_gmem_tile > TARGET_THRESHOLD) { + latency_risk.store(true, std::memory_order_relaxed); + latency_switch_ts = now + latency_insensitive_duration_ns; + + at_log_preempt_h("high gmem tile duration %" PRIu64 ", marking as latency sensitive", history.hash, + avg_gmem_tile); + } + } else { + uint32_t l_tile_size_divisor = tile_size_divisor.load(std::memory_order_relaxed); + at_log_preempt_h("avg_gmem_tile: %" PRIu64 " us (%u), latency_risk: %u, tile_size_divisor: %" PRIu32 +#if TU_AUTOTUNE_DEBUG_PERFCTR + ", preemption_latency: %" PRIu64 +#endif + , + history.hash, ticks_to_us(avg_gmem_tile), avg_gmem_tile > TARGET_THRESHOLD, + l_latency_risk, l_tile_size_divisor +#if TU_AUTOTUNE_DEBUG_PERFCTR + , + max_preemption_latency +#endif + ); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + max_preemption_latency = 0; +#endif + + int delta = 0; + if (avg_gmem_tile > TARGET_THRESHOLD && l_tile_size_divisor < TU_GMEM_LAYOUT_DIVISOR_MAX) { + /* If the average tile duration is high, we should reduce the tile size to reduce the latency risk. */ + delta = 1; + + divisor_decrement_ts = now + downward_update_duration_ns; + } else if (avg_gmem_tile * 4 < TARGET_THRESHOLD && l_tile_size_divisor > 1 && + divisor_decrement_ts <= now) { + /* If the average tile duration is low enough that we can get away with a larger tile size, we should + * increase the tile size to reduce the performance hit of the smaller tiles. + * + * Note: The 4x factor is to account for the tile duration being halved when we increase the tile size + * divisor by 1, with an additional 2x factor to generally be conservative about reducing the divisor + * since it can lead to oscillation between tile sizes. + * + * Similarly, divisor_decrement_ts is used to limit how often we can reduce the divisor to avoid + * oscillation. + */ + delta = -1; + latency_switch_ts = now + latency_insensitive_duration_ns; + } else if (avg_gmem_tile * 10 < TARGET_THRESHOLD && l_tile_size_divisor == 1 && + latency_switch_ts <= now) { + /* If the average tile duration is low enough that we no longer consider the RP latency sensitive, we + * can switch it back to non-latency sensitive. + */ + latency_risk.store(false, std::memory_order_relaxed); + } + + if (delta != 0) { + /* Clear all the results to avoid biasing the decision based on the old tile size. */ + gmem_tile_average.clear(); + + uint32_t new_tile_size_divisor = l_tile_size_divisor + delta; + at_log_preempt_h("updating tile size divisor: %" PRIu32 " -> %" PRIu32, history.hash, + l_tile_size_divisor, new_tile_size_divisor); + + tile_size_divisor.store(new_tile_size_divisor, std::memory_order_relaxed); + + update_duration_ns = change_update_duration_ns; + } + } + + latency_update_ts = now + update_duration_ns; + } + } + + /* If this RP has a risk of causing high preemption latency. 
*/ + bool is_latency_sensitive() const + { + return latency_risk.load(std::memory_order_relaxed); + } + + uint32_t get_tile_size_divisor() const + { + return tile_size_divisor.load(std::memory_order_relaxed); + } + } preempt_optimize; + + void process(rp_entry &entry, tu_autotune &at) + { + /* We use entry config to know what metrics it has, autotune config to know what algorithms are enabled. */ + config_t entry_config = entry.config; + config_t at_config = at.active_config.load(); + + if (entry_config.test(metric_flag::SAMPLES) && at_config.is_enabled(algorithm::BANDWIDTH)) + bandwidth.update(entry.get_samples_passed()); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + preempt_optimize.update_preemption_latency(entry.get_preemption_reaction_delay(at, hash)); +#endif + + if (entry_config.test(metric_flag::TS)) { + if (entry.sysmem) { + uint64_t rp_duration = entry.get_rp_duration(); + + sysmem_rp_average.add(rp_duration); + + if (at_config.test(mod_flag::PREEMPT_OPTIMIZE)) + preempt_optimize.update_sysmem(*this, rp_duration / entry.draw_count); + } else { + gmem_rp_average.add(entry.get_rp_duration()); + + if (entry_config.test(metric_flag::TS_TILE) && at_config.test(mod_flag::PREEMPT_OPTIMIZE)) + preempt_optimize.update_gmem(*this, entry.get_max_tile_duration()); + } + + if (at_config.is_enabled(algorithm::PROFILED) || at_config.is_enabled(algorithm::PROFILED_IMM)) { + profiled.update(*this, at_config.is_enabled(algorithm::PROFILED_IMM)); + } + } + } +}; + +tu_autotune::rp_history_handle::~rp_history_handle() +{ + if (!history) + return; + + history->last_use_ts.store(os_time_get_nano(), std::memory_order_relaxed); + ASSERTED uint32_t old_refcount = history->refcount.fetch_sub(1, std::memory_order_relaxed); + assert(old_refcount != 0); /* Underflow check. */ +} + +tu_autotune::rp_history_handle::rp_history_handle(rp_history &history): history(&history) +{ + history.refcount.fetch_add(1, std::memory_order_relaxed); + history.last_use_ts.store(os_time_get_nano(), std::memory_order_relaxed); +} + +tu_autotune::rp_history_handle +tu_autotune::find_rp_history(const rp_key &key) +{ + std::shared_lock lock(rp_mutex); + auto it = rp_histories.find(key); + if (it != rp_histories.end()) + return rp_history_handle(it->second); + + return rp_history_handle(nullptr); +} + +tu_autotune::rp_history_handle +tu_autotune::find_or_create_rp_history(const rp_key &key) +{ + rp_history_handle existing = find_rp_history(key); + if (existing) + return existing; + + /* If we reach here, we have to create a new history. */ + std::unique_lock lock(rp_mutex); + auto it = rp_histories.find(key); + if (it != rp_histories.end()) + return it->second; /* Another thread created the history while we were waiting for the lock. */ + auto history = rp_histories.emplace(std::make_pair(key, key.hash)); + return rp_history_handle(history.first->second); +} + +void +tu_autotune::reap_old_rp_histories() +{ + constexpr uint64_t REAP_INTERVAL_NS = 10'000'000'000; /* 10s */ + uint64_t now = os_time_get_nano(); + if (last_reap_ts + REAP_INTERVAL_NS > now) + return; + last_reap_ts = now; + + constexpr size_t MAX_RP_HISTORIES = 1024; /* Not a hard limit, we might exceed this if there's many active RPs. */ + { + /* Quicker non-unique lock, should hit this path mostly. 
*/ + std::shared_lock lock(rp_mutex); + if (rp_histories.size() <= MAX_RP_HISTORIES) + return; + } + + std::unique_lock lock(rp_mutex); + size_t og_size = rp_histories.size(); + if (og_size <= MAX_RP_HISTORIES) + return; + + std::vector candidates; + candidates.reserve(og_size); + for (auto it = rp_histories.begin(); it != rp_histories.end(); ++it) { + if (it->second.refcount.load(std::memory_order_relaxed) == 0) + candidates.push_back(it); + } + + size_t to_purge = std::min(candidates.size(), og_size - MAX_RP_HISTORIES); + if (to_purge == 0) { + at_log_base("no RP histories to reap at size %zu, all are active", og_size); + return; + } + + /* Partition candidates by last use timestamp, oldest first. */ + auto partition_end = candidates.begin() + to_purge; + if (to_purge < candidates.size()) { + std::nth_element(candidates.begin(), partition_end, candidates.end(), + [](rp_histories_t::iterator a, rp_histories_t::iterator b) { + return a->second.last_use_ts.load(std::memory_order_relaxed) < + b->second.last_use_ts.load(std::memory_order_relaxed); + }); + } + + for (auto it = candidates.begin(); it != partition_end; ++it) { + rp_history &history = (*it)->second; + if (history.refcount.load(std::memory_order_relaxed) == 0) { + at_log_base("reaping RP history %016" PRIx64, history.hash); + rp_histories.erase(*it); + } + } + + at_log_base("reaped old RP histories %zu -> %zu", og_size, rp_histories.size()); +} + +void +tu_autotune::process_entries() +{ + uint32_t current_fence = device->global_bo_map->autotune_fence; + + while (!active_batches.empty()) { + auto &batch = active_batches.front(); + assert(batch->active); + + if (fence_before(current_fence, batch->fence)) + break; /* Entries are allocated in sequence, next will be newer and + also fail so we can just directly break out of the loop. */ + + for (auto &entry : batch->entries) + entry->history->process(*entry, *this); + + active_batches.pop_front(); + } + + if (active_batches.size() > 10) { + at_log_base("high amount of active batches: %zu, fence: %" PRIu32 " < %" PRIu32, active_batches.size(), + current_fence, active_batches.front()->fence); } } struct tu_cs * -tu_autotune_on_submit(struct tu_device *dev, - struct tu_autotune *at, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count) +tu_autotune::on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count) { - /* We are single-threaded here */ - const uint32_t gpu_fence = get_autotune_fence(at); - const uint32_t new_fence = at->fence_counter++; - - process_results(at, gpu_fence); - - /* Create history entries here to minimize work and locking being - * done on renderpass end. + /* This call occurs regularly and we are single-threaded here, so we use this opportunity to process any available + * entries. It's also important that any entries are processed here because we always want to ensure that we've + * processed all entries from prior CBs before we submit any new CBs with the same RP to the GPU. 
*/ + process_entries(); + reap_old_rp_histories(); + + bool has_results = false; for (uint32_t i = 0; i < cmd_buffer_count; i++) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - list_for_each_entry_safe(struct tu_renderpass_result, result, - &cmdbuf->renderpass_autotune_results, node) { - struct tu_renderpass_history *history; - struct hash_entry *entry = - _mesa_hash_table_search(at->ht, &result->rp_key); - if (!entry) { - history = - (struct tu_renderpass_history *) calloc(1, sizeof(*history)); - history->key = result->rp_key; - list_inithead(&history->results); - - u_rwlock_wrlock(&at->ht_lock); - _mesa_hash_table_insert(at->ht, &history->key, history); - u_rwlock_wrunlock(&at->ht_lock); - } else { - history = (struct tu_renderpass_history *) entry->data; - } - - history->last_fence = new_fence; - - result->fence = new_fence; - result->history = history; + auto &batch = cmd_buffers[i]->autotune_ctx.batch; + if (!batch->entries.empty()) { + has_results = true; + break; } } + if (!has_results) + return nullptr; /* No results to process, return early. */ - struct tu_submission_data *submission_data = - create_submission_data(dev, at, new_fence); - + /* Generate a new fence and the CS for it. */ + const uint32_t new_fence = next_fence++; + auto fence_cs = get_cs_for_fence(new_fence); for (uint32_t i = 0; i < cmd_buffer_count; i++) { + /* Transfer the entries from the command buffers to the active queue. */ struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - if (list_is_empty(&cmdbuf->renderpass_autotune_results)) + auto &batch = cmdbuf->autotune_ctx.batch; + if (batch->entries.empty()) continue; - queue_pending_results(at, cmdbuf); + batch->assign_fence(new_fence); + if (cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) { + /* If the command buffer is one-time submit, we can move the batch directly into the active batches, as it + * won't be used again. This would lead to it being deallocated as early as possible. + */ + active_batches.push_back(std::move(batch)); + } else { + active_batches.push_back(batch); + } } - if (TU_AUTOTUNE_DEBUG_LOG) - mesa_logi("Total history entries: %u", at->ht->entries); + return fence_cs; +} - /* Cleanup old entries from history table. The assumption - * here is that application doesn't hold many old unsubmitted - * command buffers, otherwise this table may grow big. 
+tu_autotune::tu_autotune(struct tu_device *device, VkResult &result): device(device), active_config(get_env_config()) +{ + tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc"); + +#if TU_AUTOTUNE_DEBUG_PERFCTR + uint32_t group_count; + const struct fd_perfcntr_group *groups = fd_perfcntrs(&device->physical_device->dev_id, &group_count); + + for (uint32_t i = 0; i < group_count; i++) { + if (strcmp(groups[i].name, "CP") == 0) { + cp_group = &groups[i]; + break; + } + } + + if (!cp_group) { + mesa_loge("autotune: CP group not found"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } else if (cp_group->num_countables < 5) { + mesa_loge("autotune: CP group has too few countables"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } + + auto get_perfcntr_countable = [](const struct fd_perfcntr_group *group, + const char *name) -> const struct fd_perfcntr_countable * { + for (uint32_t i = 0; i < group->num_countables; i++) { + if (strcmp(group->countables[i].name, name) == 0) + return &group->countables[i]; + } + + mesa_loge("autotune: %s not found in group %s", name, group->name); + return nullptr; + }; + + preemption_reaction_delay = get_perfcntr_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); + num_preemptions = get_perfcntr_countable(cp_group, "PERF_CP_NUM_PREEMPTIONS"); + always_count = get_perfcntr_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); + + if (!preemption_reaction_delay || !num_preemptions || !always_count) { + mesa_loge("autotune: preemption countables not found"); + result = VK_ERROR_INITIALIZATION_FAILED; + return; + } +#endif + + result = VK_SUCCESS; + return; +} + +tu_autotune::~tu_autotune() +{ + if (TU_AUTOTUNE_FLUSH_AT_FINISH) { + while (!active_batches.empty()) + process_entries(); + at_log_base("finished processing all entries"); + } + + tu_bo_suballocator_finish(&suballoc); +} + +tu_autotune::cmd_buf_ctx::cmd_buf_ctx(): batch(std::make_shared()) +{ +} + +tu_autotune::cmd_buf_ctx::~cmd_buf_ctx() +{ + /* This is empty but it causes the implicit destructor to be compiled within this compilation unit with access to + * internal structures. Otherwise, we would need to expose the full definition of autotuner internals in the header + * file, which is not desirable. 
*/ - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME)) - continue; - - if (TU_AUTOTUNE_DEBUG_LOG) - mesa_logi("Removed old history entry %016" PRIx64 "", history->key); - - u_rwlock_wrlock(&at->ht_lock); - _mesa_hash_table_remove_key(at->ht, &history->key); - u_rwlock_wrunlock(&at->ht_lock); - - mtx_lock(&dev->autotune_mutex); - free_history(dev, history); - mtx_unlock(&dev->autotune_mutex); - } - - return &submission_data->fence_cs; -} - -static bool -renderpass_key_equals(const void *_a, const void *_b) -{ - return *(uint64_t *)_a == *(uint64_t *)_b; -} - -static uint32_t -renderpass_key_hash(const void *_a) -{ - return *((uint64_t *) _a) & 0xffffffff; -} - -VkResult -tu_autotune_init(struct tu_autotune *at, struct tu_device *dev) -{ - at->enabled = true; - at->device = dev; - at->ht = _mesa_hash_table_create(NULL, - renderpass_key_hash, - renderpass_key_equals); - u_rwlock_init(&at->ht_lock); - - list_inithead(&at->pending_results); - list_inithead(&at->pending_submission_data); - list_inithead(&at->submission_data_pool); - - /* start from 1 because tu6_global::autotune_fence is initialized to 0 */ - at->fence_counter = 1; - - return VK_SUCCESS; } void -tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev) +tu_autotune::cmd_buf_ctx::reset() { - if (TU_AUTOTUNE_LOG_AT_FINISH) { - while (!list_is_empty(&at->pending_results)) { - const uint32_t gpu_fence = get_autotune_fence(at); - process_results(at, gpu_fence); - } - - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - - mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u", - history->key, history->avg_samples, history->num_results); - } - } - - tu_autotune_free_results(dev, &at->pending_results); - - mtx_lock(&dev->autotune_mutex); - hash_table_foreach(at->ht, entry) { - struct tu_renderpass_history *history = - (struct tu_renderpass_history *) entry->data; - free_history(dev, history); - } - mtx_unlock(&dev->autotune_mutex); - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->pending_submission_data, node) { - free_submission_data(submission_data); - } - - list_for_each_entry_safe(struct tu_submission_data, submission_data, - &at->submission_data_pool, node) { - free_submission_data(submission_data); - } - - _mesa_hash_table_destroy(at->ht, NULL); - u_rwlock_destroy(&at->ht_lock); + batch = std::make_shared(); } -bool -tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count) +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::attach_rp_entry(struct tu_device *device, + rp_history_handle &&history, + config_t config, + uint32_t drawcall_count) { - for (uint32_t i = 0; i < cmd_buffer_count; i++) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) - return true; + std::unique_ptr &new_entry = + batch->entries.emplace_back(std::make_unique(device, std::move(history), config, drawcall_count)); + return new_entry.get(); +} + +tu_autotune::rp_entry * +tu_autotune::cmd_buf_ctx::find_rp_entry(const rp_key &key) +{ + for (auto &entry : batch->entries) { + if (entry->history->hash == key.hash) + return entry.get(); } - - return false; + return nullptr; } -void -tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results) +tu_autotune::render_mode 
+tu_autotune::get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx) { - list_for_each_entry_safe(struct tu_renderpass_result, result, - results, node) { - free_result(dev, result); - } -} + const struct tu_cmd_state *cmd_state = &cmd_buffer->state; + const struct tu_render_pass *pass = cmd_state->pass; + const struct tu_framebuffer *framebuffer = cmd_state->framebuffer; + const struct tu_render_pass_state *rp_state = &cmd_state->rp; + cmd_buf_ctx &cb_ctx = cmd_buffer->autotune_ctx; + config_t config = active_config.load(); -void -tu_autotune_free_results(struct tu_device *dev, struct list_head *results) -{ - mtx_lock(&dev->autotune_mutex); - tu_autotune_free_results_locked(dev, results); - mtx_unlock(&dev->autotune_mutex); -} - -static bool -fallback_use_bypass(const struct tu_render_pass *pass, - const struct tu_framebuffer *framebuffer, - const struct tu_cmd_buffer *cmd_buffer) -{ - if (cmd_buffer->state.rp.drawcall_count > 5) - return false; - - for (unsigned i = 0; i < pass->subpass_count; i++) { - if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT) - return false; - } - - return true; -} - -static uint32_t -get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd) -{ - const VkExtent2D *extent = &cmd->state.render_area.extent; - return extent->width * extent->height; -} - -static uint64_t -estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd, - uint32_t avg_renderpass_sample_count) -{ - const struct tu_cmd_state *state = &cmd->state; - - if (!state->rp.drawcall_count) - return 0; - - /* sample count times drawcall_bandwidth_per_sample */ - return (uint64_t)avg_renderpass_sample_count * - state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count; -} - -bool -tu_autotune_use_bypass(struct tu_autotune *at, - struct tu_cmd_buffer *cmd_buffer, - struct tu_renderpass_result **autotune_result) -{ - const struct tu_render_pass *pass = cmd_buffer->state.pass; - const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer; + /* Just to ensure a segfault for accesses, in case we don't set it. */ + *rp_ctx = nullptr; /* If a feedback loop in the subpass caused one of the pipelines used to set - * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even - * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased - * sysmem bandwidth (though we haven't quantified it). + * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even SINGLE_PRIM_MODE(FLUSH), then that should cause + * significantly increased SYSMEM bandwidth (though we haven't quantified it). */ - if (cmd_buffer->state.rp.sysmem_single_prim_mode) - return false; + if (rp_state->sysmem_single_prim_mode) + return render_mode::GMEM; - /* If the user is using a fragment density map, then this will cause less - * FS invocations with GMEM, which has a hard-to-measure impact on - * performance because it depends on how heavy the FS is in addition to how - * many invocations there were and the density. Let's assume the user knows - * what they're doing when they added the map, because if sysmem is - * actually faster then they could've just not used the fragment density - * map. + /* If the user is using a fragment density map, then this will cause less FS invocations with GMEM, which has a + * hard-to-measure impact on performance because it depends on how heavy the FS is in addition to how many + * invocations there were and the density. 
Let's assume the user knows what they're doing when they added the map,
+    * because if SYSMEM is actually faster then they could've just not used the fragment density map.
     */
    if (pass->has_fdm)
-      return false;
+      return render_mode::GMEM;
 
-   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
-    * we would have to allocate GPU memory at the submit time and copy
-    * results into it.
-    * Native games ususally don't use it, Zink and DXVK don't use it,
-    * D3D12 doesn't have such concept.
+   /* SYSMEM is always a safe default mode when we can't fully engage the autotuner. From testing, we know that an
+    * incorrect decision towards SYSMEM tends to be far less impactful than an incorrect decision towards GMEM, which
+    * can cause significant performance issues.
     */
-   bool simultaneous_use =
-      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+   constexpr render_mode default_mode = render_mode::SYSMEM;
 
-   if (!at->enabled || simultaneous_use)
-      return fallback_use_bypass(pass, framebuffer, cmd_buffer);
-
-   /* We use 64bit hash as a key since we don't fear rare hash collision,
-    * the worst that would happen is sysmem being selected when it should
-    * have not, and with 64bit it would be extremely rare.
+   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers, we would have to allocate GPU memory at the submit time
+    * and copy results into it. We just disable the complex autotuner in this case, which isn't a big issue since native
+    * games usually don't use it, Zink and DXVK don't use it, and D3D12 doesn't even have such a concept.
     *
-    * Q: Why not make the key from framebuffer + renderpass pointers?
-    * A: At least DXVK creates new framebuffers each frame while keeping
-    *    renderpasses the same. Also we want to support replaying a single
-    *    frame in a loop for testing.
+    * We combine this with processing entries at submit time, to avoid a race where the CPU hasn't processed the results
+    * from an earlier submission of the CB while a second submission of the CB is on the GPU queue.
     */
-   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);
+   bool simultaneous_use = cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
 
-   *autotune_result = create_history_result(at, renderpass_key);
+   /* For these smaller RPs with few draws it is difficult to create a balanced hash: one that identifies each of them
+    * independently without being so unique that the same RP is no longer recognized across CBs. They're generally
+    * insignificant outside of a few edge cases, such as deferred rendering G-buffer passes; since we don't have a good
+    * way to deal with those edge cases yet, we just disable the autotuner for small RPs entirely for now unless
+    * TUNE_SMALL is specified.
+ */ + bool ignore_small_rp = !config.test(mod_flag::TUNE_SMALL) && rp_state->drawcall_count < 5; - uint32_t avg_samples = 0; - if (get_history(at, renderpass_key, &avg_samples)) { - const uint32_t pass_pixel_count = - get_render_pass_pixel_count(cmd_buffer); - uint64_t sysmem_bandwidth = - (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count; - uint64_t gmem_bandwidth = - (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count; + if (!enabled || simultaneous_use || ignore_small_rp) + return default_mode; - const uint64_t total_draw_call_bandwidth = - estimate_drawcall_bandwidth(cmd_buffer, avg_samples); + + /* We can return early with the decision based on the draw call count, instead of needing to hash the renderpass + * instance and look up the history, which is far more expensive. + * + * However, certain options such as latency sensitive mode take precedence over any of the other autotuner options + * and we cannot do so in those cases. + */ + bool can_early_return = !config.test(mod_flag::PREEMPT_OPTIMIZE); + auto early_return_mode = [&]() -> std::optional { + if (config.test(mod_flag::BIG_GMEM) && rp_state->drawcall_count >= 10) + return render_mode::GMEM; + if (config.is_enabled(algorithm::PREFER_SYSMEM)) + return render_mode::SYSMEM; + return std::nullopt; + }(); - /* drawcalls access the memory in sysmem rendering (ignoring CCU) */ - sysmem_bandwidth += total_draw_call_bandwidth; - - /* drawcalls access gmem in gmem rendering, but we do not want to ignore - * them completely. The state changes between tiles also have an - * overhead. The magic numbers of 11 and 10 are randomly chosen. - */ - gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; - - const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; - if (TU_AUTOTUNE_DEBUG_LOG) { - const VkExtent2D *extent = &cmd_buffer->state.render_area.extent; - const float drawcall_bandwidth_per_sample = - (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum / - cmd_buffer->state.rp.drawcall_count; - - mesa_logi("autotune %016" PRIx64 ":%u selecting %s", - renderpass_key, - cmd_buffer->state.rp.drawcall_count, - select_sysmem ? "sysmem" : "gmem"); - mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64, - avg_samples, - drawcall_bandwidth_per_sample, - total_draw_call_bandwidth); - mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u", - extent->width, extent->height, - pass->sysmem_bandwidth_per_pixel, - pass->gmem_bandwidth_per_pixel); - mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, - sysmem_bandwidth, gmem_bandwidth); - } - - return select_sysmem; + if (can_early_return && early_return_mode) { + at_log_base_h("%" PRIu32 " draw calls, using %s (early)", rp_key(pass, framebuffer, cmd_buffer).hash, + rp_state->drawcall_count, render_mode_str(*early_return_mode)); + return *early_return_mode; } - return fallback_use_bypass(pass, framebuffer, cmd_buffer); + rp_key key(pass, framebuffer, cmd_buffer); + + /* When nearly identical renderpasses appear multiple times within the same command buffer, we need to generate a + * unique hash for each instance to distinguish them. While this approach doesn't address identical renderpasses + * across different command buffers, it is good enough in most cases. 
+ */ + rp_entry *entry = cb_ctx.find_rp_entry(key); + if (entry) { + entry->duplicates++; + key = rp_key(key, entry->duplicates); + } + + *rp_ctx = cb_ctx.attach_rp_entry(device, find_or_create_rp_history(key), config, rp_state->drawcall_count); + rp_history &history = *((*rp_ctx)->history); + + if (config.test(mod_flag::PREEMPT_OPTIMIZE) && history.preempt_optimize.is_latency_sensitive()) { + /* Try to mitigate the risk of high preemption latency by always using GMEM, which should break up any larger + * draws into smaller ones with tiling. + */ + at_log_base_h("high preemption latency risk, using GMEM", key.hash); + return render_mode::GMEM; + } + + if (early_return_mode) { + at_log_base_h("%" PRIu32 " draw calls, using %s (late)", key.hash, rp_state->drawcall_count, + render_mode_str(*early_return_mode)); + return *early_return_mode; + } + + if (config.is_enabled(algorithm::PROFILED) || config.is_enabled(algorithm::PROFILED_IMM)) + return history.profiled.get_optimal_mode(history); + + if (config.is_enabled(algorithm::BANDWIDTH)) + return history.bandwidth.get_optimal_mode(history, cmd_state, pass, framebuffer, rp_state); + + return default_mode; +} + +uint32_t +tu_autotune::get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer) +{ + const struct tu_cmd_state *cmd_state = &cmd_buffer->state; + const struct tu_render_pass *pass = cmd_state->pass; + const struct tu_framebuffer *framebuffer = cmd_state->framebuffer; + const struct tu_render_pass_state *rp_state = &cmd_state->rp; + + if (!enabled || !active_config.load().test(mod_flag::PREEMPT_OPTIMIZE) || rp_state->sysmem_single_prim_mode || + pass->has_fdm || cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + return 1; + + rp_key key(pass, framebuffer, cmd_buffer); + rp_history *history = find_rp_history(key); + if (!history) { + at_log_base_h("no RP history found, using tile_size_divisor=1", key.hash); + return 1; + } + + uint32_t tile_size_divisor = history->preempt_optimize.get_tile_size_divisor(); + + return tile_size_divisor; } -template void -tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu_autotune::disable_preempt_optimize() { - if (!autotune_result) - return; - - struct tu_device *dev = cmd->device; - - static const uint32_t size = sizeof(struct tu_renderpass_samples); - - mtx_lock(&dev->autotune_mutex); - VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size); - mtx_unlock(&dev->autotune_mutex); - if (ret != VK_SUCCESS) { - autotune_result->bo.iova = 0; - return; - } - - uint64_t result_iova = autotune_result->bo.iova; - - autotune_result->samples = - (struct tu_renderpass_samples *) tu_suballoc_bo_map( - &autotune_result->bo); - - tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); - if (cmd->device->physical_device->info->props.has_event_write_sample_count) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true).value); - tu_cs_emit_qw(cs, result_iova); - - /* If the renderpass contains an occlusion query with its own ZPASS_DONE, - * we have to provide a fake ZPASS_DONE event here to logically close the - * previous one, preventing firmware from misbehaving due to nested events. - * This writes into the samples_end field, which will be overwritten in - * tu_autotune_end_renderpass. 
- */ - if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true, - .sample_count_end_offset = true, - .write_accum_sample_count_diff = true).value); - tu_cs_emit_qw(cs, result_iova); - } - } else { - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNTER_BASE(.qword = result_iova)); - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - } + config_t original, updated; + do { + original = updated = active_config.load(); + if (!original.test(mod_flag::PREEMPT_OPTIMIZE)) + return; /* Already disabled, nothing to do. */ + updated.disable(mod_flag::PREEMPT_OPTIMIZE); + } while (!active_config.compare_and_store(original, updated)); } -TU_GENX(tu_autotune_begin_renderpass); -template -void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +/** RP-level CS emissions **/ + +void +tu_autotune::begin_renderpass( + struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count) { - if (!autotune_result) + if (!rp_ctx) return; - if (!autotune_result->bo.iova) - return; + assert(sysmem || tile_count > 0); + assert(!sysmem || tile_count == 0); - uint64_t result_iova = autotune_result->bo.iova; - - tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); - - if (cmd->device->physical_device->info->props.has_event_write_sample_count) { - /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE - * event here, composing a pair of these events that firmware handles without - * issue. This first event writes into the samples_end field and the second - * event overwrites it. The second event also enables the accumulation flag - * even when we don't use that result because the blob always sets it. 
- */ - if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) { - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true).value); - tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end)); - } - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3); - tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE, - .write_sample_count = true, - .sample_count_end_offset = true, - .write_accum_sample_count_diff = true).value); - tu_cs_emit_qw(cs, result_iova); - } else { - result_iova += offsetof(struct tu_renderpass_samples, samples_end); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNTER_BASE(.qword = result_iova)); - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - } + rp_ctx->allocate(sysmem, tile_count); + rp_ctx->emit_rp_start(cmd, cs); +} + +void +tu_autotune::end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_rp_end(cmd, cs); +} + +/** Tile-level CS emissions **/ + +void +tu_autotune::begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_tile_start(cmd, cs, tile_idx); +} + +void +tu_autotune::end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx) +{ + if (!rp_ctx) + return; + + rp_ctx->emit_tile_end(cmd, cs, tile_idx); } -TU_GENX(tu_autotune_end_renderpass); diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index c374e86ab89..b9bcf6ee0da 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -8,150 +8,265 @@ #include "tu_common.h" -#include "util/hash_table.h" -#include "util/rwlock.h" +#include +#include +#include +#include +#include +#include +#include +#include "tu_cs.h" #include "tu_suballoc.h" -struct tu_renderpass_history; +/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */ +#define TU_AUTOTUNE_DEBUG_PERFCTR 0 -/** - * "autotune" our decisions about bypass vs GMEM rendering, based on historical - * data about a given render target. - * - * In deciding which path to take there are tradeoffs, including some that - * are not reasonably estimateable without having some additional information: - * - * (1) If you know you are touching every pixel (ie. there is a clear), - * then the GMEM path will at least not cost more memory bandwidth than - * sysmem[1] - * - * (2) If there is no clear, GMEM could potentially cost *more* bandwidth - * if there is sysmem->GMEM restore pass. - * - * (3) If you see a high draw count, that is an indication that there will be - * enough pixels accessed multiple times to benefit from the reduced - * memory bandwidth that GMEM brings - * - * (4) But high draw count where there is not much overdraw can actually be - * faster in bypass mode if it is pushing a lot of state change, due to - * not having to go thru the state changes per-tile[1] - * - * The approach taken is to measure the samples-passed for the batch to estimate - * the amount of overdraw to detect cases where the number of pixels touched is - * low. - * - * [1] ignoring early-tile-exit optimizations, but any draw that touches all/ - * most of the tiles late in the tile-pass can defeat that +/* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on + * dynamic analysis of the rendering workload via on-GPU profiling. 
This lets us make much better decisions than static + * analysis, since we can adapt to the actual workload rather than relying on heuristics. */ struct tu_autotune { - - /* We may have to disable autotuner if there are too many - * renderpasses in-flight. - */ - bool enabled; - + private: + bool enabled = true; struct tu_device *device; - /** - * Cache to map renderpass key to historical information about - * rendering to that particular render target. - */ - struct hash_table *ht; - struct u_rwlock ht_lock; + /** Configuration **/ - /** - * List of per-renderpass results that we are waiting for the GPU - * to finish with before reading back the results. - */ - struct list_head pending_results; + enum class algorithm : uint8_t; + enum class mod_flag : uint8_t; + enum class metric_flag : uint8_t; + /* Container for all autotune configuration options. */ + struct PACKED config_t; + union PACKED packed_config_t; - /** - * List of per-submission data that we may want to free after we - * processed submission results. - * This could happend after command buffers which were in the submission - * are destroyed. - */ - struct list_head pending_submission_data; + /* Allows for thread-safe access to the configurations. */ + struct atomic_config_t { + private: + std::atomic config_bits = 0; - /** - * List of per-submission data that has been finished and can be reused. - */ - struct list_head submission_data_pool; + public: + atomic_config_t(config_t initial_config); - uint32_t fence_counter; - uint32_t idx_counter; + config_t load() const; + + bool compare_and_store(config_t updated, config_t expected); + } active_config; + + config_t get_env_config(); + + /** Global Fence and Internal CS Management **/ + + /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers. + * Synchronized by suballoc_mutex. + */ + struct tu_suballocator suballoc; + std::mutex suballoc_mutex; + + /* The next value to assign to tu6_global::autotune_fence, this is incremented during on_submit. */ + uint32_t next_fence = 1; + + /* A wrapper around a CS which sets the global autotune fence to a certain fence value, this allows for ergonomically + * managing the lifetime of the CS including recycling it after the fence value has been reached. + */ + struct submission_entry { + private: + uint32_t fence; + struct tu_cs fence_cs; + + public: + explicit submission_entry(tu_device *device); + + ~submission_entry(); + + /* Disable move/copy, since this holds stable pointers to the fence_cs. */ + submission_entry(const submission_entry &) = delete; + submission_entry &operator=(const submission_entry &) = delete; + submission_entry(submission_entry &&) = delete; + submission_entry &operator=(submission_entry &&) = delete; + + /* The current state of the submission entry, this is used to track whether the CS is available for reuse, pending + * GPU completion or currently being processed. + */ + bool is_active() const; + + /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */ + struct tu_cs *try_get_cs(uint32_t new_fence); + }; + + /* Unified pool for submission CSes. + * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry. + */ + std::deque submission_entries; + + /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. 
*/
+   struct tu_cs *get_cs_for_fence(uint32_t fence);
+
+   /** RP Entry Management **/
+
+   struct rp_gpu_data;
+   struct tile_gpu_data;
+   struct rp_entry;
+
+   /* A wrapper over all entries associated with a single command buffer. */
+   struct rp_entry_batch {
+      bool active;    /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
+                         valid fence. */
+      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
+                         determine when the entries can be processed. */
+      std::vector<std::unique_ptr<rp_entry>> entries;
+
+      rp_entry_batch();
+
+      /* Disable the copy/move to avoid performance hazards. */
+      rp_entry_batch(const rp_entry_batch &) = delete;
+      rp_entry_batch &operator=(const rp_entry_batch &) = delete;
+      rp_entry_batch(rp_entry_batch &&) = delete;
+      rp_entry_batch &operator=(rp_entry_batch &&) = delete;
+
+      void assign_fence(uint32_t new_fence);
+   };
+
+   /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
+    * iteration and to ensure that we process the entries in the same order they were submitted.
+    */
+   std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
+
+   /* Handles processing of entry batches that are pending to be processed.
+    *
+    * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
+    *       in the on_submit() method, which is called on every submit of a command buffer.
+    */
+   void process_entries();
+
+   /** Renderpass State Tracking **/
+
+   struct rp_history;
+   struct rp_history_handle;
+
+   /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
+    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
+    *
+    * Note: We can potentially include the vector of data we extract from the parameters to generate the hash into
+    *       rp_key, which would lead to true value-based equality rather than just hash-based equality; that has a cost
+    *       but avoids hash collisions causing issues.
+    */
+   struct rp_key {
+      uint64_t hash;
+
+      rp_key(const struct tu_render_pass *pass,
+             const struct tu_framebuffer *framebuffer,
+             const struct tu_cmd_buffer *cmd);
+
+      /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
+      rp_key(const rp_key &key, uint32_t duplicates);
+
+      /* Equality operator, used in unordered_map. */
+      constexpr bool operator==(const rp_key &other) const noexcept
+      {
+         return hash == other.hash;
+      }
+   };
+
+   /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
+    *
+    * Note: This should *NEVER* be used to calculate the hash itself as it would lead to the hash being calculated
+    *       multiple times, rather than being calculated once and reused when there are multiple successive lookups like
+    *       with find_or_create_rp_history() and providing the hash to the rp_history constructor.
+    */
+   struct rp_hash {
+      constexpr size_t operator()(const rp_key &key) const noexcept
+      {
+         /* Note: This will throw away the upper 32-bits on 32-bit architectures. */
+         return static_cast<size_t>(key.hash);
+      }
+   };
+
+   /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
+   using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
+   rp_histories_t rp_histories;
+   std::shared_mutex rp_mutex;
+   uint64_t last_reap_ts = 0;
+
+   /* Note: These will lock rp_mutex internally, no need to lock it.
*/ + rp_history_handle find_rp_history(const rp_key &key); + rp_history_handle find_or_create_rp_history(const rp_key &key); + void reap_old_rp_histories(); + + /** Debug Performance Counters **/ + +#if TU_AUTOTUNE_DEBUG_PERFCTR + const fd_perfcntr_group *cp_group; + const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count; +#endif + + public: + tu_autotune(struct tu_device *device, VkResult &result); + + ~tu_autotune(); + + /* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */ + using rp_ctx_t = rp_entry *; + + /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB. + * + * Note: tu_cmd_buffer is only responsible for the lifetime of this object, all the access to the context state is + * done through tu_autotune. + */ + struct cmd_buf_ctx { + private: + /* A batch of all entries from RPs within this CB. */ + std::shared_ptr batch; + + /* Creates a new RP entry attached to this CB. */ + rp_entry * + attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count); + + rp_entry *find_rp_entry(const rp_key &key); + + friend struct tu_autotune; + + public: + cmd_buf_ctx(); + ~cmd_buf_ctx(); + + /* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */ + void reset(); + }; + + enum class render_mode { + SYSMEM, + GMEM, + }; + + render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx); + + /* Returns the optimal tile size divisor for the given CB state. */ + uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer); + + /* Disables preemption latency optimization within the autotuner, this is used when high-priority queues are used to + * ensure that the autotuner does not interfere with the high-priority queue's performance. + * + * Note: This should be called before any renderpass is started, otherwise it may lead to undefined behavior. + */ + void disable_preempt_optimize(); + + void + begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count); + + void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx); + + void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx); + + void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx); + + /* The submit-time hook for autotuner, this may return a CS (can be NULL) which must be amended for autotuner + * tracking to function correctly. + * + * Note: This must be called from a single-threaded context. There should never be multiple threads calling this + * function at the same time. + */ + struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count); }; -/** - * From the cmdstream, the captured samples-passed values are recorded - * at the start and end of the batch. - * - * Note that we do the math on the CPU to avoid a WFI. But pre-emption - * may force us to revisit that. - */ -struct PACKED tu_renderpass_samples { - uint64_t samples_start; - /* hw requires the sample start/stop locations to be 128b aligned. */ - uint64_t __pad0; - uint64_t samples_end; - uint64_t __pad1; -}; - -/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */ -static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16); - -/** - * Tracks the results from an individual renderpass. 
Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. - */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - struct tu_suballoc_bo bo; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - -VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev); -void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev); - -bool tu_autotune_use_bypass(struct tu_autotune *at, - struct tu_cmd_buffer *cmd_buffer, - struct tu_renderpass_result **autotune_result); -void tu_autotune_free_results(struct tu_device *dev, struct list_head *results); - -bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count); - -/** - * A magic 8-ball that tells the gmem code whether we should do bypass mode - * for moar fps. - */ -struct tu_cs *tu_autotune_on_submit(struct tu_device *dev, - struct tu_autotune *at, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count); - -struct tu_autotune_results_buffer; - -template -void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result); - -template -void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - struct tu_renderpass_result *autotune_result); - -#endif /* TU_AUTOTUNE_H */ +#endif /* TU_AUTOTUNE_H */ \ No newline at end of file diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index 5ba807930ef..a2c75744266 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) } } - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; + cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd); + + cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass, + cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); } struct apply_store_coords_state { diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index e734241aeaa..0281ce7b857 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -14,6 +14,7 @@ #include "vk_render_pass.h" #include "vk_util.h" +#include "tu_autotune.h" #include "tu_buffer.h" #include "tu_clear_blit.h" #include "tu_cs.h" @@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling) static bool use_hw_binning(struct tu_cmd_buffer *cmd) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout]; + struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = + tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); /* XFB commands are emitted for BINNING || SYSMEM, which makes it @@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd) return true; } - return vsc->binning; + return vsc->binning_possible && vsc->binning_useful; } static bool use_sysmem_rendering(struct 
tu_cmd_buffer *cmd, - struct tu_renderpass_result **autotune_result) + tu_autotune::rp_ctx_t *rp_ctx) { if (TU_DEBUG(SYSMEM)) { cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)"; @@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; } - if (TU_DEBUG(GMEM)) + if (TU_DEBUG(GMEM)) { + cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)"; return false; - - bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune, - cmd, autotune_result); - if (*autotune_result) { - list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results); } - if (use_sysmem) { + /* This is a case where it's better to avoid GMEM, too many tiles but no HW binning possible. */ + if (!vsc->binning_possible && vsc->binning_useful) { + cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible"; + return true; + } + + bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM; + if (use_sysmem) cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem"; - } return use_sysmem; } @@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd, template static void tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { const struct tu_framebuffer *fb = cmd->state.framebuffer; @@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP)); } - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0); tu_cs_sanity_check(cs); } @@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { - tu_autotune_end_renderpass(cmd, cs, autotune_result); - /* Do any resolves of the last subpass. These are handled in the * tile_store_cs in the gmem path. 
*/ @@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit(cs, 0); /* value */ } + cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx); + tu_cs_sanity_check(cs); } @@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result, + tu_autotune::rp_ctx_t rp_ctx, const VkOffset2D *fdm_offsets) { struct tu_physical_device *phys_dev = cmd->device->physical_device; @@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (use_cb) tu_trace_start_render_pass(cmd); - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height; + cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count); tu_cs_sanity_check(cs); } @@ -3471,13 +3476,18 @@ template static void tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_tile_config *tile, - bool fdm, const VkOffset2D *fdm_offsets) + bool fdm, const VkOffset2D *fdm_offsets, + tu_autotune::rp_ctx_t rp_ctx, + const struct tu_vsc_config *vsc) { + uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x; tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm, fdm_offsets); tu_lrz_before_tile(cmd, &cmd->cs); trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd); + cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx); + /* Primitives that passed all tests are still counted in in each * tile even with HW binning beforehand. Do not permit it. */ @@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (cmd->state.prim_generated_query_running_before_rp) tu_emit_event_write(cmd, cs, FD_START_PRIMITIVE_CTRS); + cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx); + if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) | @@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, template static void tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { - tu_autotune_end_renderpass(cmd, cs, autotune_result); - tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); tu_lrz_tiling_end(cmd, cs); @@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE); + cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx); + tu_cs_sanity_check(cs); } @@ -3796,7 +3808,9 @@ void tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2, const struct tu_image_view *fdm, - const VkOffset2D *fdm_offsets) + const VkOffset2D *fdm_offsets, + tu_autotune::rp_ctx_t rp_ctx, + const struct tu_vsc_config *vsc) { uint32_t width = tx2 - tx1; uint32_t height = ty2 - ty1; @@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, continue; tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], - true, fdm_offsets); + true, fdm_offsets, + rp_ctx, vsc); } } } @@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem) template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result, + tu_autotune::rp_ctx_t rp_ctx, const VkOffset2D *fdm_offsets) { const struct tu_tiling_config *tiling = 
cmd->state.tiling; @@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_emit_tile_store_cs(cmd, &cmd->tile_store_cs); tu_cs_end(&cmd->tile_store_cs); - tu6_tile_render_begin(cmd, &cmd->cs, autotune_result, fdm_offsets); + tu6_tile_render_begin(cmd, &cmd->cs, rp_ctx, fdm_offsets); /* Note: we reverse the order of walking the pipes and tiles on every * other row, to improve texture cache locality compared to raster order. @@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, if (merge_tiles) { tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm, - fdm_offsets); + fdm_offsets, rp_ctx, vsc); continue; } @@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets); tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm, - fdm_offsets); + fdm_offsets, + rp_ctx, vsc); } slot_row += tile_row_stride; } } } - tu6_tile_render_end(cmd, &cmd->cs, autotune_result); + tu6_tile_render_end(cmd, &cmd->cs, rp_ctx); tu_trace_end_render_pass(cmd, true); @@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, template static void tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result) + tu_autotune::rp_ctx_t rp_ctx) { VkResult result = tu_allocate_transient_attachments(cmd, true); @@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, tu_trace_start_render_pass(cmd); - tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_begin(cmd, &cmd->cs, rp_ctx); trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd); @@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); - tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); + tu6_sysmem_render_end(cmd, &cmd->cs, rp_ctx); tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace, cmd->trace_renderpass_start, @@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer, if (cmd_buffer->state.rp.has_tess) tu6_lazy_emit_tessfactor_addr(cmd_buffer); - struct tu_renderpass_result *autotune_result = NULL; - if (use_sysmem_rendering(cmd_buffer, &autotune_result)) - tu_cmd_render_sysmem(cmd_buffer, autotune_result); + tu_autotune::rp_ctx_t rp_ctx = NULL; + if (use_sysmem_rendering(cmd_buffer, &rp_ctx)) + tu_cmd_render_sysmem(cmd_buffer, rp_ctx); else - tu_cmd_render_tiles(cmd_buffer, autotune_result, fdm_offsets); + tu_cmd_render_tiles(cmd_buffer, rp_ctx, fdm_offsets); /* Outside of renderpasses we assume all draw states are disabled. 
We do * this outside the draw CS for the normal case where 3d gmem stores aren't @@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->state.attachments = NULL; cmd_buffer->state.clear_values = NULL; cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */ + cmd_buffer->state.gmem_layout_divisor = 0; cmd_buffer->state.renderpass_cb_disabled = false; memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); @@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool, u_trace_init(&cmd_buffer->rp_trace, &device->trace_context); cmd_buffer->trace_renderpass_start = u_trace_begin_iterator(&cmd_buffer->rp_trace); - list_inithead(&cmd_buffer->renderpass_autotune_results); + new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx(); if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) { cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device); @@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) u_trace_fini(&cmd_buffer->trace); u_trace_fini(&cmd_buffer->rp_trace); - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); + cmd_buffer->autotune_ctx.~cmd_buf_ctx(); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { if (cmd_buffer->descriptors[i].push_set.layout) @@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, tu_cs_reset(&cmd_buffer->pre_chain.draw_cs); tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs); - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); + cmd_buffer->autotune_ctx.reset(); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); @@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, cmd->state.clear_values = suspended->state.suspended_pass.clear_values; cmd->state.render_area = suspended->state.suspended_pass.render_area; cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout; - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; + cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor; + cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass, + cmd->state.gmem_layout, cmd->state.gmem_layout_divisor); cmd->state.lrz = suspended->state.suspended_pass.lrz; } @@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r * (perf queries), then we can't do this optimization since the * start-of-the-CS geometry condition will have been overwritten. 
*/ - bool cond_load_allowed = vsc->binning && + bool cond_load_allowed = vsc->binning_possible && cmd->state.pass->has_cond_load_store && !cmd->state.rp.draw_cs_writes_to_cond_pred; @@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.suspended_pass.attachments = cmd->state.attachments; cmd->state.suspended_pass.clear_values = cmd->state.clear_values; cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout; + cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor; } tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 4e974e12827..0f8aa1500d6 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -524,11 +524,12 @@ struct tu_cmd_state /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU * might get used by tu_store_gmem_attachment(). */ - enum tu_gmem_layout gmem_layout; + tu_gmem_layout gmem_layout; + uint32_t gmem_layout_divisor; const struct tu_render_pass *pass; const struct tu_subpass *subpass; - const struct tu_framebuffer *framebuffer; + struct tu_framebuffer *framebuffer; const struct tu_tiling_config *tiling; VkRect2D render_area; @@ -543,9 +544,10 @@ struct tu_cmd_state struct { const struct tu_render_pass *pass; const struct tu_subpass *subpass; - const struct tu_framebuffer *framebuffer; + struct tu_framebuffer *framebuffer; VkRect2D render_area; enum tu_gmem_layout gmem_layout; + uint32_t gmem_layout_divisor; const struct tu_image_view **attachments; VkClearValue *clear_values; @@ -644,8 +646,7 @@ struct tu_cmd_buffer struct u_trace_iterator trace_renderpass_start; struct u_trace trace, rp_trace; - struct list_head renderpass_autotune_results; - struct tu_autotune_results_buffer* autotune_buffer; + tu_autotune::cmd_buf_ctx autotune_ctx; void *patchpoints_ctx; struct util_dynarray fdm_bin_patchpoints; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index dceb5227116..d593fbfc26c 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false) DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false) DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) + DRI_CONF_TU_AUTOTUNE_ALGORITHM() DRI_CONF_SECTION_END }; @@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction"); instance->enable_softfloat32 = driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32"); + instance->autotune_algo = + driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); } static uint32_t instance_count = 0; @@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device) { mtx_destroy(&device->bo_mutex); mtx_destroy(&device->pipeline_mutex); - mtx_destroy(&device->autotune_mutex); mtx_destroy(&device->kgsl_profiling_mutex); mtx_destroy(&device->event_mutex); mtx_destroy(&device->trace_mutex); @@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, VkResult result; struct tu_device *device; bool border_color_without_format = false; + bool autotune_disable_preempt_optimize = false; vk_foreach_struct_const (ext, pCreateInfo->pNext) { switch (ext->sType) { @@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->bo_mutex, mtx_plain); 
mtx_init(&device->pipeline_mutex, mtx_plain); - mtx_init(&device->autotune_mutex, mtx_plain); mtx_init(&device->kgsl_profiling_mutex, mtx_plain); mtx_init(&device->event_mutex, mtx_plain); mtx_init(&device->trace_mutex, mtx_plain); @@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; + const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = + vk_find_struct_const(queue_create->pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + const VkQueueGlobalPriorityKHR global_priority = priority_info ? + priority_info->globalPriority : + (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR : + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR); uint32_t qfi = queue_create->queueFamilyIndex; enum tu_queue_type type = physical_device->queue_families[qfi].type; device->queues[qfi] = (struct tu_queue *) vk_alloc( @@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->queue_count[qfi] = queue_create->queueCount; for (unsigned q = 0; q < queue_create->queueCount; q++) { - result = tu_queue_init(device, &device->queues[qfi][q], type, q, - queue_create); + result = tu_queue_init(device, &device->queues[qfi][q], type, + global_priority, q, queue_create); if (result != VK_SUCCESS) { device->queue_count[qfi] = q; goto fail_queues; } } + + autotune_disable_preempt_optimize |= + (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR); } result = vk_meta_device_init(&device->vk, &device->meta); @@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_INTERNAL_RESOURCE), "pipeline_suballoc"); - tu_bo_suballocator_init(&device->autotune_suballoc, device, - 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, - "autotune_suballoc"); if (is_kgsl(physical_device->instance)) { tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, @@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, } pthread_condattr_destroy(&condattr); - result = tu_autotune_init(&device->autotune, device); - if (result != VK_SUCCESS) { + device->autotune = new tu_autotune(device, result); + if (result != VK_SUCCESS) goto fail_timeline_cond; - } + + if (autotune_disable_preempt_optimize) + device->autotune->disable_preempt_optimize(); device->use_z24uint_s8uint = physical_device->info->props.has_z24uint_s8uint && @@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) free(device->dbg_renderpass_stomp_cs); } - tu_autotune_fini(&device->autotune, device); + delete device->autotune; tu_bo_suballocator_finish(&device->pipeline_suballoc); - tu_bo_suballocator_finish(&device->autotune_suballoc); tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); tu_bo_suballocator_finish(&device->event_suballoc); tu_bo_suballocator_finish(&device->vis_stream_suballocator); @@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device, } } - tu_framebuffer_tiling_config(framebuffer, device, pass); + tu_framebuffer_init_tiling_config(framebuffer, device, pass); /* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */ if (msrtss_attachment_count > 0) { @@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer, view->image->max_tile_h_constraint_fdm; } - tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass); + tu_framebuffer_init_tiling_config(framebuffer, 
cmd_buffer->device, pass); } VkResult diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 08c102ae145..dffb2c3f001 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -28,6 +28,7 @@ #include "common/freedreno_rd_output.h" #include "util/vma.h" #include "util/u_vector.h" +#include "util/rwlock.h" /* queue types */ #define TU_QUEUE_GENERAL 0 @@ -233,6 +234,9 @@ struct tu_instance * However we don't want native Vulkan apps using this. */ bool enable_softfloat32; + + /* Configuration option to use a specific autotune algorithm by default. */ + const char *autotune_algo; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) @@ -265,7 +269,12 @@ struct tu6_global volatile uint32_t vtx_stats_query_not_running; - /* To know when renderpass stats for autotune are valid */ + /* A fence with a monotonically increasing value that is + * incremented by the GPU on each submission that includes + * a tu_autotune::submission_entry CS. This is used to track + * which submissions have been processed by the GPU before + * processing the autotune packet on the CPU. + */ volatile uint32_t autotune_fence; /* For recycling command buffers for dynamic suspend/resume comamnds */ @@ -355,12 +364,6 @@ struct tu_device struct tu_suballocator pipeline_suballoc; mtx_t pipeline_mutex; - /* Device-global BO suballocator for reducing BO management for small - * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. - */ - struct tu_suballocator autotune_suballoc; - mtx_t autotune_mutex; - /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on * each submission. */ @@ -462,7 +465,7 @@ struct tu_device pthread_cond_t timeline_cond; pthread_mutex_t submit_mutex; - struct tu_autotune autotune; + struct tu_autotune *autotune; struct breadcrumbs_context *breadcrumbs_ctx; @@ -547,8 +550,11 @@ struct tu_vsc_config { /* Whether binning could be used for gmem rendering using this framebuffer. */ bool binning_possible; - /* Whether binning should be used for gmem rendering using this framebuffer. */ - bool binning; + /* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether + * binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance + * hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so. + */ + bool binning_useful; /* pipe register values */ uint32_t pipe_config[MAX_VSC_PIPES]; @@ -577,7 +583,8 @@ struct tu_framebuffer uint32_t max_tile_w_constraint; uint32_t max_tile_h_constraint; - struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT]; + uint32_t initd_divisor; /* The tile divisors up to this have been initialized, for lazy init. 
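+ * The entry for a given (gmem_layout, divisor) pair is stored at
+ * tiling[TU_GMEM_LAYOUT_COUNT * (divisor - 1) + gmem_layout]; divisors above initd_divisor are
+ * filled in lazily by tu_framebuffer_get_tiling_config().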
 */ + struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX]; uint32_t attachment_count; const struct tu_image_view *attachments[0]; diff --git a/src/freedreno/vulkan/tu_pass.h b/src/freedreno/vulkan/tu_pass.h index da92babc657..5dc515f8db6 100644 --- a/src/freedreno/vulkan/tu_pass.h +++ b/src/freedreno/vulkan/tu_pass.h @@ -22,6 +22,8 @@ enum tu_gmem_layout TU_GMEM_LAYOUT_COUNT, }; +constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* divisors 1 (no division) through 6 (1/6 of the base tile size) */ + struct tu_subpass_barrier { VkPipelineStageFlags2 src_stage_mask; VkPipelineStageFlags2 dst_stage_mask; diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index a87a73f0cd4..7563e2c3b45 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) struct tu_device *device = queue->device; bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); struct util_dynarray dump_cmds; + struct tu_cs *autotune_cs = NULL; if (vk_submit->buffer_bind_count || vk_submit->image_bind_count || @@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) } } - if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { - struct tu_cs *autotune_cs = tu_autotune_on_submit( - device, &device->autotune, cmd_buffers, cmdbuf_count); + autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count); + if (autotune_cs) { submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries, autotune_cs->entry_count); } @@ -605,17 +605,10 @@ VkResult tu_queue_init(struct tu_device *device, struct tu_queue *queue, enum tu_queue_type type, + const VkQueueGlobalPriorityKHR global_priority, int idx, const VkDeviceQueueCreateInfo *create_info) { - const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = - vk_find_struct_const(create_info->pNext, - DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); - const VkQueueGlobalPriorityKHR global_priority = priority_info ? - priority_info->globalPriority : - (TU_DEBUG(HIPRIO) ?
VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR : - VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR); - const int priority = tu_get_submitqueue_priority( device->physical_device, global_priority, type, device->vk.enabled_features.globalPriorityQuery); diff --git a/src/freedreno/vulkan/tu_queue.h b/src/freedreno/vulkan/tu_queue.h index 28925bfcb50..278756a43af 100644 --- a/src/freedreno/vulkan/tu_queue.h +++ b/src/freedreno/vulkan/tu_queue.h @@ -43,6 +43,7 @@ VkResult tu_queue_init(struct tu_device *device, struct tu_queue *queue, enum tu_queue_type type, + const VkQueueGlobalPriorityKHR global_priority, int idx, const VkDeviceQueueCreateInfo *create_info); diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc index e19d43bb8a9..ffd2975659b 100644 --- a/src/freedreno/vulkan/tu_util.cc +++ b/src/freedreno/vulkan/tu_util.cc @@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc) return tiles_per_pipe <= 32; } +static void +tu_tiling_config_divide_tile(const struct tu_device *dev, + const struct tu_render_pass *pass, + const struct tu_framebuffer *fb, + const struct tu_tiling_config *tiling, + struct tu_tiling_config *new_tiling, + uint32_t divisor) +{ + assert(divisor > 0); + + *new_tiling = *tiling; + if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) { + /* If the divisor is 1, or if the tiling is not possible, or if the + * tiling is invalid, just return the original tiling. */ + return; + } + + /* Get the hardware-specified alignment values. */ + const uint32_t tile_align_w = pass->tile_align_w; + const uint32_t tile_align_h = dev->physical_device->info->tile_align_h; + + /* Divide the current tile dimensions by the divisor. */ + uint32_t new_tile_width = tiling->tile0.width / divisor; + uint32_t new_tile_height = tiling->tile0.height / divisor; + + /* Clamp to the minimum alignment if necessary and align down. */ + if (new_tile_width < tile_align_w) + new_tile_width = tile_align_w; + else + new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w); + + if (new_tile_height < tile_align_h) + new_tile_height = tile_align_h; + else + new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h); + + new_tiling->tile0.width = new_tile_width; + new_tiling->tile0.height = new_tile_height; + + /* Recalculate the tile count from the framebuffer dimensions to ensure + * full coverage. 
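+ *
+ * For example (illustrative numbers only, assuming a 32x16 tile alignment): dividing a 256x192 base tile
+ * by 2 gives a 128x96 tile, so a 1920x1080 framebuffer would end up with
+ * DIV_ROUND_UP(1920, 128) x DIV_ROUND_UP(1080, 96) = 15 x 12 tiles.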
*/ + new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width); + new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height); +} + static void tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc, const struct tu_device *dev, @@ -460,22 +505,18 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc, static void tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device) { - if (vsc->binning_possible) { - vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2; + vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2; - if (TU_DEBUG(FORCEBIN)) - vsc->binning = true; - if (TU_DEBUG(NOBIN)) - vsc->binning = false; - } else { - vsc->binning = false; - } + if (TU_DEBUG(FORCEBIN)) + vsc->binning_useful = true; + if (TU_DEBUG(NOBIN)) + vsc->binning_useful = false; } void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass) +tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass) { for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) { struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; @@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, tu_tiling_config_update_binning(fdm_offset_vsc, device); } } + + fb->initd_divisor = 1; +} + +const struct tu_tiling_config * +tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass, + int gmem_layout, + uint32_t divisor) +{ + assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX); + assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to + appropriately size the tiles for the framebuffer.*/ + struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout]; + + if (divisor > fb->initd_divisor) { + const struct tu_tiling_config *base_tiling = + tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1); + tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor); + + struct tu_vsc_config *vsc = &tiling->vsc; + if (tiling->possible) { + tu_tiling_config_update_pipe_layout(vsc, device, false); + tu_tiling_config_update_pipes(vsc, device); + tu_tiling_config_update_binning(vsc, device); + + struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc; + fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 }; + } + + if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */ + (vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */ + (vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */ + ) { + /* Revert to the previous level's tiling configuration. */ + *tiling = *base_tiling; + } + + fb->initd_divisor = divisor; + } + + return tiling; } void diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h index 7ce6d3e053a..b1ed4354e39 100644 --- a/src/freedreno/vulkan/tu_util.h +++ b/src/freedreno/vulkan/tu_util.h @@ -136,9 +136,16 @@ __tu_finishme(const char *file, int line, const char *format, ...) 
} while (0) void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass); +tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass); + +const struct tu_tiling_config * +tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb, + const struct tu_device *device, + const struct tu_render_pass *pass, + int gmem_layout, + uint32_t divisor); #define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) diff --git a/src/util/driconf.h b/src/util/driconf.h index 42a1c213df1..1ec69d1bd09 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -657,6 +657,10 @@ DRI_CONF_OPT_B(tu_enable_softfloat32, def, \ "Enable softfloat emulation for float32 denormals") +#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \ + DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ + "Set the preferred autotune algorithm") + /** * \brief Honeykrisp specific configuration options */ diff --git a/src/util/rand_xor.h b/src/util/rand_xor.h index b55598f228a..830c6c3e727 100644 --- a/src/util/rand_xor.h +++ b/src/util/rand_xor.h @@ -28,10 +28,18 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + uint64_t rand_xorshift128plus(uint64_t seed[2]); void s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed); +#ifdef __cplusplus +} /* end of extern "C" */ +#endif + #endif /* RAND_XOR_H */ diff --git a/src/util/u_math.h b/src/util/u_math.h index 354683bb4ce..2c5f97b9875 100644 --- a/src/util/u_math.h +++ b/src/util/u_math.h @@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment) return ((value) & ~(uint64_t)(alignment - 1)); } +static inline uint64_t +ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment) +{ + return value - (value % alignment); +} + /** * Align a value, only works pot alignemnts. */
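
Illustrative sketch (not part of the patch): the new ROUND_DOWN_TO_NPOT helper rounds a value down to a multiple of an alignment that need not be a power of two, which is what the tile-division code above relies on when aligning divided tile sizes. The values below are made up for demonstration.

#include <assert.h>
#include <stdint.h>

/* Same definition as the u_math.h hunk above. */
static inline uint64_t
ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
{
   return value - (value % alignment);
}

int
main(void)
{
   /* 96 is the largest multiple of 24 that does not exceed 100. */
   assert(ROUND_DOWN_TO_NPOT(100, 24) == 96);
   /* For power-of-two alignments it behaves like the existing ROUND_DOWN_TO. */
   assert(ROUND_DOWN_TO_NPOT(100, 32) == 96);
   return 0;
}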