mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-16 14:08:07 +02:00
tu: Rewrite autotune in C++
Completely overhauls the autotuner in C++ with the functionality being extended as well. Signed-off-by: Dhruv Mark Collins <mark@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37802>
This commit is contained in:
parent
bd88997c0a
commit
40ffc052af
9 changed files with 1234 additions and 842 deletions
|
|
@ -670,3 +670,38 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
|
||||||
Some of these options will behave differently when toggled at runtime, for example:
|
Some of these options will behave differently when toggled at runtime, for example:
|
||||||
``nolrz`` will still result in LRZ allocation which would not happen if the option
|
``nolrz`` will still result in LRZ allocation which would not happen if the option
|
||||||
was set in the environment variable.
|
was set in the environment variable.
|
||||||
|
|
||||||
|
Autotune
|
||||||
|
^^^^^^^^
|
||||||
|
|
||||||
|
Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
|
||||||
|
autotune system, the behavior of which can be controlled with the following
|
||||||
|
environment variables:
|
||||||
|
|
||||||
|
.. envvar:: TU_AUTOTUNE_ALGO
|
||||||
|
|
||||||
|
Selects the algorithm used for autotuning. Supported values are:
|
||||||
|
|
||||||
|
``bandwidth``
|
||||||
|
Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
|
||||||
|
the one with lower estimated bandwidth. This is the default algorithm.
|
||||||
|
|
||||||
|
.. envvar:: TU_AUTOTUNE_FLAGS
|
||||||
|
|
||||||
|
Modifies the behavior of the selected algorithm. Supported flags are:
|
||||||
|
|
||||||
|
``big_gmem``
|
||||||
|
Always chooses GMEM rendering if the number of draw calls in the render pass
|
||||||
|
is greater than a certain threshold. Larger RPs generally benefit more from
|
||||||
|
GMEM rendering due to less overhead from tiling. This tends to lead to worse
|
||||||
|
performance in most cases, so it's only useful for testing.
|
||||||
|
|
||||||
|
``small_sysmem``
|
||||||
|
Always chooses SYSMEM rendering if the number of draw calls in the render pass
|
||||||
|
is lower than a certain threshold. The benefits of GMEM rendering are less
|
||||||
|
pronounced in these smaller RPs and SYSMEM rendering tends to win more often.
|
||||||
|
|
||||||
|
Multiple flags can be combined by separating them with commas, e.g.
|
||||||
|
``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.
|
||||||
|
|
||||||
|
If no flags are specified, the default behavior is used.
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -8,150 +8,237 @@
|
||||||
|
|
||||||
#include "tu_common.h"
|
#include "tu_common.h"
|
||||||
|
|
||||||
#include "util/hash_table.h"
|
#include <atomic>
|
||||||
#include "util/rwlock.h"
|
#include <deque>
|
||||||
|
#include <memory>
|
||||||
|
#include <mutex>
|
||||||
|
#include <shared_mutex>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "tu_cs.h"
|
||||||
#include "tu_suballoc.h"
|
#include "tu_suballoc.h"
|
||||||
|
|
||||||
struct tu_renderpass_history;
|
/* Autotune allows us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
|
||||||
|
* dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
|
||||||
/**
|
* analysis, since we can adapt to the actual workload rather than relying on heuristics.
|
||||||
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
|
|
||||||
* data about a given render target.
|
|
||||||
*
|
|
||||||
* In deciding which path to take there are tradeoffs, including some that
|
|
||||||
* are not reasonably estimateable without having some additional information:
|
|
||||||
*
|
|
||||||
* (1) If you know you are touching every pixel (ie. there is a clear),
|
|
||||||
* then the GMEM path will at least not cost more memory bandwidth than
|
|
||||||
* sysmem[1]
|
|
||||||
*
|
|
||||||
* (2) If there is no clear, GMEM could potentially cost *more* bandwidth
|
|
||||||
* if there is sysmem->GMEM restore pass.
|
|
||||||
*
|
|
||||||
* (3) If you see a high draw count, that is an indication that there will be
|
|
||||||
* enough pixels accessed multiple times to benefit from the reduced
|
|
||||||
* memory bandwidth that GMEM brings
|
|
||||||
*
|
|
||||||
* (4) But high draw count where there is not much overdraw can actually be
|
|
||||||
* faster in bypass mode if it is pushing a lot of state change, due to
|
|
||||||
* not having to go thru the state changes per-tile[1]
|
|
||||||
*
|
|
||||||
* The approach taken is to measure the samples-passed for the batch to estimate
|
|
||||||
* the amount of overdraw to detect cases where the number of pixels touched is
|
|
||||||
* low.
|
|
||||||
*
|
|
||||||
* [1] ignoring early-tile-exit optimizations, but any draw that touches all/
|
|
||||||
* most of the tiles late in the tile-pass can defeat that
|
|
||||||
*/
|
*/
|
||||||
struct tu_autotune {
|
struct tu_autotune {
|
||||||
|
private:
|
||||||
/* We may have to disable autotuner if there are too many
|
bool enabled = true;
|
||||||
* renderpasses in-flight.
|
|
||||||
*/
|
|
||||||
bool enabled;
|
|
||||||
|
|
||||||
struct tu_device *device;
|
struct tu_device *device;
|
||||||
|
|
||||||
/**
|
/** Configuration **/
|
||||||
* Cache to map renderpass key to historical information about
|
|
||||||
* rendering to that particular render target.
|
|
||||||
*/
|
|
||||||
struct hash_table *ht;
|
|
||||||
struct u_rwlock ht_lock;
|
|
||||||
|
|
||||||
/**
|
enum class algorithm : uint8_t;
|
||||||
* List of per-renderpass results that we are waiting for the GPU
|
enum class mod_flag : uint8_t;
|
||||||
* to finish with before reading back the results.
|
enum class metric_flag : uint8_t;
|
||||||
*/
|
/* Container for all autotune configuration options. */
|
||||||
struct list_head pending_results;
|
struct PACKED config_t;
|
||||||
|
union PACKED packed_config_t;
|
||||||
|
|
||||||
/**
|
/* Allows for thread-safe access to the configurations. */
|
||||||
* List of per-submission data that we may want to free after we
|
struct atomic_config_t {
|
||||||
* processed submission results.
|
private:
|
||||||
* This could happend after command buffers which were in the submission
|
std::atomic<uint32_t> config_bits = 0;
|
||||||
* are destroyed.
|
|
||||||
*/
|
|
||||||
struct list_head pending_submission_data;
|
|
||||||
|
|
||||||
/**
|
public:
|
||||||
* List of per-submission data that has been finished and can be reused.
|
atomic_config_t(config_t initial_config);
|
||||||
*/
|
|
||||||
struct list_head submission_data_pool;
|
|
||||||
|
|
||||||
uint32_t fence_counter;
|
config_t load() const;
|
||||||
uint32_t idx_counter;
|
|
||||||
|
bool compare_and_store(config_t expected, config_t updated);
|
||||||
|
} active_config;
|
||||||
|
|
||||||
|
config_t get_env_config();
|
||||||
|
|
||||||
|
/** Global Fence and Internal CS Management **/
|
||||||
|
|
||||||
|
/* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers.
|
||||||
|
* Synchronized by suballoc_mutex.
|
||||||
|
*/
|
||||||
|
struct tu_suballocator suballoc;
|
||||||
|
std::mutex suballoc_mutex;
|
||||||
|
|
||||||
|
/* The next value to assign to tu6_global::autotune_fence, this is incremented during on_submit. */
|
||||||
|
uint32_t next_fence = 1;
|
||||||
|
|
||||||
|
/* A wrapper around a CS which sets the global autotune fence to a certain fence value; this allows for ergonomically
|
||||||
|
* managing the lifetime of the CS including recycling it after the fence value has been reached.
|
||||||
|
*/
|
||||||
|
struct submission_entry {
|
||||||
|
private:
|
||||||
|
uint32_t fence;
|
||||||
|
struct tu_cs fence_cs;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit submission_entry(tu_device *device);
|
||||||
|
|
||||||
|
~submission_entry();
|
||||||
|
|
||||||
|
/* Disable move/copy, since this holds stable pointers to the fence_cs. */
|
||||||
|
submission_entry(const submission_entry &) = delete;
|
||||||
|
submission_entry &operator=(const submission_entry &) = delete;
|
||||||
|
submission_entry(submission_entry &&) = delete;
|
||||||
|
submission_entry &operator=(submission_entry &&) = delete;
|
||||||
|
|
||||||
|
/* The current state of the submission entry, this is used to track whether the CS is available for reuse, pending
|
||||||
|
* GPU completion or currently being processed.
|
||||||
|
*/
|
||||||
|
bool is_active() const;
|
||||||
|
|
||||||
|
/* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
|
||||||
|
struct tu_cs *try_get_cs(uint32_t new_fence);
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Unified pool for submission CSes.
|
||||||
|
* Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
|
||||||
|
*/
|
||||||
|
std::deque<submission_entry> submission_entries;
|
||||||
|
|
||||||
|
/* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
|
||||||
|
struct tu_cs *get_cs_for_fence(uint32_t fence);
|
||||||
|
|
||||||
|
/** RP Entry Management **/
|
||||||
|
|
||||||
|
struct rp_gpu_data;
|
||||||
|
struct tile_gpu_data;
|
||||||
|
struct rp_entry;
|
||||||
|
|
||||||
|
/* A wrapper over all entries associated with a single command buffer. */
|
||||||
|
struct rp_entry_batch {
|
||||||
|
bool active; /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
|
||||||
|
valid fence. */
|
||||||
|
uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
|
||||||
|
determine when the entries can be processed. */
|
||||||
|
std::vector<std::unique_ptr<rp_entry>> entries;
|
||||||
|
|
||||||
|
rp_entry_batch();
|
||||||
|
|
||||||
|
/* Disable the copy/move to avoid performance hazards. */
|
||||||
|
rp_entry_batch(const rp_entry_batch &) = delete;
|
||||||
|
rp_entry_batch &operator=(const rp_entry_batch &) = delete;
|
||||||
|
rp_entry_batch(rp_entry_batch &&) = delete;
|
||||||
|
rp_entry_batch &operator=(rp_entry_batch &&) = delete;
|
||||||
|
|
||||||
|
void assign_fence(uint32_t new_fence);
|
||||||
|
|
||||||
|
void mark_inactive();
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
|
||||||
|
* iteration and to ensure that we process the entries in the same order they were submitted.
|
||||||
|
*/
|
||||||
|
std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
|
||||||
|
|
||||||
|
/* Handles processing of entry batches that are pending to be processed.
|
||||||
|
*
|
||||||
|
* Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
|
||||||
|
* in the on_submit() method, which is called on every submit of a command buffer.
|
||||||
|
*/
|
||||||
|
void process_entries();
|
||||||
|
|
||||||
|
/** Renderpass State Tracking **/
|
||||||
|
|
||||||
|
struct rp_history;
|
||||||
|
struct rp_history_handle;
|
||||||
|
|
||||||
|
/* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
|
||||||
|
* be stable across runs, so it can be used to identify the same renderpass instance consistently.
|
||||||
|
*
|
||||||
|
* Note: We can potentially include the vector of data we extract from the parameters to generate the hash into
|
||||||
|
* rp_key, which would lead to true value-based equality rather than just hash-based equality which has a cost
|
||||||
|
* but avoids hash collisions causing issues.
|
||||||
|
*/
|
||||||
|
struct rp_key {
|
||||||
|
uint64_t hash;
|
||||||
|
|
||||||
|
rp_key(const struct tu_render_pass *pass,
|
||||||
|
const struct tu_framebuffer *framebuffer,
|
||||||
|
const struct tu_cmd_buffer *cmd);
|
||||||
|
|
||||||
|
/* Equality operator, used in unordered_map. */
|
||||||
|
constexpr bool operator==(const rp_key &other) const noexcept
|
||||||
|
{
|
||||||
|
return hash == other.hash;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
|
||||||
|
*
|
||||||
|
* Note: This should *NEVER* be used to calculate the hash itself as it would lead to the hash being calculated
|
||||||
|
* multiple times, rather than being calculated once and reused when there's multiple successive lookups like
|
||||||
|
* with find_or_create_rp_history() and providing the hash to the rp_history constructor.
|
||||||
|
*/
|
||||||
|
struct rp_hash {
|
||||||
|
constexpr size_t operator()(const rp_key &key) const noexcept
|
||||||
|
{
|
||||||
|
/* Note: This will throw away the upper 32-bits on 32-bit architectures. */
|
||||||
|
return static_cast<size_t>(key.hash);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
|
||||||
|
using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
|
||||||
|
rp_histories_t rp_histories;
|
||||||
|
std::shared_mutex rp_mutex;
|
||||||
|
uint64_t last_reap_ts = 0;
|
||||||
|
|
||||||
|
/* Note: These lock rp_mutex internally, so callers do not need to lock it. */
|
||||||
|
rp_history_handle find_rp_history(const rp_key &key);
|
||||||
|
rp_history_handle find_or_create_rp_history(const rp_key &key);
|
||||||
|
void reap_old_rp_histories();
|
||||||
|
|
||||||
|
public:
|
||||||
|
tu_autotune(struct tu_device *device, VkResult &result);
|
||||||
|
|
||||||
|
~tu_autotune();
|
||||||
|
|
||||||
|
/* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */
|
||||||
|
using rp_ctx_t = rp_entry *;
|
||||||
|
|
||||||
|
/* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
|
||||||
|
*
|
||||||
|
* Note: tu_cmd_buffer is only responsible for the lifetime of this object, all the access to the context state is
|
||||||
|
* done through tu_autotune.
|
||||||
|
*/
|
||||||
|
struct cmd_buf_ctx {
|
||||||
|
private:
|
||||||
|
/* A batch of all entries from RPs within this CB. */
|
||||||
|
std::shared_ptr<rp_entry_batch> batch;
|
||||||
|
|
||||||
|
/* Creates a new RP entry attached to this CB. */
|
||||||
|
rp_entry *
|
||||||
|
attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
|
||||||
|
|
||||||
|
friend struct tu_autotune;
|
||||||
|
|
||||||
|
public:
|
||||||
|
cmd_buf_ctx();
|
||||||
|
~cmd_buf_ctx();
|
||||||
|
|
||||||
|
/* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */
|
||||||
|
void reset();
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class render_mode {
|
||||||
|
SYSMEM,
|
||||||
|
GMEM,
|
||||||
|
};
|
||||||
|
|
||||||
|
render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);
|
||||||
|
|
||||||
|
void begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem);
|
||||||
|
|
||||||
|
void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);
|
||||||
|
|
||||||
|
/* The submit-time hook for autotuner, this may return a CS (can be NULL) which must be amended for autotuner
|
||||||
|
* tracking to function correctly.
|
||||||
|
*
|
||||||
|
* Note: This must be called from a single-threaded context. There should never be multiple threads calling this
|
||||||
|
* function at the same time.
|
||||||
|
*/
|
||||||
|
struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
#endif /* TU_AUTOTUNE_H */
|
||||||
* From the cmdstream, the captured samples-passed values are recorded
|
|
||||||
* at the start and end of the batch.
|
|
||||||
*
|
|
||||||
* Note that we do the math on the CPU to avoid a WFI. But pre-emption
|
|
||||||
* may force us to revisit that.
|
|
||||||
*/
|
|
||||||
struct PACKED tu_renderpass_samples {
|
|
||||||
uint64_t samples_start;
|
|
||||||
/* hw requires the sample start/stop locations to be 128b aligned. */
|
|
||||||
uint64_t __pad0;
|
|
||||||
uint64_t samples_end;
|
|
||||||
uint64_t __pad1;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
|
|
||||||
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tracks the results from an individual renderpass. Initially created
|
|
||||||
* per renderpass, and appended to the tail of at->pending_results. At a later
|
|
||||||
* time, when the GPU has finished writing the results, we fill samples_passed.
|
|
||||||
*/
|
|
||||||
struct tu_renderpass_result {
|
|
||||||
/* Points into GPU memory */
|
|
||||||
struct tu_renderpass_samples* samples;
|
|
||||||
|
|
||||||
struct tu_suballoc_bo bo;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Below here, only used internally within autotune
|
|
||||||
*/
|
|
||||||
uint64_t rp_key;
|
|
||||||
struct tu_renderpass_history *history;
|
|
||||||
struct list_head node;
|
|
||||||
uint32_t fence;
|
|
||||||
uint64_t samples_passed;
|
|
||||||
};
|
|
||||||
|
|
||||||
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
|
|
||||||
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
|
|
||||||
|
|
||||||
bool tu_autotune_use_bypass(struct tu_autotune *at,
|
|
||||||
struct tu_cmd_buffer *cmd_buffer,
|
|
||||||
struct tu_renderpass_result **autotune_result);
|
|
||||||
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
|
|
||||||
|
|
||||||
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
|
|
||||||
uint32_t cmd_buffer_count);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A magic 8-ball that tells the gmem code whether we should do bypass mode
|
|
||||||
* for moar fps.
|
|
||||||
*/
|
|
||||||
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
|
|
||||||
struct tu_autotune *at,
|
|
||||||
struct tu_cmd_buffer **cmd_buffers,
|
|
||||||
uint32_t cmd_buffer_count);
|
|
||||||
|
|
||||||
struct tu_autotune_results_buffer;
|
|
||||||
|
|
||||||
template <chip CHIP>
|
|
||||||
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
|
|
||||||
struct tu_cs *cs,
|
|
||||||
struct tu_renderpass_result *autotune_result);
|
|
||||||
|
|
||||||
template <chip CHIP>
|
|
||||||
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
|
|
||||||
struct tu_cs *cs,
|
|
||||||
struct tu_renderpass_result *autotune_result);
|
|
||||||
|
|
||||||
#endif /* TU_AUTOTUNE_H */
|
|
||||||
|
|
@ -14,6 +14,7 @@
|
||||||
#include "vk_render_pass.h"
|
#include "vk_render_pass.h"
|
||||||
#include "vk_util.h"
|
#include "vk_util.h"
|
||||||
|
|
||||||
|
#include "tu_autotune.h"
|
||||||
#include "tu_buffer.h"
|
#include "tu_buffer.h"
|
||||||
#include "tu_clear_blit.h"
|
#include "tu_clear_blit.h"
|
||||||
#include "tu_cs.h"
|
#include "tu_cs.h"
|
||||||
|
|
@ -1314,7 +1315,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
|
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
|
||||||
struct tu_renderpass_result **autotune_result)
|
tu_autotune::rp_ctx_t *rp_ctx)
|
||||||
{
|
{
|
||||||
if (TU_DEBUG(SYSMEM)) {
|
if (TU_DEBUG(SYSMEM)) {
|
||||||
cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
|
cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
|
||||||
|
|
@ -1375,15 +1376,9 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
|
||||||
if (TU_DEBUG(GMEM))
|
if (TU_DEBUG(GMEM))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
|
bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
|
||||||
cmd, autotune_result);
|
if (use_sysmem)
|
||||||
if (*autotune_result) {
|
|
||||||
list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (use_sysmem) {
|
|
||||||
cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
|
cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
|
||||||
}
|
|
||||||
|
|
||||||
return use_sysmem;
|
return use_sysmem;
|
||||||
}
|
}
|
||||||
|
|
@ -3128,7 +3123,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
|
||||||
template <chip CHIP>
|
template <chip CHIP>
|
||||||
static void
|
static void
|
||||||
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
struct tu_renderpass_result *autotune_result)
|
tu_autotune::rp_ctx_t rp_ctx)
|
||||||
{
|
{
|
||||||
const struct tu_framebuffer *fb = cmd->state.framebuffer;
|
const struct tu_framebuffer *fb = cmd->state.framebuffer;
|
||||||
|
|
||||||
|
|
@ -3181,7 +3176,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
|
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
|
||||||
}
|
}
|
||||||
|
|
||||||
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
|
cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true);
|
||||||
|
|
||||||
tu_cs_sanity_check(cs);
|
tu_cs_sanity_check(cs);
|
||||||
}
|
}
|
||||||
|
|
@ -3189,7 +3184,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
template <chip CHIP>
|
template <chip CHIP>
|
||||||
static void
|
static void
|
||||||
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
struct tu_renderpass_result *autotune_result)
|
tu_autotune::rp_ctx_t rp_ctx)
|
||||||
{
|
{
|
||||||
/* Do any resolves of the last subpass. These are handled in the
|
/* Do any resolves of the last subpass. These are handled in the
|
||||||
* tile_store_cs in the gmem path.
|
* tile_store_cs in the gmem path.
|
||||||
|
|
@ -3229,7 +3224,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
tu_cs_emit(cs, 0); /* value */
|
tu_cs_emit(cs, 0); /* value */
|
||||||
}
|
}
|
||||||
|
|
||||||
tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
|
cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
|
||||||
|
|
||||||
tu_cs_sanity_check(cs);
|
tu_cs_sanity_check(cs);
|
||||||
}
|
}
|
||||||
|
|
@ -3379,7 +3374,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
template <chip CHIP>
|
template <chip CHIP>
|
||||||
static void
|
static void
|
||||||
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
struct tu_renderpass_result *autotune_result,
|
tu_autotune::rp_ctx_t rp_ctx,
|
||||||
const VkOffset2D *fdm_offsets)
|
const VkOffset2D *fdm_offsets)
|
||||||
{
|
{
|
||||||
struct tu_physical_device *phys_dev = cmd->device->physical_device;
|
struct tu_physical_device *phys_dev = cmd->device->physical_device;
|
||||||
|
|
@ -3565,7 +3560,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
if (use_cb)
|
if (use_cb)
|
||||||
tu_trace_start_render_pass(cmd);
|
tu_trace_start_render_pass(cmd);
|
||||||
|
|
||||||
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
|
cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false);
|
||||||
|
|
||||||
tu_cs_sanity_check(cs);
|
tu_cs_sanity_check(cs);
|
||||||
}
|
}
|
||||||
|
|
@ -3628,7 +3623,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
template <chip CHIP>
|
template <chip CHIP>
|
||||||
static void
|
static void
|
||||||
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
struct tu_renderpass_result *autotune_result)
|
tu_autotune::rp_ctx_t rp_ctx)
|
||||||
{
|
{
|
||||||
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
|
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
|
||||||
|
|
||||||
|
|
@ -3658,7 +3653,7 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
||||||
|
|
||||||
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
|
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
|
||||||
|
|
||||||
tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
|
cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
|
||||||
|
|
||||||
tu_cs_sanity_check(cs);
|
tu_cs_sanity_check(cs);
|
||||||
}
|
}
|
||||||
|
|
@ -3767,7 +3762,7 @@ tu_emit_subsampled(struct tu_cmd_buffer *cmd,
|
||||||
template <chip CHIP>
|
template <chip CHIP>
|
||||||
static void
|
static void
|
||||||
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
||||||
struct tu_renderpass_result *autotune_result,
|
tu_autotune::rp_ctx_t rp_ctx,
|
||||||
const VkOffset2D *fdm_offsets)
|
const VkOffset2D *fdm_offsets)
|
||||||
{
|
{
|
||||||
const struct tu_tiling_config *tiling = cmd->state.tiling;
|
const struct tu_tiling_config *tiling = cmd->state.tiling;
|
||||||
|
|
@ -3808,7 +3803,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
||||||
tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
|
tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
|
||||||
tu_cs_end(&cmd->tile_store_cs);
|
tu_cs_end(&cmd->tile_store_cs);
|
||||||
|
|
||||||
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
|
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);
|
||||||
|
|
||||||
/* Note: we reverse the order of walking the pipes and tiles on every
|
/* Note: we reverse the order of walking the pipes and tiles on every
|
||||||
* other row, to improve texture cache locality compared to raster order.
|
* other row, to improve texture cache locality compared to raster order.
|
||||||
|
|
@ -3861,7 +3856,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
|
tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
|
||||||
|
|
||||||
/* Outside of renderpasses we assume all draw states are disabled. We do
|
/* Outside of renderpasses we assume all draw states are disabled. We do
|
||||||
* this outside the draw CS for the normal case where 3d gmem stores aren't
|
* this outside the draw CS for the normal case where 3d gmem stores aren't
|
||||||
|
|
@ -3894,7 +3889,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
||||||
template <chip CHIP>
|
template <chip CHIP>
|
||||||
static void
|
static void
|
||||||
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
|
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
|
||||||
struct tu_renderpass_result *autotune_result)
|
tu_autotune::rp_ctx_t rp_ctx)
|
||||||
{
|
{
|
||||||
VkResult result = tu_allocate_transient_attachments(cmd, true);
|
VkResult result = tu_allocate_transient_attachments(cmd, true);
|
||||||
|
|
||||||
|
|
@ -3905,7 +3900,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
|
||||||
|
|
||||||
tu_trace_start_render_pass(cmd);
|
tu_trace_start_render_pass(cmd);
|
||||||
|
|
||||||
tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
|
tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);
|
||||||
|
|
||||||
trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);
|
trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);
|
||||||
|
|
||||||
|
|
@ -3913,7 +3908,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
|
||||||
|
|
||||||
trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
|
trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
|
||||||
|
|
||||||
tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
|
tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
|
||||||
|
|
||||||
/* Outside of renderpasses we assume all draw states are disabled. */
|
/* Outside of renderpasses we assume all draw states are disabled. */
|
||||||
tu_disable_draw_states(cmd, &cmd->cs);
|
tu_disable_draw_states(cmd, &cmd->cs);
|
||||||
|
|
@ -3933,11 +3928,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
|
||||||
if (cmd_buffer->state.rp.has_tess)
|
if (cmd_buffer->state.rp.has_tess)
|
||||||
tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
|
tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
|
||||||
|
|
||||||
struct tu_renderpass_result *autotune_result = NULL;
|
tu_autotune::rp_ctx_t rp_ctx = NULL;
|
||||||
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
|
if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
|
||||||
tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
|
tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
|
||||||
else
|
else
|
||||||
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
|
tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
|
static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
|
||||||
|
|
@ -4003,7 +3998,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
|
||||||
u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
|
u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
|
||||||
cmd_buffer->trace_renderpass_start =
|
cmd_buffer->trace_renderpass_start =
|
||||||
u_trace_begin_iterator(&cmd_buffer->rp_trace);
|
u_trace_begin_iterator(&cmd_buffer->rp_trace);
|
||||||
list_inithead(&cmd_buffer->renderpass_autotune_results);
|
new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();
|
||||||
|
|
||||||
if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
|
if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
|
||||||
cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);
|
cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);
|
||||||
|
|
@ -4052,7 +4047,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
|
||||||
u_trace_fini(&cmd_buffer->trace);
|
u_trace_fini(&cmd_buffer->trace);
|
||||||
u_trace_fini(&cmd_buffer->rp_trace);
|
u_trace_fini(&cmd_buffer->rp_trace);
|
||||||
|
|
||||||
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
|
cmd_buffer->autotune_ctx.~cmd_buf_ctx();
|
||||||
|
|
||||||
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
|
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
|
||||||
if (cmd_buffer->descriptors[i].push_set.layout)
|
if (cmd_buffer->descriptors[i].push_set.layout)
|
||||||
|
|
@ -4129,7 +4124,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
|
||||||
tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
|
tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
|
||||||
tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
|
tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
|
||||||
|
|
||||||
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
|
cmd_buffer->autotune_ctx.reset();
|
||||||
|
|
||||||
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
|
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
|
||||||
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
|
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
|
||||||
|
|
|
||||||
|
|
@ -653,8 +653,7 @@ struct tu_cmd_buffer
|
||||||
struct u_trace_iterator trace_renderpass_start;
|
struct u_trace_iterator trace_renderpass_start;
|
||||||
struct u_trace trace, rp_trace;
|
struct u_trace trace, rp_trace;
|
||||||
|
|
||||||
struct list_head renderpass_autotune_results;
|
tu_autotune::cmd_buf_ctx autotune_ctx;
|
||||||
struct tu_autotune_results_buffer* autotune_buffer;
|
|
||||||
|
|
||||||
void *patchpoints_ctx;
|
void *patchpoints_ctx;
|
||||||
struct util_dynarray fdm_bin_patchpoints;
|
struct util_dynarray fdm_bin_patchpoints;
|
||||||
|
|
|
||||||
|
|
@ -2692,7 +2692,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
|
||||||
{
|
{
|
||||||
mtx_destroy(&device->bo_mutex);
|
mtx_destroy(&device->bo_mutex);
|
||||||
mtx_destroy(&device->pipeline_mutex);
|
mtx_destroy(&device->pipeline_mutex);
|
||||||
mtx_destroy(&device->autotune_mutex);
|
|
||||||
mtx_destroy(&device->kgsl_profiling_mutex);
|
mtx_destroy(&device->kgsl_profiling_mutex);
|
||||||
mtx_destroy(&device->event_mutex);
|
mtx_destroy(&device->event_mutex);
|
||||||
mtx_destroy(&device->trace_mutex);
|
mtx_destroy(&device->trace_mutex);
|
||||||
|
|
@ -2808,7 +2807,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
||||||
|
|
||||||
mtx_init(&device->bo_mutex, mtx_plain);
|
mtx_init(&device->bo_mutex, mtx_plain);
|
||||||
mtx_init(&device->pipeline_mutex, mtx_plain);
|
mtx_init(&device->pipeline_mutex, mtx_plain);
|
||||||
mtx_init(&device->autotune_mutex, mtx_plain);
|
|
||||||
mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
|
mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
|
||||||
mtx_init(&device->event_mutex, mtx_plain);
|
mtx_init(&device->event_mutex, mtx_plain);
|
||||||
mtx_init(&device->trace_mutex, mtx_plain);
|
mtx_init(&device->trace_mutex, mtx_plain);
|
||||||
|
|
@ -2933,9 +2931,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
||||||
TU_BO_ALLOC_ALLOW_DUMP |
|
TU_BO_ALLOC_ALLOW_DUMP |
|
||||||
TU_BO_ALLOC_INTERNAL_RESOURCE),
|
TU_BO_ALLOC_INTERNAL_RESOURCE),
|
||||||
"pipeline_suballoc");
|
"pipeline_suballoc");
|
||||||
tu_bo_suballocator_init(&device->autotune_suballoc, device,
|
|
||||||
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
|
|
||||||
"autotune_suballoc");
|
|
||||||
if (is_kgsl(physical_device->instance)) {
|
if (is_kgsl(physical_device->instance)) {
|
||||||
tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
|
tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
|
||||||
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
|
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
|
||||||
|
|
@ -3083,10 +3078,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
||||||
}
|
}
|
||||||
pthread_condattr_destroy(&condattr);
|
pthread_condattr_destroy(&condattr);
|
||||||
|
|
||||||
result = tu_autotune_init(&device->autotune, device);
|
device->autotune = new tu_autotune(device, result);
|
||||||
if (result != VK_SUCCESS) {
|
if (result != VK_SUCCESS)
|
||||||
goto fail_timeline_cond;
|
goto fail_timeline_cond;
|
||||||
}
|
|
||||||
|
|
||||||
device->use_z24uint_s8uint =
|
device->use_z24uint_s8uint =
|
||||||
physical_device->info->props.has_z24uint_s8uint &&
|
physical_device->info->props.has_z24uint_s8uint &&
|
||||||
|
|
@ -3244,10 +3238,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
|
||||||
free(device->dbg_renderpass_stomp_cs);
|
free(device->dbg_renderpass_stomp_cs);
|
||||||
}
|
}
|
||||||
|
|
||||||
tu_autotune_fini(&device->autotune, device);
|
delete device->autotune;
|
||||||
|
|
||||||
tu_bo_suballocator_finish(&device->pipeline_suballoc);
|
tu_bo_suballocator_finish(&device->pipeline_suballoc);
|
||||||
tu_bo_suballocator_finish(&device->autotune_suballoc);
|
|
||||||
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
|
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
|
||||||
tu_bo_suballocator_finish(&device->event_suballoc);
|
tu_bo_suballocator_finish(&device->event_suballoc);
|
||||||
tu_bo_suballocator_finish(&device->vis_stream_suballocator);
|
tu_bo_suballocator_finish(&device->vis_stream_suballocator);
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@
|
||||||
#include "common/fd6_gmem_cache.h"
|
#include "common/fd6_gmem_cache.h"
|
||||||
#include "util/vma.h"
|
#include "util/vma.h"
|
||||||
#include "util/u_vector.h"
|
#include "util/u_vector.h"
|
||||||
|
#include "util/rwlock.h"
|
||||||
|
|
||||||
/* queue types */
|
/* queue types */
|
||||||
#define TU_QUEUE_GENERAL 0
|
#define TU_QUEUE_GENERAL 0
|
||||||
|
|
@ -267,7 +268,12 @@ struct tu6_global
|
||||||
|
|
||||||
volatile uint32_t vtx_stats_query_not_running;
|
volatile uint32_t vtx_stats_query_not_running;
|
||||||
|
|
||||||
/* To know when renderpass stats for autotune are valid */
|
/* A fence with a monotonically increasing value that is
|
||||||
|
* incremented by the GPU on each submission that includes
|
||||||
|
* a tu_autotune::submission_entry CS. This is used to track
|
||||||
|
* which submissions have been processed by the GPU before
|
||||||
|
* processing the autotune packet on the CPU.
|
||||||
|
*/
|
||||||
volatile uint32_t autotune_fence;
|
volatile uint32_t autotune_fence;
|
||||||
|
|
||||||
/* For recycling command buffers for dynamic suspend/resume commands */
|
/* For recycling command buffers for dynamic suspend/resume commands */
|
||||||
|
|
@ -357,12 +363,6 @@ struct tu_device
|
||||||
struct tu_suballocator pipeline_suballoc;
|
struct tu_suballocator pipeline_suballoc;
|
||||||
mtx_t pipeline_mutex;
|
mtx_t pipeline_mutex;
|
||||||
|
|
||||||
/* Device-global BO suballocator for reducing BO management for small
|
|
||||||
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
|
|
||||||
*/
|
|
||||||
struct tu_suballocator autotune_suballoc;
|
|
||||||
mtx_t autotune_mutex;
|
|
||||||
|
|
||||||
/* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
|
/* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
|
||||||
* each submission.
|
* each submission.
|
||||||
*/
|
*/
|
||||||
|
|
@ -460,7 +460,7 @@ struct tu_device
|
||||||
pthread_cond_t timeline_cond;
|
pthread_cond_t timeline_cond;
|
||||||
pthread_mutex_t submit_mutex;
|
pthread_mutex_t submit_mutex;
|
||||||
|
|
||||||
struct tu_autotune autotune;
|
struct tu_autotune *autotune;
|
||||||
|
|
||||||
struct breadcrumbs_context *breadcrumbs_ctx;
|
struct breadcrumbs_context *breadcrumbs_ctx;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -549,27 +549,6 @@ tu_render_pass_disable_fdm(struct tu_device *dev, struct tu_render_pass *pass)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
tu_render_pass_calc_hash(struct tu_render_pass *pass)
|
|
||||||
{
|
|
||||||
#define HASH(hash, data) XXH64(&(data), sizeof(data), hash)
|
|
||||||
|
|
||||||
uint64_t hash = HASH(0, pass->attachment_count);
|
|
||||||
hash = XXH64(pass->attachments,
|
|
||||||
pass->attachment_count * sizeof(pass->attachments[0]), hash);
|
|
||||||
hash = HASH(hash, pass->subpass_count);
|
|
||||||
for (unsigned i = 0; i < pass->subpass_count; i++) {
|
|
||||||
hash = HASH(hash, pass->subpasses[i].samples);
|
|
||||||
hash = HASH(hash, pass->subpasses[i].input_count);
|
|
||||||
hash = HASH(hash, pass->subpasses[i].color_count);
|
|
||||||
hash = HASH(hash, pass->subpasses[i].resolve_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
pass->autotune_hash = hash;
|
|
||||||
|
|
||||||
#undef HASH
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
tu_render_pass_cond_config(struct tu_device *device,
|
tu_render_pass_cond_config(struct tu_device *device,
|
||||||
struct tu_render_pass *pass)
|
struct tu_render_pass *pass)
|
||||||
|
|
@ -1354,7 +1333,6 @@ tu_CreateRenderPass2(VkDevice _device,
|
||||||
tu_render_pass_gmem_config(pass, device->physical_device);
|
tu_render_pass_gmem_config(pass, device->physical_device);
|
||||||
tu_render_pass_bandwidth_config(pass);
|
tu_render_pass_bandwidth_config(pass);
|
||||||
tu_render_pass_calc_views(pass);
|
tu_render_pass_calc_views(pass);
|
||||||
tu_render_pass_calc_hash(pass);
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
|
for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
|
||||||
tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
|
tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
|
||||||
|
|
@ -1834,7 +1812,6 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
|
||||||
tu_render_pass_gmem_config(pass, device->physical_device);
|
tu_render_pass_gmem_config(pass, device->physical_device);
|
||||||
tu_render_pass_bandwidth_config(pass);
|
tu_render_pass_bandwidth_config(pass);
|
||||||
tu_render_pass_calc_views(pass);
|
tu_render_pass_calc_views(pass);
|
||||||
tu_render_pass_calc_hash(pass);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
||||||
|
|
@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
|
||||||
struct tu_device *device = queue->device;
|
struct tu_device *device = queue->device;
|
||||||
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
|
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
|
||||||
struct util_dynarray dump_cmds;
|
struct util_dynarray dump_cmds;
|
||||||
|
struct tu_cs *autotune_cs = NULL;
|
||||||
|
|
||||||
if (vk_submit->buffer_bind_count ||
|
if (vk_submit->buffer_bind_count ||
|
||||||
vk_submit->image_bind_count ||
|
vk_submit->image_bind_count ||
|
||||||
|
|
@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
|
autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
|
||||||
struct tu_cs *autotune_cs = tu_autotune_on_submit(
|
if (autotune_cs) {
|
||||||
device, &device->autotune, cmd_buffers, cmdbuf_count);
|
|
||||||
submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
|
submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
|
||||||
autotune_cs->entry_count);
|
autotune_cs->entry_count);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue