Mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2025-12-20 07:20:10 +01:00)

Merge branch 'tu-newat' into 'main'

turnip: Autotuner Overhaul

See merge request mesa/mesa!37802

Commit adbb7f760f, 17 changed files with 2309 additions and 848 deletions
@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
Some of these options will behave differently when toggled at runtime, for example:
``nolrz`` will still result in LRZ allocation which would not happen if the option
was set in the environment variable.

Autotune
^^^^^^^^

Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
autotune system, the behavior of which can be controlled with the following
environment variables:

.. envvar:: TU_AUTOTUNE_ALGO

   Selects the algorithm used for autotuning. Supported values are:

   ``bandwidth``
      Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
      the one with the lower estimated bandwidth.

   ``profiled``
      Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that to
      move a probability distribution towards the optimal choice over time. This
      algorithm tends to be far more accurate than the bandwidth algorithm at choosing
      the optimal rendering mode, but may result in larger FPS variance because it is
      based on a probability distribution with random sampling. This is the default
      algorithm.

   ``profiled_imm``
      Similar to ``profiled``, but only profiles the first few instances of an RP
      and then sticks to the chosen mode for subsequent instances. This is meant
      for single-frame traces run multiple times in CI, where this algorithm can
      immediately choose the optimal rendering mode for each RP.

   ``prefer_sysmem``
      Always chooses SYSMEM rendering. This is useful for games that don't benefit
      from GMEM rendering due to their rendering patterns; when done for performance
      reasons, setting this is better than using ``TU_DEBUG=sysmem``.

   The algorithm can be set via the driconf option ``tu_autotune_algorithm`` as well.
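To make the ``profiled`` algorithm above more concrete, here is a minimal, self-contained C++ sketch of the general idea. The names and smoothing constants are illustrative assumptions, not turnip's actual implementation:

   #include <random>

   /* Simplified model of a "profiled"-style chooser: keep smoothed timings for
    * both modes and pick GMEM with a probability derived from how much faster
    * it has been so far. */
   struct profiled_chooser {
      double sysmem_ns = 1.0, gmem_ns = 1.0; /* exponentially smoothed timings */
      std::mt19937 rng{12345};

      bool choose_gmem() {
         /* The probability of picking GMEM grows as its smoothed time shrinks
          * relative to SYSMEM's. */
         double p = sysmem_ns / (sysmem_ns + gmem_ns);
         return std::bernoulli_distribution(p)(rng);
      }

      void report(bool was_gmem, double measured_ns) {
         double &t = was_gmem ? gmem_ns : sysmem_ns;
         t = 0.9 * t + 0.1 * measured_ns; /* nudge the estimate toward the new sample */
      }
   };

Because each decision is sampled rather than fixed, frame times can vary more than with a purely static estimate, which matches the FPS-variance caveat above.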
.. envvar:: TU_AUTOTUNE_FLAGS

   Modifies the behavior of the selected algorithm. Supported flags are:

   ``big_gmem``
      Always chooses GMEM rendering if the number of draw calls in the render pass
      is greater than a certain threshold. Larger RPs generally benefit more from
      GMEM rendering due to less overhead from tiling.

   ``small_sysmem``
      Always chooses SYSMEM rendering if the number of draw calls in the render pass
      is lower than a certain threshold. The benefits of GMEM rendering are less
      pronounced in these smaller RPs, and SYSMEM rendering tends to win more often.

   ``preempt_optimize``
      Tries to keep the non-preemptible time in the render pass below a certain
      threshold. This is useful for systems with GPU-based compositors, where long
      non-preemptible times can lead to missed frame deadlines, causing noticeable
      stuttering. This flag reduces the performance of the render pass in order
      to improve overall system responsiveness; it should not be used unless the
      rest of the system is affected by preemption delays.

   Multiple flags can be combined by separating them with commas, e.g.
   ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.

   If no flags are specified, the default behavior is used.
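As an illustration of how a comma-separated flag list like the one above can be consumed, here is a standalone C++ sketch. The flag constants and the helper are hypothetical; the driver's real option handling (environment variable plus driconf) may differ:

   #include <cstdint>
   #include <string>

   enum autotune_flag : uint32_t {
      FLAG_BIG_GMEM         = 1u << 0,
      FLAG_SMALL_SYSMEM     = 1u << 1,
      FLAG_PREEMPT_OPTIMIZE = 1u << 2,
   };

   /* Turn "big_gmem,small_sysmem" into FLAG_BIG_GMEM | FLAG_SMALL_SYSMEM. */
   uint32_t parse_autotune_flags(const char *env)
   {
      uint32_t flags = 0;
      if (!env)
         return flags;
      std::string s(env);
      size_t start = 0;
      while (start <= s.size()) {
         size_t end = s.find(',', start);
         if (end == std::string::npos)
            end = s.size();
         std::string tok = s.substr(start, end - start);
         if (tok == "big_gmem")
            flags |= FLAG_BIG_GMEM;
         else if (tok == "small_sysmem")
            flags |= FLAG_SMALL_SYSMEM;
         else if (tok == "preempt_optimize")
            flags |= FLAG_PREEMPT_OPTIMIZE;
         start = end + 1;
      }
      return flags;
   }

Unknown tokens are silently ignored in this sketch; a real implementation would likely warn about them.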
@ -4,7 +4,7 @@ DisableFormat: false
AlwaysBreakAfterReturnType: TopLevel
BinPackParameters: false
-ColumnLimit: 78
+ColumnLimit: 120
Cpp11BracedListStyle: false

IncludeBlocks: Regroup
(File diff suppressed because it is too large)
@ -8,150 +8,265 @@

#include "tu_common.h"

-#include "util/hash_table.h"
-#include "util/rwlock.h"
+#include <atomic>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "tu_cs.h"
#include "tu_suballoc.h"

-struct tu_renderpass_history;
+/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */
+#define TU_AUTOTUNE_DEBUG_PERFCTR 0

-/**
- * "autotune" our decisions about bypass vs GMEM rendering, based on historical
- * data about a given render target.
- *
- * In deciding which path to take there are tradeoffs, including some that
- * are not reasonably estimateable without having some additional information:
- *
- * (1) If you know you are touching every pixel (ie. there is a clear),
- *     then the GMEM path will at least not cost more memory bandwidth than
- *     sysmem[1]
- *
- * (2) If there is no clear, GMEM could potentially cost *more* bandwidth
- *     if there is sysmem->GMEM restore pass.
- *
- * (3) If you see a high draw count, that is an indication that there will be
- *     enough pixels accessed multiple times to benefit from the reduced
- *     memory bandwidth that GMEM brings
- *
- * (4) But high draw count where there is not much overdraw can actually be
- *     faster in bypass mode if it is pushing a lot of state change, due to
- *     not having to go thru the state changes per-tile[1]
- *
- * The approach taken is to measure the samples-passed for the batch to estimate
- * the amount of overdraw to detect cases where the number of pixels touched is
- * low.
- *
- * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
- *     most of the tiles late in the tile-pass can defeat that
+/* Autotune allows for us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
+ * dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
+ * analysis, since we can adapt to the actual workload rather than relying on heuristics.
 */
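The comment block removed above laid out the old heuristics: a clear makes the GMEM path no worse on bandwidth, a missing clear forces a sysmem-to-GMEM restore, and samples-passed is used to estimate overdraw. As a rough, self-contained illustration of that style of bandwidth comparison (the cost terms and names are assumptions for illustration, not the driver's formula):

   #include <cstdint>

   /* Hypothetical byte-count comparison between the two rendering paths. */
   struct rt_info {
      uint64_t rt_bytes;        /* bytes touched by one full render-target write */
      uint64_t samples_passed;  /* from the GPU occlusion counters */
      uint64_t pixels;          /* render area in pixels */
      bool needs_gmem_restore;  /* no clear, so a sysmem-to-GMEM load is required */
   };

   bool prefer_gmem(const rt_info &rt)
   {
      /* Average overdraw: how many times each pixel was written. */
      double overdraw = (double)rt.samples_passed / (double)rt.pixels;

      /* SYSMEM pays full memory bandwidth for every covered sample. */
      double sysmem_bytes = overdraw * (double)rt.rt_bytes;

      /* GMEM pays one store of the render target, plus a load when it cannot
       * simply be cleared on-chip first. */
      double gmem_bytes = (double)rt.rt_bytes * (rt.needs_gmem_restore ? 2.0 : 1.0);

      return gmem_bytes < sysmem_bytes;
   }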
struct tu_autotune {
+ private:
-   /* We may have to disable autotuner if there are too many
-    * renderpasses in-flight.
-    */
-   bool enabled;
+   bool enabled = true;

   struct tu_device *device;

-   /**
-    * Cache to map renderpass key to historical information about
-    * rendering to that particular render target.
-    */
-   struct hash_table *ht;
-   struct u_rwlock ht_lock;
-
-   /**
-    * List of per-renderpass results that we are waiting for the GPU
-    * to finish with before reading back the results.
-    */
-   struct list_head pending_results;
-
-   /**
-    * List of per-submission data that we may want to free after we
-    * processed submission results.
-    * This could happend after command buffers which were in the submission
-    * are destroyed.
-    */
-   struct list_head pending_submission_data;
-
-   /**
-    * List of per-submission data that has been finished and can be reused.
-    */
-   struct list_head submission_data_pool;
-
-   uint32_t fence_counter;
-   uint32_t idx_counter;
+   /** Configuration **/
+
+   enum class algorithm : uint8_t;
+   enum class mod_flag : uint8_t;
+   enum class metric_flag : uint8_t;
+   /* Container for all autotune configuration options. */
+   struct PACKED config_t;
+   union PACKED packed_config_t;
+
+   /* Allows for thread-safe access to the configurations. */
+   struct atomic_config_t {
+    private:
+      std::atomic<uint32_t> config_bits = 0;
+
+    public:
+      atomic_config_t(config_t initial_config);
+
+      config_t load() const;
+      bool compare_and_store(config_t updated, config_t expected);
+   } active_config;
+
+   config_t get_env_config();
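atomic_config_t above suggests a configuration word that is read and republished atomically. A self-contained sketch of that pattern follows; the packed layout and the field names are assumptions, not the real config_t:

   #include <atomic>
   #include <cstdint>

   /* Hypothetical packed configuration: algorithm in the low byte, flags above it. */
   struct packed_config {
      uint32_t bits = 0;
   };

   struct atomic_config {
      std::atomic<uint32_t> bits{0};

      packed_config load() const {
         return { bits.load(std::memory_order_acquire) };
      }

      /* Publish `updated` only if the configuration is still `expected`; returns
       * false if another thread changed it first, so the caller can re-read and retry. */
      bool compare_and_store(packed_config updated, packed_config expected) {
         uint32_t exp = expected.bits;
         return bits.compare_exchange_strong(exp, updated.bits,
                                             std::memory_order_acq_rel);
      }
   };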
+
+   /** Global Fence and Internal CS Management **/
+
+   /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers.
+    * Synchronized by suballoc_mutex.
+    */
+   struct tu_suballocator suballoc;
+   std::mutex suballoc_mutex;
+
+   /* The next value to assign to tu6_global::autotune_fence; this is incremented during on_submit. */
+   uint32_t next_fence = 1;
+
+   /* A wrapper around a CS which sets the global autotune fence to a certain fence value. This allows for ergonomically
+    * managing the lifetime of the CS, including recycling it after the fence value has been reached.
+    */
+   struct submission_entry {
+    private:
+      uint32_t fence;
+      struct tu_cs fence_cs;
+
+    public:
+      explicit submission_entry(tu_device *device);
+
+      ~submission_entry();
+
+      /* Disable move/copy, since this holds stable pointers to the fence_cs. */
+      submission_entry(const submission_entry &) = delete;
+      submission_entry &operator=(const submission_entry &) = delete;
+      submission_entry(submission_entry &&) = delete;
+      submission_entry &operator=(submission_entry &&) = delete;
+
+      /* The current state of the submission entry; this is used to track whether the CS is available for reuse,
+       * pending GPU completion, or currently being processed.
+       */
+      bool is_active() const;
+
+      /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
+      struct tu_cs *try_get_cs(uint32_t new_fence);
+   };
+
+   /* Unified pool for submission CSes.
+    * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
+    */
+   std::deque<submission_entry> submission_entries;
+
+   /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
+   struct tu_cs *get_cs_for_fence(uint32_t fence);
+
+   /** RP Entry Management **/
+
+   struct rp_gpu_data;
+   struct tile_gpu_data;
+   struct rp_entry;
+
+   /* A wrapper over all entries associated with a single command buffer. */
+   struct rp_entry_batch {
+      bool active;    /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
+                         valid fence. */
+      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
+                         determine when the entries can be processed. */
+      std::vector<std::unique_ptr<rp_entry>> entries;
+
+      rp_entry_batch();
+
+      /* Disable the copy/move to avoid performance hazards. */
+      rp_entry_batch(const rp_entry_batch &) = delete;
+      rp_entry_batch &operator=(const rp_entry_batch &) = delete;
+      rp_entry_batch(rp_entry_batch &&) = delete;
+      rp_entry_batch &operator=(rp_entry_batch &&) = delete;
+
+      void assign_fence(uint32_t new_fence);
+   };
+
+   /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
+    * iteration and to ensure that we process the entries in the same order they were submitted.
+    */
+   std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
+
+   /* Handles processing of entry batches that are pending processing.
+    *
+    * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do
+    * this in the on_submit() method, which is called on every submit of a command buffer.
+    */
+   void process_entries();
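The active_batches deque and process_entries() above describe a drain-in-submission-order pattern. A simplified standalone sketch of the idea, with a plain integer standing in for the GPU-written fence and wrap-around handling omitted:

   #include <cstdint>
   #include <deque>
   #include <memory>

   struct batch {
      bool active = false; /* submitted to the GPU and assigned a fence */
      uint32_t fence = 0;  /* value the GPU writes once the batch completes */
   };

   /* Drop batches whose fence the GPU has already written, in submission order. */
   void process_entries(std::deque<std::shared_ptr<batch>> &batches,
                        uint32_t completed_fence)
   {
      while (!batches.empty()) {
         const auto &b = batches.front();
         if (!b->active || b->fence > completed_fence)
            break; /* later batches are newer, so stop at the first unfinished one */
         /* ...read back and accumulate the GPU results for this batch here... */
         batches.pop_front();
      }
   }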
+
+   /** Renderpass State Tracking **/
+
+   struct rp_history;
+   struct rp_history_handle;
+
+   /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
+    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
+    *
+    * Note: We can potentially include the vector of data we extract from the parameters to generate the hash into
+    * rp_key, which would lead to true value-based equality rather than just hash-based equality; that has a cost
+    * but avoids hash collisions causing issues.
+    */
+   struct rp_key {
+      uint64_t hash;
+
+      rp_key(const struct tu_render_pass *pass,
+             const struct tu_framebuffer *framebuffer,
+             const struct tu_cmd_buffer *cmd);
+
+      /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
+      rp_key(const rp_key &key, uint32_t duplicates);
+
+      /* Equality operator, used in unordered_map. */
+      constexpr bool operator==(const rp_key &other) const noexcept
+      {
+         return hash == other.hash;
+      }
+   };
+
+   /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
+    *
+    * Note: This should *NEVER* be used to calculate the hash itself, as that would lead to the hash being calculated
+    * multiple times rather than being calculated once and reused when there are multiple successive lookups, like
+    * with find_or_create_rp_history() and providing the hash to the rp_history constructor.
+    */
+   struct rp_hash {
+      constexpr size_t operator()(const rp_key &key) const noexcept
+      {
+         /* Note: This will throw away the upper 32 bits on 32-bit architectures. */
+         return static_cast<size_t>(key.hash);
+      }
+   };
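rp_key and rp_hash above implement the compute-the-hash-once pattern for unordered_map. A standalone illustration with simplified names:

   #include <cstdint>
   #include <unordered_map>

   struct key {
      uint64_t hash; /* computed once, up front, from the renderpass parameters */
      bool operator==(const key &o) const noexcept { return hash == o.hash; }
   };

   /* The map's hasher just forwards the stored value instead of rehashing. */
   struct key_hash {
      size_t operator()(const key &k) const noexcept { return (size_t)k.hash; }
   };

   int main()
   {
      std::unordered_map<key, int, key_hash> histories;
      key k{0x1234abcd5678ef00ull};
      histories[k] = 42;         /* insert */
      return histories.count(k); /* the same precomputed hash is reused for lookup */
   }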
+
+   /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
+   using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
+   rp_histories_t rp_histories;
+   std::shared_mutex rp_mutex;
+   uint64_t last_reap_ts = 0;
+
+   /* Note: These lock rp_mutex internally, no need to lock it. */
+   rp_history_handle find_rp_history(const rp_key &key);
+   rp_history_handle find_or_create_rp_history(const rp_key &key);
+   void reap_old_rp_histories();
+
+   /** Debug Performance Counters **/
+
+#if TU_AUTOTUNE_DEBUG_PERFCTR
+   const fd_perfcntr_group *cp_group;
+   const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count;
+#endif
+
+ public:
+   tu_autotune(struct tu_device *device, VkResult &result);
+
+   ~tu_autotune();
+
+   /* Opaque pointer to an internal structure with RP context that needs to be preserved across begin/end calls. */
+   using rp_ctx_t = rp_entry *;
+
+   /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
+    *
+    * Note: tu_cmd_buffer is only responsible for the lifetime of this object; all access to the context state is
+    * done through tu_autotune.
+    */
+   struct cmd_buf_ctx {
+    private:
+      /* A batch of all entries from RPs within this CB. */
+      std::shared_ptr<rp_entry_batch> batch;
+
+      /* Creates a new RP entry attached to this CB. */
+      rp_entry *
+      attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
+
+      rp_entry *find_rp_entry(const rp_key &key);
+
+      friend struct tu_autotune;
+
+    public:
+      cmd_buf_ctx();
+      ~cmd_buf_ctx();
+
+      /* Resets the internal context; should be called when the tu_cmd_buffer state has been reset. */
+      void reset();
+   };
+
+   enum class render_mode {
+      SYSMEM,
+      GMEM,
+   };
+
+   render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);
+
+   /* Returns the optimal tile size divisor for the given CB state. */
+   uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer);
+
+   /* Disables the preemption latency optimization within the autotuner. This is used when high-priority queues are
+    * present, to ensure that the autotuner does not interfere with the high-priority queue's performance.
+    *
+    * Note: This should be called before any renderpass is started, otherwise it may lead to undefined behavior.
+    */
+   void disable_preempt_optimize();
+
+   void
+   begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count);
+
+   void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);
+
+   void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
+
+   void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
+
+   /* The submit-time hook for the autotuner. This may return a CS (which can be NULL) that must be amended for
+    * autotuner tracking to function correctly.
+    *
+    * Note: This must be called from a single-threaded context. There should never be multiple threads calling this
+    * function at the same time.
+    */
+   struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
};
-
-/**
- * From the cmdstream, the captured samples-passed values are recorded
- * at the start and end of the batch.
- *
- * Note that we do the math on the CPU to avoid a WFI. But pre-emption
- * may force us to revisit that.
- */
-struct PACKED tu_renderpass_samples {
-   uint64_t samples_start;
-   /* hw requires the sample start/stop locations to be 128b aligned. */
-   uint64_t __pad0;
-   uint64_t samples_end;
-   uint64_t __pad1;
-};
-
-/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
-static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
-
-/**
- * Tracks the results from an individual renderpass. Initially created
- * per renderpass, and appended to the tail of at->pending_results. At a later
- * time, when the GPU has finished writing the results, we fill samples_passed.
- */
-struct tu_renderpass_result {
-   /* Points into GPU memory */
-   struct tu_renderpass_samples* samples;
-
-   struct tu_suballoc_bo bo;
-
-   /*
-    * Below here, only used internally within autotune
-    */
-   uint64_t rp_key;
-   struct tu_renderpass_history *history;
-   struct list_head node;
-   uint32_t fence;
-   uint64_t samples_passed;
-};
-
-VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
-void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
-
-bool tu_autotune_use_bypass(struct tu_autotune *at,
-                            struct tu_cmd_buffer *cmd_buffer,
-                            struct tu_renderpass_result **autotune_result);
-void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
-
-bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
-                                       uint32_t cmd_buffer_count);
-
-/**
- * A magic 8-ball that tells the gmem code whether we should do bypass mode
- * for moar fps.
- */
-struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
-                                    struct tu_autotune *at,
-                                    struct tu_cmd_buffer **cmd_buffers,
-                                    uint32_t cmd_buffer_count);
-
-struct tu_autotune_results_buffer;
-
-template <chip CHIP>
-void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
-                                  struct tu_cs *cs,
-                                  struct tu_renderpass_result *autotune_result);
-
-template <chip CHIP>
-void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
-                                struct tu_cs *cs,
-                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */
@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
      }
   }

-   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
+   cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd);
+
+   cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
+                                                        cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
}

struct apply_store_coords_state {
@ -14,6 +14,7 @@
#include "vk_render_pass.h"
#include "vk_util.h"

+#include "tu_autotune.h"
#include "tu_buffer.h"
#include "tu_clear_blit.h"
#include "tu_cs.h"
@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
-   const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
+   struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_tiling_config *tiling =
+      tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

   /* XFB commands are emitted for BINNING || SYSMEM, which makes it
@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
      return true;
   }

-   return vsc->binning;
+   return vsc->binning_possible && vsc->binning_useful;
}

static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
-                     struct tu_renderpass_result **autotune_result)
+                     tu_autotune::rp_ctx_t *rp_ctx)
{
   if (TU_DEBUG(SYSMEM)) {
      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
      return true;
   }

-   if (TU_DEBUG(GMEM))
+   if (TU_DEBUG(GMEM)) {
+      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)";
      return false;

-   bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
-                                            cmd, autotune_result);
-   if (*autotune_result) {
-      list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
   }

-   if (use_sysmem) {
+   /* This is a case where it's better to avoid GMEM: too many tiles, but no HW binning possible. */
+   if (!vsc->binning_possible && vsc->binning_useful) {
+      cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible";
+      return true;
+   }
+
+   bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
+   if (use_sysmem)
      cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
-   }

   return use_sysmem;
}
@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                        struct tu_renderpass_result *autotune_result)
+                        tu_autotune::rp_ctx_t rp_ctx)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
   }

-   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
+   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0);

   tu_cs_sanity_check(cs);
}
@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result)
+                      tu_autotune::rp_ctx_t rp_ctx)
{
-   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
-
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
    */

@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit(cs, 0); /* value */
   }

+   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
+
   tu_cs_sanity_check(cs);
}
@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result,
+                      tu_autotune::rp_ctx_t rp_ctx,
                      const VkOffset2D *fdm_offsets)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (use_cb)
      tu_trace_start_render_pass(cmd);

-   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
+   uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height;
+   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count);

   tu_cs_sanity_check(cs);
}
@ -3471,13 +3476,18 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                const struct tu_tile_config *tile,
-                bool fdm, const VkOffset2D *fdm_offsets)
+                bool fdm, const VkOffset2D *fdm_offsets,
+                tu_autotune::rp_ctx_t rp_ctx,
+                const struct tu_vsc_config *vsc)
{
+   uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x;
   tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
   tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);

   trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd);

+   cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx);
+
   /* Primitives that passed all tests are still counted in in each
    * tile even with HW binning beforehand. Do not permit it.
    */

@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (cmd->state.prim_generated_query_running_before_rp)
      tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);

+   cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx);
+
   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                    struct tu_renderpass_result *autotune_result)
+                    tu_autotune::rp_ctx_t rp_ctx)
{
-   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
-
   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_lrz_tiling_end<CHIP>(cmd, cs);

@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);

+   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
+
   tu_cs_sanity_check(cs);
}
@ -3796,7 +3808,9 @@ void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
                   uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
                   const struct tu_image_view *fdm,
-                   const VkOffset2D *fdm_offsets)
+                   const VkOffset2D *fdm_offsets,
+                   tu_autotune::rp_ctx_t rp_ctx,
+                   const struct tu_vsc_config *vsc)
{
   uint32_t width = tx2 - tx1;
   uint32_t height = ty2 - ty1;

@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
         continue;

      tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
-                            true, fdm_offsets);
+                            true, fdm_offsets,
+                            rp_ctx, vsc);
      }
   }
}
@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
-                    struct tu_renderpass_result *autotune_result,
+                    tu_autotune::rp_ctx_t rp_ctx,
                    const VkOffset2D *fdm_offsets)
{
   const struct tu_tiling_config *tiling = cmd->state.tiling;

@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
   tu_cs_end(&cmd->tile_store_cs);

-   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
+   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);

   /* Note: we reverse the order of walking the pipes and tiles on every
    * other row, to improve texture cache locality compared to raster order.

@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

      if (merge_tiles) {
         tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
-                                  fdm_offsets);
+                                  fdm_offsets, rp_ctx, vsc);
         continue;
      }

@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
            tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);

            tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
-                                  fdm_offsets);
+                                  fdm_offsets,
+                                  rp_ctx, vsc);
         }
         slot_row += tile_row_stride;
      }
   }
}

-   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);

   tu_trace_end_render_pass<CHIP>(cmd, true);
@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
-                     struct tu_renderpass_result *autotune_result)
+                     tu_autotune::rp_ctx_t rp_ctx)
{
   VkResult result = tu_allocate_transient_attachments(cmd, true);

@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

   tu_trace_start_render_pass(cmd);

-   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);

   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);

@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);

-   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
+   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);

   tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace,
                        cmd->trace_renderpass_start,

@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
   if (cmd_buffer->state.rp.has_tess)
      tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);

-   struct tu_renderpass_result *autotune_result = NULL;
-   if (use_sysmem_rendering(cmd_buffer, &autotune_result))
-      tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
+   tu_autotune::rp_ctx_t rp_ctx = NULL;
+   if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
+      tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
   else
-      tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
+      tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);

   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.clear_values = NULL;
   cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
+   cmd_buffer->state.gmem_layout_divisor = 0;
   cmd_buffer->state.renderpass_cb_disabled = false;
   memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));

@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
   u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
   cmd_buffer->trace_renderpass_start =
      u_trace_begin_iterator(&cmd_buffer->rp_trace);
-   list_inithead(&cmd_buffer->renderpass_autotune_results);
+   new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();

   if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
      cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);

@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
   u_trace_fini(&cmd_buffer->trace);
   u_trace_fini(&cmd_buffer->rp_trace);

-   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
+   cmd_buffer->autotune_ctx.~cmd_buf_ctx();

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      if (cmd_buffer->descriptors[i].push_set.layout)

@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
   tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
   tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);

-   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
+   cmd_buffer->autotune_ctx.reset();

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
   cmd->state.clear_values = suspended->state.suspended_pass.clear_values;
   cmd->state.render_area = suspended->state.suspended_pass.render_area;
   cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
-   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
+   cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor;
+   cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
+                                                        cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
   cmd->state.lrz = suspended->state.suspended_pass.lrz;
}

@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
    * (perf queries), then we can't do this optimization since the
    * start-of-the-CS geometry condition will have been overwritten.
    */
-   bool cond_load_allowed = vsc->binning &&
+   bool cond_load_allowed = vsc->binning_possible &&
                            cmd->state.pass->has_cond_load_store &&
                            !cmd->state.rp.draw_cs_writes_to_cond_pred;

@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
      cmd->state.suspended_pass.attachments = cmd->state.attachments;
      cmd->state.suspended_pass.clear_values = cmd->state.clear_values;
      cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
+      cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor;
   }

   tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);
@ -524,11 +524,12 @@ struct tu_cmd_state
   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
-   enum tu_gmem_layout gmem_layout;
+   tu_gmem_layout gmem_layout;
+   uint32_t gmem_layout_divisor;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
-   const struct tu_framebuffer *framebuffer;
+   struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

@ -543,9 +544,10 @@ struct tu_cmd_state
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
-      const struct tu_framebuffer *framebuffer;
+      struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;
+      uint32_t gmem_layout_divisor;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

@ -644,8 +646,7 @@ struct tu_cmd_buffer
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace trace, rp_trace;

-   struct list_head renderpass_autotune_results;
-   struct tu_autotune_results_buffer* autotune_buffer;
+   tu_autotune::cmd_buf_ctx autotune_ctx;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;
@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = {
      DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false)
      DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false)
      DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
+      DRI_CONF_TU_AUTOTUNE_ALGORITHM()
   DRI_CONF_SECTION_END
};

@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance)
      driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction");
   instance->enable_softfloat32 =
      driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32");
+   instance->autotune_algo =
+      driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
}

static uint32_t instance_count = 0;
@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
{
   mtx_destroy(&device->bo_mutex);
   mtx_destroy(&device->pipeline_mutex);
-   mtx_destroy(&device->autotune_mutex);
   mtx_destroy(&device->kgsl_profiling_mutex);
   mtx_destroy(&device->event_mutex);
   mtx_destroy(&device->trace_mutex);

@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   VkResult result;
   struct tu_device *device;
   bool border_color_without_format = false;
+   bool autotune_disable_preempt_optimize = false;

   vk_foreach_struct_const (ext, pCreateInfo->pNext) {
      switch (ext->sType) {

@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,

   mtx_init(&device->bo_mutex, mtx_plain);
   mtx_init(&device->pipeline_mutex, mtx_plain);
-   mtx_init(&device->autotune_mutex, mtx_plain);
   mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
   mtx_init(&device->event_mutex, mtx_plain);
   mtx_init(&device->trace_mutex, mtx_plain);

@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      const VkDeviceQueueCreateInfo *queue_create =
         &pCreateInfo->pQueueCreateInfos[i];
+      const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
+         vk_find_struct_const(queue_create->pNext,
+                              DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+      const VkQueueGlobalPriorityKHR global_priority = priority_info ?
+         priority_info->globalPriority :
+         (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
+                             VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
      uint32_t qfi = queue_create->queueFamilyIndex;
      enum tu_queue_type type = physical_device->queue_families[qfi].type;
      device->queues[qfi] = (struct tu_queue *) vk_alloc(
@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
      device->queue_count[qfi] = queue_create->queueCount;

      for (unsigned q = 0; q < queue_create->queueCount; q++) {
-         result = tu_queue_init(device, &device->queues[qfi][q], type, q,
-                                queue_create);
+         result = tu_queue_init(device, &device->queues[qfi][q], type,
+                                global_priority, q, queue_create);
         if (result != VK_SUCCESS) {
            device->queue_count[qfi] = q;
            goto fail_queues;
         }
      }

+      autotune_disable_preempt_optimize |=
+         (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR);
   }

   result = vk_meta_device_init(&device->vk, &device->meta);

@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                              TU_BO_ALLOC_ALLOW_DUMP |
                              TU_BO_ALLOC_INTERNAL_RESOURCE),
                             "pipeline_suballoc");
-   tu_bo_suballocator_init(&device->autotune_suballoc, device,
-                           128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
-                           "autotune_suballoc");
   if (is_kgsl(physical_device->instance)) {
      tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
                              128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   }
   pthread_condattr_destroy(&condattr);

-   result = tu_autotune_init(&device->autotune, device);
-   if (result != VK_SUCCESS) {
+   device->autotune = new tu_autotune(device, result);
+   if (result != VK_SUCCESS)
      goto fail_timeline_cond;
-   }
+
+   if (autotune_disable_preempt_optimize)
+      device->autotune->disable_preempt_optimize();

   device->use_z24uint_s8uint =
      physical_device->info->props.has_z24uint_s8uint &&

@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
      free(device->dbg_renderpass_stomp_cs);
   }

-   tu_autotune_fini(&device->autotune, device);
+   delete device->autotune;

   tu_bo_suballocator_finish(&device->pipeline_suballoc);
-   tu_bo_suballocator_finish(&device->autotune_suballoc);
   tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
   tu_bo_suballocator_finish(&device->event_suballoc);
   tu_bo_suballocator_finish(&device->vis_stream_suballocator);
@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device,
      }
   }

-   tu_framebuffer_tiling_config(framebuffer, device, pass);
+   tu_framebuffer_init_tiling_config(framebuffer, device, pass);

   /* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */
   if (msrtss_attachment_count > 0) {

@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
         view->image->max_tile_h_constraint_fdm;
   }

-   tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass);
+   tu_framebuffer_init_tiling_config(framebuffer, cmd_buffer->device, pass);
}

VkResult
@ -28,6 +28,7 @@
#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"
+#include "util/rwlock.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

@ -233,6 +234,9 @@ struct tu_instance
    * However we don't want native Vulkan apps using this.
    */
   bool enable_softfloat32;

+   /* Configuration option to use a specific autotune algorithm by default. */
+   const char *autotune_algo;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
@@ -265,7 +269,12 @@ struct tu6_global

    volatile uint32_t vtx_stats_query_not_running;

-   /* To know when renderpass stats for autotune are valid */
+   /* A fence with a monotonically increasing value that is
+    * incremented by the GPU on each submission that includes
+    * a tu_autotune::submission_entry CS. This is used to track
+    * which submissions have been processed by the GPU before
+    * processing the autotune packet on the CPU.
+    */
    volatile uint32_t autotune_fence;

    /* For recycling command buffers for dynamic suspend/resume comamnds */
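For a sense of how such a monotonically increasing fence is typically consumed, a minimal sketch follows; the helper below is hypothetical and not part of turnip's API, it merely restates the new comment in code.

/* Hypothetical sketch, not driver code: a submission's autotune results are
 * safe to read once the GPU has advanced the fence to (or past) the value
 * associated with that submission. */
#include <stdint.h>

static bool autotune_results_ready(volatile const uint32_t *autotune_fence,
                                   uint32_t submission_fence_value)
{
   return *autotune_fence >= submission_fence_value;
}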
@@ -355,12 +364,6 @@ struct tu_device
    struct tu_suballocator pipeline_suballoc;
    mtx_t pipeline_mutex;

-   /* Device-global BO suballocator for reducing BO management for small
-    * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
-    */
-   struct tu_suballocator autotune_suballoc;
-   mtx_t autotune_mutex;
-
    /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
     * each submission.
     */
@@ -462,7 +465,7 @@ struct tu_device
    pthread_cond_t timeline_cond;
    pthread_mutex_t submit_mutex;

-   struct tu_autotune autotune;
+   struct tu_autotune *autotune;

    struct breadcrumbs_context *breadcrumbs_ctx;
@@ -547,8 +550,11 @@ struct tu_vsc_config {
    /* Whether binning could be used for gmem rendering using this framebuffer. */
    bool binning_possible;

-   /* Whether binning should be used for gmem rendering using this framebuffer. */
-   bool binning;
+   /* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether
+    * binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance
+    * hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so.
+    */
+   bool binning_useful;

    /* pipe register values */
    uint32_t pipe_config[MAX_VSC_PIPES];
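Stated as code, the relationship described in the new comment amounts to the following; this helper is hypothetical and only illustrates the intended policy, it is not defined anywhere in this diff.

/* Hypothetical illustration: GMEM rendering stays a reasonable candidate
 * unless binning would be useful for this framebuffer but is not actually
 * possible with the hardware configuration at hand. */
static bool gmem_candidate(bool binning_useful, bool binning_possible)
{
   return !binning_useful || binning_possible;
}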
@@ -577,7 +583,8 @@ struct tu_framebuffer

    uint32_t max_tile_w_constraint;
    uint32_t max_tile_h_constraint;
-   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
+   uint32_t initd_divisor; /* The tile divisors up to this have been initialized, for lazy init. */
+   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX];

    uint32_t attachment_count;
    const struct tu_image_view *attachments[0];
@@ -22,6 +22,8 @@ enum tu_gmem_layout
    TU_GMEM_LAYOUT_COUNT,
 };

+constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* 1x (no divisor), 2 (1/2), 3 (1/3) */
+
 struct tu_subpass_barrier {
    VkPipelineStageFlags2 src_stage_mask;
    VkPipelineStageFlags2 dst_stage_mask;
@@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
    struct tu_device *device = queue->device;
    bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
    struct util_dynarray dump_cmds;
+   struct tu_cs *autotune_cs = NULL;

    if (vk_submit->buffer_bind_count ||
        vk_submit->image_bind_count ||
@@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
       }
    }

-   if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
-      struct tu_cs *autotune_cs = tu_autotune_on_submit(
-         device, &device->autotune, cmd_buffers, cmdbuf_count);
+   autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
+   if (autotune_cs) {
       submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
                          autotune_cs->entry_count);
    }
@@ -605,17 +605,10 @@ VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
               enum tu_queue_type type,
+              const VkQueueGlobalPriorityKHR global_priority,
               int idx,
               const VkDeviceQueueCreateInfo *create_info)
 {
-   const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
-      vk_find_struct_const(create_info->pNext,
-                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
-   const VkQueueGlobalPriorityKHR global_priority = priority_info ?
-      priority_info->globalPriority :
-      (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
-       VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
-
    const int priority = tu_get_submitqueue_priority(
       device->physical_device, global_priority, type,
       device->vk.enabled_features.globalPriorityQuery);
@@ -43,6 +43,7 @@ VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
               enum tu_queue_type type,
+              const VkQueueGlobalPriorityKHR global_priority,
               int idx,
               const VkDeviceQueueCreateInfo *create_info);
@@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc)
    return tiles_per_pipe <= 32;
 }

+static void
+tu_tiling_config_divide_tile(const struct tu_device *dev,
+                             const struct tu_render_pass *pass,
+                             const struct tu_framebuffer *fb,
+                             const struct tu_tiling_config *tiling,
+                             struct tu_tiling_config *new_tiling,
+                             uint32_t divisor)
+{
+   assert(divisor > 0);
+
+   *new_tiling = *tiling;
+   if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) {
+      /* If the divisor is 1, or if the tiling is not possible, or if the
+       * tiling is invalid, just return the original tiling. */
+      return;
+   }
+
+   /* Get the hardware-specified alignment values. */
+   const uint32_t tile_align_w = pass->tile_align_w;
+   const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
+
+   /* Divide the current tile dimensions by the divisor. */
+   uint32_t new_tile_width = tiling->tile0.width / divisor;
+   uint32_t new_tile_height = tiling->tile0.height / divisor;
+
+   /* Clamp to the minimum alignment if necessary and align down. */
+   if (new_tile_width < tile_align_w)
+      new_tile_width = tile_align_w;
+   else
+      new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w);
+
+   if (new_tile_height < tile_align_h)
+      new_tile_height = tile_align_h;
+   else
+      new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h);
+
+   new_tiling->tile0.width = new_tile_width;
+   new_tiling->tile0.height = new_tile_height;
+
+   /* Recalculate the tile count from the framebuffer dimensions to ensure
+    * full coverage. */
+   new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width);
+   new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height);
+}
+
 static void
 tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc,
                                     const struct tu_device *dev,
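As a rough worked example of the division above (a standalone sketch: the 1920x1080 framebuffer, 256x256 base tile, divisor of 2, and 32x16 tile alignments are all assumed values, since the real numbers come from the render pass and device info):

/* Standalone sketch of the tile-division math, not driver code. */
#include <stdint.h>
#include <stdio.h>

static uint64_t round_down_to_npot(uint64_t value, uint32_t alignment)
{
   return value - (value % alignment);
}

int main()
{
   const uint32_t fb_w = 1920, fb_h = 1080;   /* assumed framebuffer size */
   const uint32_t align_w = 32, align_h = 16; /* assumed tile alignments */
   const uint32_t divisor = 2;
   uint32_t tile_w = 256 / divisor;           /* 128 */
   uint32_t tile_h = 256 / divisor;           /* 128 */

   tile_w = tile_w < align_w ? align_w : round_down_to_npot(tile_w, align_w);
   tile_h = tile_h < align_h ? align_h : round_down_to_npot(tile_h, align_h);

   /* Prints: 128x128 tiles, 15x9 tile grid */
   printf("%ux%u tiles, %ux%u tile grid\n", tile_w, tile_h,
          (fb_w + tile_w - 1) / tile_w, (fb_h + tile_h - 1) / tile_h);
   return 0;
}

With these assumed numbers, halving a 256x256 tile stays aligned, so the 1080p framebuffer goes from roughly an 8x5 grid to a 15x9 grid of smaller tiles.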
@@ -460,22 +505,18 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc,
 static void
 tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device)
 {
-   if (vsc->binning_possible) {
-      vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2;
+   vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2;

    if (TU_DEBUG(FORCEBIN))
-      vsc->binning = true;
+      vsc->binning_useful = true;
    if (TU_DEBUG(NOBIN))
-      vsc->binning = false;
-   } else {
-      vsc->binning = false;
-   }
+      vsc->binning_useful = false;
 }

 void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
+tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
                              const struct tu_device *device,
                              const struct tu_render_pass *pass)
 {
    for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
       struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
@@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
          tu_tiling_config_update_binning(fdm_offset_vsc, device);
       }
    }

+   fb->initd_divisor = 1;
+}
+
+const struct tu_tiling_config *
+tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
+                                 const struct tu_device *device,
+                                 const struct tu_render_pass *pass,
+                                 int gmem_layout,
+                                 uint32_t divisor)
+{
+   assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX);
+   assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to
+                                              appropriately size the tiles for the framebuffer. */
+   struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout];
+
+   if (divisor > fb->initd_divisor) {
+      const struct tu_tiling_config *base_tiling =
+         tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1);
+      tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor);
+
+      struct tu_vsc_config *vsc = &tiling->vsc;
+      if (tiling->possible) {
+         tu_tiling_config_update_pipe_layout(vsc, device, false);
+         tu_tiling_config_update_pipes(vsc, device);
+         tu_tiling_config_update_binning(vsc, device);
+
+         struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
+         fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 };
+      }
+
+      if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */
+          (vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */
+          (vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */
+          ) {
+         /* Revert to the previous level's tiling configuration. */
+         *tiling = *base_tiling;
+      }
+
+      fb->initd_divisor = divisor;
+   }
+
+   return tiling;
 }

 void
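The lazy initialization above fills fb->tiling on demand, with the entry for a given (gmem_layout, divisor) pair living at index TU_GMEM_LAYOUT_COUNT * (divisor - 1) + gmem_layout. A small sketch of that mapping follows; the layout count of 2 is a stand-in chosen for illustration, not necessarily the driver's actual value.

// Sketch of the flat (layout, divisor) -> index mapping; stand-in constants.
#include <cassert>
#include <cstdio>

constexpr int LAYOUT_COUNT = 2; // stand-in for TU_GMEM_LAYOUT_COUNT
constexpr int DIVISOR_MAX = 6;  // stand-in for TU_GMEM_LAYOUT_DIVISOR_MAX

static int tiling_index(int gmem_layout, int divisor)
{
   assert(divisor >= 1 && divisor <= DIVISOR_MAX);
   return LAYOUT_COUNT * (divisor - 1) + gmem_layout;
}

int main()
{
   // Divisor 1 occupies indices 0..1, divisor 2 indices 2..3, and so on, so
   // each divisor level gets its own contiguous group of layouts.
   for (int divisor = 1; divisor <= 3; divisor++)
      for (int layout = 0; layout < LAYOUT_COUNT; layout++)
         std::printf("layout %d, divisor %d -> index %d\n",
                     layout, divisor, tiling_index(layout, divisor));
   return 0;
}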
@@ -136,9 +136,16 @@ __tu_finishme(const char *file, int line, const char *format, ...)
 } while (0)

 void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
+tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
                              const struct tu_device *device,
                              const struct tu_render_pass *pass);

+const struct tu_tiling_config *
+tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
+                                 const struct tu_device *device,
+                                 const struct tu_render_pass *pass,
+                                 int gmem_layout,
+                                 uint32_t divisor);
+
 #define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
@@ -657,6 +657,10 @@
    DRI_CONF_OPT_B(tu_enable_softfloat32, def, \
                   "Enable softfloat emulation for float32 denormals")

+#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \
+   DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
+                        "Set the preferred autotune algorithm")
+
 /**
  * \brief Honeykrisp specific configuration options
  */
@@ -28,10 +28,18 @@
 #include <stdint.h>
 #include <stdbool.h>

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 uint64_t
 rand_xorshift128plus(uint64_t seed[2]);

 void
 s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed);

+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
 #endif /* RAND_XOR_H */
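The extern "C" guards added above let C++ translation units include this header and link against the existing C implementation without name mangling getting in the way. A minimal usage sketch, where the include path is an assumption:

// Minimal C++ usage sketch enabled by the guards above (include path assumed).
#include <cstdint>
#include "util/rand_xor.h"

static uint64_t next_sample(uint64_t seed[2])
{
   // Resolves against the C symbol thanks to the header's extern "C" linkage.
   return rand_xorshift128plus(seed);
}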
@@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment)
    return ((value) & ~(uint64_t)(alignment - 1));
 }

+static inline uint64_t
+ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
+{
+   return value - (value % alignment);
+}
+
 /**
  * Align a value, only works pot alignemnts.
  */
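A quick numeric check of the new helper, with arbitrary illustrative values: 1000 % 96 is 40, so rounding 1000 down to a multiple of 96 gives 960. The existing power-of-two-only ROUND_DOWN_TO simply masks low bits and would return 928 for the same inputs, which is why a separate non-power-of-two variant is needed for the tile alignments used above.

/* Illustrative compile-time check of the example above. */
static_assert(1000 - (1000 % 96) == 960, "ROUND_DOWN_TO_NPOT example");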