Merge branch 'tu-newat' into 'main'

turnip: Autotuner Overhaul

See merge request mesa/mesa!37802

commit adbb7f760f
17 changed files with 2309 additions and 848 deletions
@@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi

Some of these options will behave differently when toggled at runtime, for example:
``nolrz`` will still result in LRZ allocation, which would not happen if the option
was set in the environment variable.

Autotune
^^^^^^^^

Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
autotune system, the behavior of which can be controlled with the following
environment variables:

.. envvar:: TU_AUTOTUNE_ALGO

   Selects the algorithm used for autotuning. Supported values are:

   ``bandwidth``
      Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and
      chooses the one with the lower estimated bandwidth.

   ``profiled``
      Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses that
      to move a probability distribution towards the optimal choice over time.
      This algorithm tends to be far more accurate than the bandwidth algorithm at
      choosing the optimal rendering mode, but may result in larger FPS variance
      due to being based on a probability distribution with random sampling. This
      is the default algorithm.

   ``profiled_imm``
      Similar to ``profiled``, but only profiles the first few instances of a RP
      and then sticks to the chosen mode for subsequent instances. This is meant
      for single-frame traces run multiple times in CI, where this algorithm can
      immediately choose the optimal rendering mode for each RP.

   ``prefer_sysmem``
      Always chooses SYSMEM rendering. This is useful for games that don't benefit
      from GMEM rendering due to their rendering patterns; setting this is better
      than using ``TU_DEBUG=sysmem`` when done for performance reasons.

   The algorithm can also be set via the driconf option ``tu_autotune_algorithm``.

.. envvar:: TU_AUTOTUNE_FLAGS

   Modifies the behavior of the selected algorithm. Supported flags are:

   ``big_gmem``
      Always chooses GMEM rendering if the number of draw calls in the render pass
      is greater than a certain threshold. Larger RPs generally benefit more from
      GMEM rendering due to less overhead from tiling.

   ``small_sysmem``
      Always chooses SYSMEM rendering if the number of draw calls in the render
      pass is lower than a certain threshold. The benefits of GMEM rendering are
      less pronounced in these smaller RPs, and SYSMEM rendering tends to win more
      often.

   ``preempt_optimize``
      Tries to keep the non-preemptible time in the render pass below a certain
      threshold. This is useful for systems with GPU-based compositors, where long
      non-preemptible times can lead to missed frame deadlines, causing noticeable
      stuttering. This flag reduces the performance of the render pass in order to
      improve overall system responsiveness; it should not be used unless the rest
      of the system is affected by preemption delays.

   Multiple flags can be combined by separating them with commas, e.g.
   ``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.

   If no flags are specified, the default behavior is used.
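As a rough illustration of how a comma-separated flag list like this can be
consumed, here is a minimal C++ sketch; the function and enum names are
hypothetical, not Turnip's actual parser::

   /* Hypothetical sketch: split a TU_AUTOTUNE_FLAGS-style string on commas
    * and accumulate a bitmask. Only the flag names come from the docs above;
    * everything else is illustrative.
    */
   #include <cstdint>
   #include <string_view>

   enum : uint32_t {
      FLAG_BIG_GMEM = 1u << 0,
      FLAG_SMALL_SYSMEM = 1u << 1,
      FLAG_PREEMPT_OPTIMIZE = 1u << 2,
   };

   static uint32_t
   parse_autotune_flags(const char *env)
   {
      uint32_t flags = 0;
      std::string_view s = env ? env : "";
      while (!s.empty()) {
         size_t comma = s.find(',');
         std::string_view token = s.substr(0, comma);
         if (token == "big_gmem")
            flags |= FLAG_BIG_GMEM;
         else if (token == "small_sysmem")
            flags |= FLAG_SMALL_SYSMEM;
         else if (token == "preempt_optimize")
            flags |= FLAG_PREEMPT_OPTIMIZE;
         s = comma == std::string_view::npos ? std::string_view{} : s.substr(comma + 1);
      }
      return flags;
   }

For example, ``parse_autotune_flags("big_gmem,small_sysmem")`` would return
``FLAG_BIG_GMEM | FLAG_SMALL_SYSMEM``.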
@@ -4,7 +4,7 @@ DisableFormat: false

AlwaysBreakAfterReturnType: TopLevel
BinPackParameters: false
ColumnLimit: 78
ColumnLimit: 120
Cpp11BracedListStyle: false

IncludeBlocks: Regroup

(File diff suppressed because it is too large.)
@@ -8,150 +8,265 @@

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"
#include <atomic>
#include <deque>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

#include "tu_cs.h"
#include "tu_suballoc.h"

struct tu_renderpass_history;
/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */
#define TU_AUTOTUNE_DEBUG_PERFCTR 0

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without having some additional information:
 *
 * (1) If you know you are touching every pixel (ie. there is a clear),
 *     then the GMEM path will at least not cost more memory bandwidth than
 *     sysmem[1]
 *
 * (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *     if there is a sysmem->GMEM restore pass.
 *
 * (3) If you see a high draw count, that is an indication that there will be
 *     enough pixels accessed multiple times to benefit from the reduced
 *     memory bandwidth that GMEM brings
 *
 * (4) But high draw count where there is not much overdraw can actually be
 *     faster in bypass mode if it is pushing a lot of state change, due to
 *     not having to go thru the state changes per-tile[1]
 *
 * The approach taken is to measure the samples-passed for the batch to estimate
 * the amount of overdraw to detect cases where the number of pixels touched is
 * low.
 *
 * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
/* Autotune allows us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
 * dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
 * analysis, since we can adapt to the actual workload rather than relying on heuristics.
 */
struct tu_autotune {

   /* We may have to disable the autotuner if there are too many
    * renderpasses in-flight.
    */
   bool enabled;

 private:
   bool enabled = true;
   struct tu_device *device;

   /**
    * Cache to map renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;
   /** Configuration **/

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
    */
   struct list_head pending_results;
   enum class algorithm : uint8_t;
   enum class mod_flag : uint8_t;
   enum class metric_flag : uint8_t;
   /* Container for all autotune configuration options. */
   struct PACKED config_t;
   union PACKED packed_config_t;

   /**
    * List of per-submission data that we may want to free after we
    * processed submission results.
    * This could happen after command buffers which were in the submission
    * are destroyed.
    */
   struct list_head pending_submission_data;
   /* Allows for thread-safe access to the configurations. */
   struct atomic_config_t {
    private:
      std::atomic<uint32_t> config_bits = 0;

   /**
    * List of per-submission data that has been finished and can be reused.
    */
   struct list_head submission_data_pool;
    public:
      atomic_config_t(config_t initial_config);

   uint32_t fence_counter;
   uint32_t idx_counter;
      config_t load() const;

      bool compare_and_store(config_t updated, config_t expected);
   } active_config;

   config_t get_env_config();
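A minimal sketch of what a bit-packed, lock-free config like `atomic_config_t`
can look like, assuming the config fits in 32 bits; the layout and names below
(`config_sketch_t`, `bits`) are illustrative, not the actual Turnip definitions:

/* Sketch only: pack a small config struct into a uint32_t so readers can
 * load it and writers can update it with a single CAS, without a mutex.
 */
#include <atomic>
#include <cstdint>
#include <cstring>

struct config_sketch_t {   /* hypothetical layout */
   uint8_t algorithm;
   uint8_t mod_flags;
   uint16_t reserved;
};
static_assert(sizeof(config_sketch_t) == sizeof(uint32_t));

struct atomic_config_sketch_t {
   std::atomic<uint32_t> bits{0};

   config_sketch_t load() const {
      uint32_t v = bits.load(std::memory_order_acquire);
      config_sketch_t c;
      std::memcpy(&c, &v, sizeof(c));
      return c;
   }

   /* Returns false if another thread changed the config since `expected`
    * was loaded; the caller can then re-read and retry.
    */
   bool compare_and_store(config_sketch_t updated, config_sketch_t expected) {
      uint32_t u, e;
      std::memcpy(&u, &updated, sizeof(u));
      std::memcpy(&e, &expected, sizeof(e));
      return bits.compare_exchange_strong(e, u, std::memory_order_acq_rel);
   }
};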
   /** Global Fence and Internal CS Management **/

   /* BO suballocator for reducing BO management for small GMEM/SYSMEM autotune result buffers.
    * Synchronized by suballoc_mutex.
    */
   struct tu_suballocator suballoc;
   std::mutex suballoc_mutex;

   /* The next value to assign to tu6_global::autotune_fence; this is incremented during on_submit. */
   uint32_t next_fence = 1;

   /* A wrapper around a CS which sets the global autotune fence to a certain fence value; this allows for ergonomically
    * managing the lifetime of the CS, including recycling it after the fence value has been reached.
    */
   struct submission_entry {
    private:
      uint32_t fence;
      struct tu_cs fence_cs;

    public:
      explicit submission_entry(tu_device *device);

      ~submission_entry();

      /* Disable move/copy, since this holds stable pointers to the fence_cs. */
      submission_entry(const submission_entry &) = delete;
      submission_entry &operator=(const submission_entry &) = delete;
      submission_entry(submission_entry &&) = delete;
      submission_entry &operator=(submission_entry &&) = delete;

      /* The current state of the submission entry; this is used to track whether the CS is available for reuse, pending
       * GPU completion, or currently being processed.
       */
      bool is_active() const;

      /* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
      struct tu_cs *try_get_cs(uint32_t new_fence);
   };

   /* Unified pool for submission CSes.
    * Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
    */
   std::deque<submission_entry> submission_entries;

   /* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
   struct tu_cs *get_cs_for_fence(uint32_t fence);
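One plausible shape for this lookup over the pool above — a sketch, not the
actual implementation from this MR:

struct tu_cs *
tu_autotune::get_cs_for_fence(uint32_t fence)
{
   /* Reuse the first entry whose previous fence value the GPU has already
    * written; try_get_cs() re-tags it with the new fence and returns its CS,
    * or nullptr while the entry is still in flight.
    */
   for (auto &entry : submission_entries) {
      if (struct tu_cs *cs = entry.try_get_cs(fence))
         return cs;
   }

   /* Nothing reusable yet: grow the pool. A deque never relocates existing
    * elements, which is why submission_entry can hold stable CS pointers.
    */
   submission_entries.emplace_back(device);
   return submission_entries.back().try_get_cs(fence);
}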
   /** RP Entry Management **/

   struct rp_gpu_data;
   struct tile_gpu_data;
   struct rp_entry;

   /* A wrapper over all entries associated with a single command buffer. */
   struct rp_entry_batch {
      bool active;    /* If the entry is ready to be processed, i.e. the entry is submitted to the GPU queue and has a
                         valid fence. */
      uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
                         determine when the entries can be processed. */
      std::vector<std::unique_ptr<rp_entry>> entries;

      rp_entry_batch();

      /* Disable the copy/move to avoid performance hazards. */
      rp_entry_batch(const rp_entry_batch &) = delete;
      rp_entry_batch &operator=(const rp_entry_batch &) = delete;
      rp_entry_batch(rp_entry_batch &&) = delete;
      rp_entry_batch &operator=(rp_entry_batch &&) = delete;

      void assign_fence(uint32_t new_fence);
   };

   /* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
    * iteration and to ensure that we process the entries in the same order they were submitted.
    */
   std::deque<std::shared_ptr<rp_entry_batch>> active_batches;

   /* Handles processing of entry batches that are pending processing.
    *
    * Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
    * in the on_submit() method, which is called on every submit of a command buffer.
    */
   void process_entries();

   /** Renderpass State Tracking **/

   struct rp_history;
   struct rp_history_handle;

   /* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
    * be stable across runs, so it can be used to identify the same renderpass instance consistently.
    *
    * Note: We could potentially include the vector of data we extract from the parameters to generate the hash in
    * rp_key, which would give true value-based equality rather than just hash-based equality; this has a cost but
    * avoids hash collisions causing issues.
    */
   struct rp_key {
      uint64_t hash;

      rp_key(const struct tu_render_pass *pass,
             const struct tu_framebuffer *framebuffer,
             const struct tu_cmd_buffer *cmd);

      /* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
      rp_key(const rp_key &key, uint32_t duplicates);

      /* Equality operator, used in unordered_map. */
      constexpr bool operator==(const rp_key &other) const noexcept
      {
         return hash == other.hash;
      }
   };
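A sketch of the duplicate-salting idea: fold the duplicate counter into the
existing 64-bit hash so the Nth instance of the same RP in a command buffer
gets its own history slot. The mixer below is a generic boost-style hash
combiner, illustrative rather than the hash Turnip actually uses:

/* Sketch: derive a distinct, still-deterministic key for the Nth duplicate
 * of a renderpass by mixing the duplicate index into the base hash.
 */
#include <cstdint>

static inline uint64_t
hash_combine64(uint64_t h, uint64_t v)
{
   /* Constant from splitmix64-style mixing; any good mixer works here. */
   h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
   return h;
}

struct rp_key_sketch {
   uint64_t hash;

   rp_key_sketch(uint64_t base_hash, uint32_t duplicates)
      : hash(hash_combine64(base_hash, duplicates)) {}
};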
   /* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
    *
    * Note: This should *NEVER* be used to calculate the hash itself, as that would lead to the hash being calculated
    * multiple times rather than being calculated once and reused across successive lookups, like with
    * find_or_create_rp_history() and providing the hash to the rp_history constructor.
    */
   struct rp_hash {
      constexpr size_t operator()(const rp_key &key) const noexcept
      {
         /* Note: This will throw away the upper 32 bits on 32-bit architectures. */
         return static_cast<size_t>(key.hash);
      }
   };

   /* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
   using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
   rp_histories_t rp_histories;
   std::shared_mutex rp_mutex;
   uint64_t last_reap_ts = 0;

   /* Note: These will lock rp_mutex internally; no need to lock it. */
   rp_history_handle find_rp_history(const rp_key &key);
   rp_history_handle find_or_create_rp_history(const rp_key &key);
   void reap_old_rp_histories();
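A common shape for find_or_create under a std::shared_mutex is a shared lock
for the frequent hit path and an exclusive lock only on a miss. A sketch with
stand-in types (the real code returns an rp_history_handle that keeps the
entry alive; here a plain pointer is returned, and the mapped type is assumed
constructible from the key's hash):

#include <shared_mutex>
#include <unordered_map>

template <typename Map, typename Key>
typename Map::mapped_type *
find_or_create(Map &map, std::shared_mutex &mutex, const Key &key)
{
   {
      /* Fast path: concurrent readers share the lock. */
      std::shared_lock read_lock(mutex);
      auto it = map.find(key);
      if (it != map.end())
         return &it->second;
   }

   /* Slow path: exclusive lock. Use try_emplace because another thread may
    * have inserted the key between dropping the read lock and taking this one.
    */
   std::unique_lock write_lock(mutex);
   auto it = map.try_emplace(key, key.hash).first;
   return &it->second;
}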
   /** Debug Performance Counters **/

#if TU_AUTOTUNE_DEBUG_PERFCTR
   const fd_perfcntr_group *cp_group;
   const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count;
#endif

 public:
   tu_autotune(struct tu_device *device, VkResult &result);

   ~tu_autotune();

   /* Opaque pointer to an internal structure with RP context that needs to be preserved across begin/end calls. */
   using rp_ctx_t = rp_entry *;

   /* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
    *
    * Note: tu_cmd_buffer is only responsible for the lifetime of this object; all access to the context state is
    * done through tu_autotune.
    */
   struct cmd_buf_ctx {
    private:
      /* A batch of all entries from RPs within this CB. */
      std::shared_ptr<rp_entry_batch> batch;

      /* Creates a new RP entry attached to this CB. */
      rp_entry *
      attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);

      rp_entry *find_rp_entry(const rp_key &key);

      friend struct tu_autotune;

    public:
      cmd_buf_ctx();
      ~cmd_buf_ctx();

      /* Resets the internal context; should be called when tu_cmd_buffer state has been reset. */
      void reset();
   };

   enum class render_mode {
      SYSMEM,
      GMEM,
   };

   render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);

   /* Returns the optimal tile size divisor for the given CB state. */
   uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer);

   /* Disables the preemption latency optimization within the autotuner; this is used when high-priority queues are
    * present, to ensure that the autotuner does not interfere with the high-priority queue's performance.
    *
    * Note: This should be called before any renderpass is started, otherwise it may lead to undefined behavior.
    */
   void disable_preempt_optimize();

   void
   begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count);

   void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);

   void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);

   void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);

   /* The submit-time hook for the autotuner; this may return a CS (can be NULL) which must be appended to the
    * submission for autotuner tracking to function correctly.
    *
    * Note: This must be called from a single-threaded context. There should never be multiple threads calling this
    * function at the same time.
    */
   struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
};

/**
 * From the cmdstream, the captured samples-passed values are recorded
 * at the start and end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI. But pre-emption
 * may force us to revisit that.
 */
struct PACKED tu_renderpass_samples {
   uint64_t samples_start;
   /* hw requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};

/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
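Since the math is done on the CPU (see the comment above), reading a result
back is just a subtraction once the autotune fence shows the GPU is done. A
minimal sketch with the fence check elided; the helper name is hypothetical:

/* Sketch: ZPASS_DONE recorded absolute sample counters at renderpass start
 * and end; the delta is the samples-passed count used to estimate overdraw.
 * Only valid after the submission's autotune fence has been reached.
 */
static inline uint64_t
read_samples_passed(const volatile struct tu_renderpass_samples *s)
{
   return s->samples_end - s->samples_start;
}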
/**
 * Tracks the results from an individual renderpass. Initially created
 * per renderpass, and appended to the tail of at->pending_results. At a later
 * time, when the GPU has finished writing the results, we fill samples_passed.
 */
struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples* samples;

   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
};

VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

struct tu_autotune_results_buffer;

template <chip CHIP>
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */
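For intuition on the ``profiled`` algorithm described in the docs hunk above
(move a probability distribution towards the mode that profiles faster, then
sample it randomly), here is a toy sketch; the update rule and constants are
illustrative, not Turnip's actual implementation:

/* Toy sketch of a two-armed probabilistic chooser: keep a moving average of
 * the measured render time per mode, bias the GMEM probability towards the
 * faster mode, and sample randomly so both modes keep getting re-profiled.
 * The random sampling is what the docs mean by larger FPS variance.
 */
struct mode_chooser {
   float avg_ns[2] = {0.0f, 0.0f};  /* [0]=SYSMEM, [1]=GMEM moving averages */
   float p_gmem = 0.5f;             /* current probability of picking GMEM */

   void record(int mode, float ns) {
      avg_ns[mode] = avg_ns[mode] == 0.0f ? ns : 0.9f * avg_ns[mode] + 0.1f * ns;
      if (avg_ns[0] > 0.0f && avg_ns[1] > 0.0f) {
         /* Nudge towards whichever mode is currently measured as faster. */
         p_gmem += (avg_ns[0] > avg_ns[1]) ? 0.05f : -0.05f;
         p_gmem = p_gmem < 0.05f ? 0.05f : (p_gmem > 0.95f ? 0.95f : p_gmem);
      }
   }

   /* rnd is a uniform value in [0,1), e.g. derived from xorshift128+. */
   bool pick_gmem(float rnd) const { return rnd < p_gmem; }
};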
@@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
      }
   }

   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
   cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd);

   cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
                                                        cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
}

struct apply_store_coords_state {
@@ -14,6 +14,7 @@
#include "vk_render_pass.h"
#include "vk_util.h"

#include "tu_autotune.h"
#include "tu_buffer.h"
#include "tu_clear_blit.h"
#include "tu_cs.h"

@@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
   struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_tiling_config *tiling =
      tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

   /* XFB commands are emitted for BINNING || SYSMEM, which makes it
@@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
      return true;
   }

   return vsc->binning;
   return vsc->binning_possible && vsc->binning_useful;
}

static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
                     struct tu_renderpass_result **autotune_result)
                     tu_autotune::rp_ctx_t *rp_ctx)
{
   if (TU_DEBUG(SYSMEM)) {
      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";

@@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
      return true;
   }

   if (TU_DEBUG(GMEM))
   if (TU_DEBUG(GMEM)) {
      cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)";
      return false;

   bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
                                            cmd, autotune_result);
   if (*autotune_result) {
      list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
   }

   if (use_sysmem) {
   /* This is a case where it's better to avoid GMEM: too many tiles but no HW binning possible. */
   if (!vsc->binning_possible && vsc->binning_useful) {
      cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible";
      return true;
   }

   bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
   if (use_sysmem)
      cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
   }

   return use_sysmem;
}
@@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                        struct tu_renderpass_result *autotune_result)
                        tu_autotune::rp_ctx_t rp_ctx)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

@@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
   }

   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0);

   tu_cs_sanity_check(cs);
}

@@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      struct tu_renderpass_result *autotune_result)
                      tu_autotune::rp_ctx_t rp_ctx)
{
   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);

   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
    */

@@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu_cs_emit(cs, 0); /* value */
   }

   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);

   tu_cs_sanity_check(cs);
}

@@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      struct tu_renderpass_result *autotune_result,
                      tu_autotune::rp_ctx_t rp_ctx,
                      const VkOffset2D *fdm_offsets)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

@@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (use_cb)
      tu_trace_start_render_pass(cmd);

   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
   uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height;
   cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count);

   tu_cs_sanity_check(cs);
}
@@ -3471,13 +3476,18 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                const struct tu_tile_config *tile,
                bool fdm, const VkOffset2D *fdm_offsets)
                bool fdm, const VkOffset2D *fdm_offsets,
                tu_autotune::rp_ctx_t rp_ctx,
                const struct tu_vsc_config *vsc)
{
   uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x;
   tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
   tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);

   trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd);

   cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx);

   /* Primitives that passed all tests are still counted in each
    * tile even with HW binning beforehand. Do not permit it.
    */

@@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
   if (cmd->state.prim_generated_query_running_before_rp)
      tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);

   cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx);

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
@@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                    struct tu_renderpass_result *autotune_result)
                    tu_autotune::rp_ctx_t rp_ctx)
{
   tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);

   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_lrz_tiling_end<CHIP>(cmd, cs);

@@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);

   cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);

   tu_cs_sanity_check(cs);
}

@@ -3796,7 +3808,9 @@ void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
                   uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
                   const struct tu_image_view *fdm,
                   const VkOffset2D *fdm_offsets)
                   const VkOffset2D *fdm_offsets,
                   tu_autotune::rp_ctx_t rp_ctx,
                   const struct tu_vsc_config *vsc)
{
   uint32_t width = tx2 - tx1;
   uint32_t height = ty2 - ty1;

@@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
            continue;

         tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
                               true, fdm_offsets);
                               true, fdm_offsets,
                               rp_ctx, vsc);
      }
   }
}
@@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
                    struct tu_renderpass_result *autotune_result,
                    tu_autotune::rp_ctx_t rp_ctx,
                    const VkOffset2D *fdm_offsets)
{
   const struct tu_tiling_config *tiling = cmd->state.tiling;

@@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
   tu_cs_end(&cmd->tile_store_cs);

   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
   tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);

   /* Note: we reverse the order of walking the pipes and tiles on every
    * other row, to improve texture cache locality compared to raster order.

@@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

         if (merge_tiles) {
            tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
                                     fdm_offsets);
                                     fdm_offsets, rp_ctx, vsc);
            continue;
         }

@@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
               tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);

               tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
                                     fdm_offsets);
                                     fdm_offsets,
                                     rp_ctx, vsc);
            }
            slot_row += tile_row_stride;
         }
      }
   }

   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
   tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);

   tu_trace_end_render_pass<CHIP>(cmd, true);

@@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
                     struct tu_renderpass_result *autotune_result)
                     tu_autotune::rp_ctx_t rp_ctx)
{
   VkResult result = tu_allocate_transient_attachments(cmd, true);

@@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

   tu_trace_start_render_pass(cmd);

   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
   tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);

   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);

@@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);

   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
   tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);

   tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace,
                        cmd->trace_renderpass_start,

@@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
   if (cmd_buffer->state.rp.has_tess)
      tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);

   struct tu_renderpass_result *autotune_result = NULL;
   if (use_sysmem_rendering(cmd_buffer, &autotune_result))
      tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
   tu_autotune::rp_ctx_t rp_ctx = NULL;
   if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
      tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
   else
      tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
      tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);

   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
@@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.clear_values = NULL;
   cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
   cmd_buffer->state.gmem_layout_divisor = 0;
   cmd_buffer->state.renderpass_cb_disabled = false;
   memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));

@@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
   u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
   cmd_buffer->trace_renderpass_start =
      u_trace_begin_iterator(&cmd_buffer->rp_trace);
   list_inithead(&cmd_buffer->renderpass_autotune_results);
   new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();

   if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
      cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);

@@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
   u_trace_fini(&cmd_buffer->trace);
   u_trace_fini(&cmd_buffer->rp_trace);

   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
   cmd_buffer->autotune_ctx.~cmd_buf_ctx();

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      if (cmd_buffer->descriptors[i].push_set.layout)

@@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
   tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
   tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);

   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
   cmd_buffer->autotune_ctx.reset();

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
@@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
   cmd->state.clear_values = suspended->state.suspended_pass.clear_values;
   cmd->state.render_area = suspended->state.suspended_pass.render_area;
   cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
   cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor;
   cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
                                                        cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
   cmd->state.lrz = suspended->state.suspended_pass.lrz;
}

@@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
    * (perf queries), then we can't do this optimization since the
    * start-of-the-CS geometry condition will have been overwritten.
    */
   bool cond_load_allowed = vsc->binning &&
   bool cond_load_allowed = vsc->binning_possible &&
                            cmd->state.pass->has_cond_load_store &&
                            !cmd->state.rp.draw_cs_writes_to_cond_pred;

@@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
      cmd->state.suspended_pass.attachments = cmd->state.attachments;
      cmd->state.suspended_pass.clear_values = cmd->state.clear_values;
      cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
      cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor;
   }

   tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);
@@ -524,11 +524,12 @@ struct tu_cmd_state
   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;
   tu_gmem_layout gmem_layout;
   uint32_t gmem_layout_divisor;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

@@ -543,9 +544,10 @@ struct tu_cmd_state
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;
      uint32_t gmem_layout_divisor;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

@@ -644,8 +646,7 @@ struct tu_cmd_buffer
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace trace, rp_trace;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;
   tu_autotune::cmd_buf_ctx autotune_ctx;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;
@@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = {
      DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false)
      DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false)
      DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
      DRI_CONF_TU_AUTOTUNE_ALGORITHM()
   DRI_CONF_SECTION_END
};

@@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance)
      driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction");
   instance->enable_softfloat32 =
      driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32");
   instance->autotune_algo =
      driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
}

static uint32_t instance_count = 0;

@@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
{
   mtx_destroy(&device->bo_mutex);
   mtx_destroy(&device->pipeline_mutex);
   mtx_destroy(&device->autotune_mutex);
   mtx_destroy(&device->kgsl_profiling_mutex);
   mtx_destroy(&device->event_mutex);
   mtx_destroy(&device->trace_mutex);

@@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   VkResult result;
   struct tu_device *device;
   bool border_color_without_format = false;
   bool autotune_disable_preempt_optimize = false;

   vk_foreach_struct_const (ext, pCreateInfo->pNext) {
      switch (ext->sType) {
@@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,

   mtx_init(&device->bo_mutex, mtx_plain);
   mtx_init(&device->pipeline_mutex, mtx_plain);
   mtx_init(&device->autotune_mutex, mtx_plain);
   mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
   mtx_init(&device->event_mutex, mtx_plain);
   mtx_init(&device->trace_mutex, mtx_plain);

@@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      const VkDeviceQueueCreateInfo *queue_create =
         &pCreateInfo->pQueueCreateInfos[i];
      const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
         vk_find_struct_const(queue_create->pNext,
                              DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
      const VkQueueGlobalPriorityKHR global_priority = priority_info ?
         priority_info->globalPriority :
         (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
                             VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
      uint32_t qfi = queue_create->queueFamilyIndex;
      enum tu_queue_type type = physical_device->queue_families[qfi].type;
      device->queues[qfi] = (struct tu_queue *) vk_alloc(

@@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
      device->queue_count[qfi] = queue_create->queueCount;

      for (unsigned q = 0; q < queue_create->queueCount; q++) {
         result = tu_queue_init(device, &device->queues[qfi][q], type, q,
                                queue_create);
         result = tu_queue_init(device, &device->queues[qfi][q], type,
                                global_priority, q, queue_create);
         if (result != VK_SUCCESS) {
            device->queue_count[qfi] = q;
            goto fail_queues;
         }
      }

      autotune_disable_preempt_optimize |=
         (global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR);
   }

   result = vk_meta_device_init(&device->vk, &device->meta);
@@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                            TU_BO_ALLOC_ALLOW_DUMP |
                            TU_BO_ALLOC_INTERNAL_RESOURCE),
                           "pipeline_suballoc");
   tu_bo_suballocator_init(&device->autotune_suballoc, device,
                           128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
                           "autotune_suballoc");
   if (is_kgsl(physical_device->instance)) {
      tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
                              128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,

@@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   }
   pthread_condattr_destroy(&condattr);

   result = tu_autotune_init(&device->autotune, device);
   if (result != VK_SUCCESS) {
   device->autotune = new tu_autotune(device, result);
   if (result != VK_SUCCESS)
      goto fail_timeline_cond;
   }

   if (autotune_disable_preempt_optimize)
      device->autotune->disable_preempt_optimize();

   device->use_z24uint_s8uint =
      physical_device->info->props.has_z24uint_s8uint &&

@@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
      free(device->dbg_renderpass_stomp_cs);
   }

   tu_autotune_fini(&device->autotune, device);
   delete device->autotune;

   tu_bo_suballocator_finish(&device->pipeline_suballoc);
   tu_bo_suballocator_finish(&device->autotune_suballoc);
   tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
   tu_bo_suballocator_finish(&device->event_suballoc);
   tu_bo_suballocator_finish(&device->vis_stream_suballocator);
@@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device,
      }
   }

   tu_framebuffer_tiling_config(framebuffer, device, pass);
   tu_framebuffer_init_tiling_config(framebuffer, device, pass);

   /* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */
   if (msrtss_attachment_count > 0) {

@@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
         view->image->max_tile_h_constraint_fdm;
   }

   tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass);
   tu_framebuffer_init_tiling_config(framebuffer, cmd_buffer->device, pass);
}

VkResult
@@ -28,6 +28,7 @@
#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"
#include "util/rwlock.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

@@ -233,6 +234,9 @@ struct tu_instance
    * However we don't want native Vulkan apps using this.
    */
   bool enable_softfloat32;

   /* Configuration option to use a specific autotune algorithm by default. */
   const char *autotune_algo;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
@@ -265,7 +269,12 @@ struct tu6_global

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   /* A fence with a monotonically increasing value that is
    * incremented by the GPU on each submission that includes
    * a tu_autotune::submission_entry CS. This is used to track
    * which submissions have been processed by the GPU before
    * processing the autotune packet on the CPU.
    */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
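A sketch of how the CPU side can use such a monotonically increasing fence,
assuming the GPU writes it via the submission_entry CS; the wraparound-safe
comparison is a common idiom and the helper name is illustrative:

/* Sketch: a result tagged with `fence` is safe to read back once the
 * GPU-written counter has reached it. Unsigned subtraction keeps the test
 * correct even after the 32-bit counter wraps around.
 */
static inline bool
autotune_fence_reached(const volatile struct tu6_global *global, uint32_t fence)
{
   return (int32_t)(global->autotune_fence - fence) >= 0;
}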
@@ -355,12 +364,6 @@ struct tu_device
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
    * each submission.
    */

@@ -462,7 +465,7 @@ struct tu_device
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;
   struct tu_autotune *autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

@@ -547,8 +550,11 @@ struct tu_vsc_config {
   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;
   /* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether
    * binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance
    * hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so.
    */
   bool binning_useful;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];

@@ -577,7 +583,8 @@ struct tu_framebuffer

   uint32_t max_tile_w_constraint;
   uint32_t max_tile_h_constraint;
   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
   uint32_t initd_divisor; /* The tile divisors up to this have been initialized, for lazy init. */
   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX];

   uint32_t attachment_count;
   const struct tu_image_view *attachments[0];
@@ -22,6 +22,8 @@ enum tu_gmem_layout
   TU_GMEM_LAYOUT_COUNT,
};

constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* 1x (no divisor), 2 (1/2), 3 (1/3) */

struct tu_subpass_barrier {
   VkPipelineStageFlags2 src_stage_mask;
   VkPipelineStageFlags2 dst_stage_mask;

@@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
   struct tu_device *device = queue->device;
   bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
   struct util_dynarray dump_cmds;
   struct tu_cs *autotune_cs = NULL;

   if (vk_submit->buffer_bind_count ||
       vk_submit->image_bind_count ||
@@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
      }
   }

   if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
      struct tu_cs *autotune_cs = tu_autotune_on_submit(
         device, &device->autotune, cmd_buffers, cmdbuf_count);
   autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
   if (autotune_cs) {
      submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
                         autotune_cs->entry_count);
   }

@@ -605,17 +605,10 @@ VkResult
tu_queue_init(struct tu_device *device,
              struct tu_queue *queue,
              enum tu_queue_type type,
              const VkQueueGlobalPriorityKHR global_priority,
              int idx,
              const VkDeviceQueueCreateInfo *create_info)
{
   const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
      vk_find_struct_const(create_info->pNext,
                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
   const VkQueueGlobalPriorityKHR global_priority = priority_info ?
      priority_info->globalPriority :
      (TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
                          VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);

   const int priority = tu_get_submitqueue_priority(
      device->physical_device, global_priority, type,
      device->vk.enabled_features.globalPriorityQuery);

@@ -43,6 +43,7 @@ VkResult
tu_queue_init(struct tu_device *device,
              struct tu_queue *queue,
              enum tu_queue_type type,
              const VkQueueGlobalPriorityKHR global_priority,
              int idx,
              const VkDeviceQueueCreateInfo *create_info);
@@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc)
   return tiles_per_pipe <= 32;
}

static void
tu_tiling_config_divide_tile(const struct tu_device *dev,
                             const struct tu_render_pass *pass,
                             const struct tu_framebuffer *fb,
                             const struct tu_tiling_config *tiling,
                             struct tu_tiling_config *new_tiling,
                             uint32_t divisor)
{
   assert(divisor > 0);

   *new_tiling = *tiling;
   if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) {
      /* If the divisor is 1, or if the tiling is not possible, or if the
       * tiling is invalid, just return the original tiling. */
      return;
   }

   /* Get the hardware-specified alignment values. */
   const uint32_t tile_align_w = pass->tile_align_w;
   const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;

   /* Divide the current tile dimensions by the divisor. */
   uint32_t new_tile_width = tiling->tile0.width / divisor;
   uint32_t new_tile_height = tiling->tile0.height / divisor;

   /* Clamp to the minimum alignment if necessary, and align down. */
   if (new_tile_width < tile_align_w)
      new_tile_width = tile_align_w;
   else
      new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w);

   if (new_tile_height < tile_align_h)
      new_tile_height = tile_align_h;
   else
      new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h);

   new_tiling->tile0.width = new_tile_width;
   new_tiling->tile0.height = new_tile_height;

   /* Recalculate the tile count from the framebuffer dimensions to ensure
    * full coverage. */
   new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width);
   new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height);
}

static void
tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc,
                                    const struct tu_device *dev,
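To make the rounding behaviour of tu_tiling_config_divide_tile() above
concrete, here is a worked example with assumed numbers (tile alignments are
hardware- and pass-dependent; 64x16 is only an illustration):

/* Worked example (assumed values, not from a real device):
 *   fb = 1920x1080, tile0 = 768x320, tile_align_w = 64, tile_align_h = 16
 * With divisor = 2:
 *   new_tile_width  = 768 / 2 = 384 -> ROUND_DOWN_TO_NPOT(384, 64) = 384
 *   new_tile_height = 320 / 2 = 160 -> ROUND_DOWN_TO_NPOT(160, 16) = 160
 *   tile_count = DIV_ROUND_UP(1920, 384) x DIV_ROUND_UP(1080, 160) = 5 x 7
 * tu_framebuffer_get_tiling_config() below then keeps this configuration
 * only if it stays within its limits (e.g. at most 100 tiles, and the
 * binning constraints hold); otherwise it reverts to the previous level.
 */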
@@ -460,22 +505,18 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc,
static void
tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device)
{
   if (vsc->binning_possible) {
      vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2;
   vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2;

      if (TU_DEBUG(FORCEBIN))
         vsc->binning = true;
      if (TU_DEBUG(NOBIN))
         vsc->binning = false;
   } else {
      vsc->binning = false;
   }
   if (TU_DEBUG(FORCEBIN))
      vsc->binning_useful = true;
   if (TU_DEBUG(NOBIN))
      vsc->binning_useful = false;
}

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass)
tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
                                  const struct tu_device *device,
                                  const struct tu_render_pass *pass)
{
   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
@@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
      tu_tiling_config_update_binning(fdm_offset_vsc, device);
   }
}

   fb->initd_divisor = 1;
}

const struct tu_tiling_config *
tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
                                 const struct tu_device *device,
                                 const struct tu_render_pass *pass,
                                 int gmem_layout,
                                 uint32_t divisor)
{
   assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX);
   assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to
                                              appropriately size the tiles for the framebuffer. */
   struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout];

   if (divisor > fb->initd_divisor) {
      const struct tu_tiling_config *base_tiling =
         tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1);
      tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor);

      struct tu_vsc_config *vsc = &tiling->vsc;
      if (tiling->possible) {
         tu_tiling_config_update_pipe_layout(vsc, device, false);
         tu_tiling_config_update_pipes(vsc, device);
         tu_tiling_config_update_binning(vsc, device);

         struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
         fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 };
      }

      if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */
          (vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */
          (vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */
         ) {
         /* Revert to the previous level's tiling configuration. */
         *tiling = *base_tiling;
      }

      fb->initd_divisor = divisor;
   }

   return tiling;
}

void
@@ -136,9 +136,16 @@ __tu_finishme(const char *file, int line, const char *format, ...)
   } while (0)

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass);
tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
                                  const struct tu_device *device,
                                  const struct tu_render_pass *pass);

const struct tu_tiling_config *
tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
                                 const struct tu_device *device,
                                 const struct tu_render_pass *pass,
                                 int gmem_layout,
                                 uint32_t divisor);

#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
@@ -657,6 +657,10 @@
   DRI_CONF_OPT_B(tu_enable_softfloat32, def, \
                  "Enable softfloat emulation for float32 denormals")

#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \
   DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
                        "Set the preferred autotune algorithm")

/**
 * \brief Honeykrisp specific configuration options
 */
@@ -28,10 +28,18 @@
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

uint64_t
rand_xorshift128plus(uint64_t seed[2]);

void
s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed);

#ifdef __cplusplus
} /* end of extern "C" */
#endif

#endif /* RAND_XOR_H */
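The new extern "C" guards let the C++ autotuner call these C utilities
directly. For reference, a xorshift128+ step is typically implemented as
below (the canonical published variant; Mesa's rand_xor.c may differ in
details, so treat this as a sketch):

/* Reference xorshift128+ step; `seed` must not be all zeros. */
#include <stdint.h>

static uint64_t
xorshift128plus_sketch(uint64_t seed[2])
{
   uint64_t x = seed[0];
   const uint64_t y = seed[1];
   seed[0] = y;
   x ^= x << 23;                            /* shift a */
   seed[1] = x ^ y ^ (x >> 17) ^ (y >> 26); /* shifts b, c */
   return seed[1] + y;
}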
@@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment)
   return ((value) & ~(uint64_t)(alignment - 1));
}

static inline uint64_t
ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
{
   return value - (value % alignment);
}
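A quick example of why the new helper is needed for the tile-divisor math,
where alignments need not be powers of two:

/* The mask-based ROUND_DOWN_TO above only works for power-of-two
 * alignments; the modulo form handles any alignment:
 *
 *   ROUND_DOWN_TO_NPOT(112, 48) == 112 - (112 % 48) == 96
 *   ROUND_DOWN_TO(112, 64)      == 112 & ~63        == 64
 */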
/**
 * Align a value; only works for POT alignments.
 */