Merge branch 'tu-newat' into 'main'

turnip: Autotuner Overhaul

See merge request mesa/mesa!37802
This commit is contained in:
Dhruv Mark Collins 2025-12-20 06:20:41 +05:30
commit adbb7f760f
17 changed files with 2309 additions and 848 deletions

View file

@ -665,3 +665,66 @@ are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebi
Some of these options will behave differently when toggled at runtime, for example:
``nolrz`` will still result in LRZ allocation which would not happen if the option
was set in the environment variable.
Autotune
^^^^^^^^
Turnip supports dynamically selecting between SYSMEM and GMEM rendering with the
autotune system, the behavior of which can be controlled with the following
environment variables:
.. envvar:: TU_AUTOTUNE_ALGO
Selects the algorithm used for autotuning. Supported values are:
``bandwidth``
Estimates the bandwidth usage of rendering in SYSMEM and GMEM modes, and chooses
the one with lower estimated bandwidth.
``profiled``
Dynamically profiles the RP timings in SYSMEM and GMEM modes, and uses those
measurements to move a probability distribution towards the optimal choice
over time (a minimal sketch of this idea is shown after this section). This
algorithm tends to be far more accurate than the bandwidth algorithm at
choosing the optimal rendering mode, but may result in larger FPS variance
because the choice is randomly sampled from a probability distribution. This
is the default algorithm.
``profiled_imm``
Similar to ``profiled``, but only profiles the first few instances of an RP
and then sticks to the chosen mode for subsequent instances. This is meant
for single-frame traces run multiple times in CI, where this algorithm can
immediately choose the optimal rendering mode for each RP.
``prefer_sysmem``
Always chooses SYSMEM rendering. This is useful for games that don't benefit
from GMEM rendering due to their rendering patterns; when the goal is
performance, setting this is preferable to using ``TU_DEBUG=sysmem``.
The algorithm can be set via the driconf option ``tu_autotune_algorithm`` as well.
.. envvar:: TU_AUTOTUNE_FLAGS
Modifies the behavior of the selected algorithm. Supported flags are:
``big_gmem``
Always chooses GMEM rendering if the number of draw calls in the render pass
is greater than a certain threshold. Larger RPs generally benefit more from
GMEM rendering due to less overhead from tiling.
``small_sysmem``
Always chooses SYSMEM rendering if the number of draw calls in the render pass
is lower than a certain threshold. The benefits of GMEM rendering are less
pronounced in these smaller RPs and SYSMEM rendering tends to win more often.
``preempt_optimize``
Tries to keep the non-preemptible time in the render pass below a certain
threshold. This is useful for systems with GPU-based compositors, where long
non-preemptible times can lead to missed frame deadlines and noticeable
stuttering. This flag trades render pass performance for overall system
responsiveness; it should not be used unless the rest of the system is
affected by preemption delays.
Multiple flags can be combined by separating them with commas, e.g.
``TU_AUTOTUNE_FLAGS=big_gmem,small_sysmem``.
If no flags are specified, the default behavior is used.
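As an illustration of the ideas above, the following is a minimal, self-contained
C++ sketch of a probabilistic SYSMEM/GMEM selector combined with draw-count
thresholds, in the spirit of ``profiled`` plus ``big_gmem``/``small_sysmem``. It is
not the actual Turnip implementation; the names (``rp_profile``, ``choose_gmem``,
``record_timing``) and all constants are hypothetical.

#include <algorithm>
#include <cstdint>
#include <random>

/* Per-renderpass history: the probability of choosing GMEM plus the last timing
 * observed in each mode (0 means "not measured yet"). */
struct rp_profile {
   double p_gmem = 0.5;
   uint64_t gmem_ns = 0;
   uint64_t sysmem_ns = 0;
};

/* Force a mode for very large or very small render passes (big_gmem/small_sysmem),
 * otherwise sample the probability distribution. */
static bool choose_gmem(const rp_profile &h, uint32_t draw_count, std::mt19937_64 &rng)
{
   constexpr uint32_t BIG_GMEM_DRAWS = 300;    /* hypothetical thresholds */
   constexpr uint32_t SMALL_SYSMEM_DRAWS = 10;
   if (draw_count >= BIG_GMEM_DRAWS)
      return true;
   if (draw_count <= SMALL_SYSMEM_DRAWS)
      return false;
   return std::uniform_real_distribution<double>(0.0, 1.0)(rng) < h.p_gmem;
}

/* After the GPU reports how long the pass took in the mode that was used, nudge
 * the distribution towards whichever mode is currently faster. */
static void record_timing(rp_profile &h, bool was_gmem, uint64_t elapsed_ns)
{
   if (was_gmem)
      h.gmem_ns = elapsed_ns;
   else
      h.sysmem_ns = elapsed_ns;
   if (!h.gmem_ns || !h.sysmem_ns)
      return; /* need at least one sample of each mode first */
   const double step = (h.gmem_ns < h.sysmem_ns) ? 0.1 : -0.1;
   h.p_gmem = std::clamp(h.p_gmem + step, 0.05, 0.95); /* keep exploring both modes */
}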

View file

@ -4,7 +4,7 @@ DisableFormat: false
AlwaysBreakAfterReturnType: TopLevel
BinPackParameters: false
ColumnLimit: 78
ColumnLimit: 120
Cpp11BracedListStyle: false
IncludeBlocks: Regroup

File diff suppressed because it is too large

View file

@ -8,150 +8,265 @@
#include "tu_common.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include <atomic>
#include <deque>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>
#include "tu_cs.h"
#include "tu_suballoc.h"
struct tu_renderpass_history;
/* Compile-time toggle for debugging preemption latency with CP preemption performance counters. */
#define TU_AUTOTUNE_DEBUG_PERFCTR 0
/**
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
* data about a given render target.
*
* In deciding which path to take there are tradeoffs, including some that
* are not reasonably estimateable without having some additional information:
*
* (1) If you know you are touching every pixel (ie. there is a clear),
* then the GMEM path will at least not cost more memory bandwidth than
* sysmem[1]
*
* (2) If there is no clear, GMEM could potentially cost *more* bandwidth
* if there is sysmem->GMEM restore pass.
*
* (3) If you see a high draw count, that is an indication that there will be
* enough pixels accessed multiple times to benefit from the reduced
* memory bandwidth that GMEM brings
*
* (4) But high draw count where there is not much overdraw can actually be
* faster in bypass mode if it is pushing a lot of state change, due to
* not having to go thru the state changes per-tile[1]
*
* The approach taken is to measure the samples-passed for the batch to estimate
* the amount of overdraw to detect cases where the number of pixels touched is
* low.
*
* [1] ignoring early-tile-exit optimizations, but any draw that touches all/
* most of the tiles late in the tile-pass can defeat that
/* Autotune allows us to tune rendering parameters (such as GMEM vs SYSMEM, tile size divisor, etc.) based on
* dynamic analysis of the rendering workload via on-GPU profiling. This lets us make much better decisions than static
* analysis, since we can adapt to the actual workload rather than relying on heuristics.
*/
struct tu_autotune {
/* We may have to disable autotuner if there are too many
* renderpasses in-flight.
*/
bool enabled;
private:
bool enabled = true;
struct tu_device *device;
/**
* Cache to map renderpass key to historical information about
* rendering to that particular render target.
/** Configuration **/
enum class algorithm : uint8_t;
enum class mod_flag : uint8_t;
enum class metric_flag : uint8_t;
/* Container for all autotune configuration options. */
struct PACKED config_t;
union PACKED packed_config_t;
/* Allows for thread-safe access to the configurations. */
struct atomic_config_t {
private:
std::atomic<uint32_t> config_bits = 0;
public:
atomic_config_t(config_t initial_config);
config_t load() const;
bool compare_and_store(config_t updated, config_t expected);
} active_config;
config_t get_env_config();
/** Global Fence and Internal CS Management **/
/* BO suballocator for reducing BO management overhead for small GMEM/SYSMEM autotune result buffers.
* Synchronized by suballoc_mutex.
*/
struct hash_table *ht;
struct u_rwlock ht_lock;
struct tu_suballocator suballoc;
std::mutex suballoc_mutex;
/**
* List of per-renderpass results that we are waiting for the GPU
* to finish with before reading back the results.
/* The next value to assign to tu6_global::autotune_fence; this is incremented during on_submit. */
uint32_t next_fence = 1;
/* A wrapper around a CS which sets the global autotune fence to a given fence value; this allows the lifetime of
* the CS to be managed ergonomically, including recycling it once the fence value has been reached.
*/
struct list_head pending_results;
/**
* List of per-submission data that we may want to free after we
* processed submission results.
* This could happend after command buffers which were in the submission
* are destroyed.
*/
struct list_head pending_submission_data;
/**
* List of per-submission data that has been finished and can be reused.
*/
struct list_head submission_data_pool;
uint32_t fence_counter;
uint32_t idx_counter;
};
/**
* From the cmdstream, the captured samples-passed values are recorded
* at the start and end of the batch.
*
* Note that we do the math on the CPU to avoid a WFI. But pre-emption
* may force us to revisit that.
*/
struct PACKED tu_renderpass_samples {
uint64_t samples_start;
/* hw requires the sample start/stop locations to be 128b aligned. */
uint64_t __pad0;
uint64_t samples_end;
uint64_t __pad1;
};
/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
struct tu_suballoc_bo bo;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
struct submission_entry {
private:
uint32_t fence;
uint64_t samples_passed;
struct tu_cs fence_cs;
public:
explicit submission_entry(tu_device *device);
~submission_entry();
/* Disable move/copy, since this holds stable pointers to the fence_cs. */
submission_entry(const submission_entry &) = delete;
submission_entry &operator=(const submission_entry &) = delete;
submission_entry(submission_entry &&) = delete;
submission_entry &operator=(submission_entry &&) = delete;
/* The current state of the submission entry; used to track whether the CS is available for reuse, pending
* GPU completion, or currently being processed.
*/
bool is_active() const;
/* If the CS is free, returns the CS which will write out the specified fence value. Otherwise, returns nullptr. */
struct tu_cs *try_get_cs(uint32_t new_fence);
};
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
bool tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
/**
* A magic 8-ball that tells the gmem code whether we should do bypass mode
* for moar fps.
/* Unified pool for submission CSes.
* Note: This is a deque rather than a vector due to the lack of move semantics in the submission_entry.
*/
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
struct tu_autotune *at,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
std::deque<submission_entry> submission_entries;
struct tu_autotune_results_buffer;
/* Returns a CS which will write out the specified fence value to the global BO's autotune fence. */
struct tu_cs *get_cs_for_fence(uint32_t fence);
template <chip CHIP>
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_renderpass_result *autotune_result);
/** RP Entry Management **/
template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_renderpass_result *autotune_result);
struct rp_gpu_data;
struct tile_gpu_data;
struct rp_entry;
/* A wrapper over all entries associated with a single command buffer. */
struct rp_entry_batch {
bool active; /* Whether the entry is ready to be processed, i.e. it has been submitted to the GPU queue and has a
valid fence. */
uint32_t fence; /* The fence value which is used to signal the completion of the CB submission. This is used to
determine when the entries can be processed. */
std::vector<std::unique_ptr<rp_entry>> entries;
rp_entry_batch();
/* Disable the copy/move to avoid performance hazards. */
rp_entry_batch(const rp_entry_batch &) = delete;
rp_entry_batch &operator=(const rp_entry_batch &) = delete;
rp_entry_batch(rp_entry_batch &&) = delete;
rp_entry_batch &operator=(rp_entry_batch &&) = delete;
void assign_fence(uint32_t new_fence);
};
/* A deque of entry batches that are strongly ordered by the fence value that was written by the GPU, for efficient
* iteration and to ensure that we process the entries in the same order they were submitted.
*/
std::deque<std::shared_ptr<rp_entry_batch>> active_batches;
/* Handles processing of entry batches that are pending to be processed.
*
* Note: This must be called regularly to process the entries that have been written by the GPU. We currently do this
* in the on_submit() method, which is called on every submit of a command buffer.
*/
void process_entries();
/** Renderpass State Tracking **/
struct rp_history;
struct rp_history_handle;
/* A strongly typed key which generates a hash to uniquely identify a renderpass instance. This hash is expected to
* be stable across runs, so it can be used to identify the same renderpass instance consistently.
*
* Note: We could potentially include the vector of data we extract from the parameters to generate the hash in
* rp_key as well, which would give true value-based equality rather than just hash-based equality; this has a
* cost, but it avoids issues caused by hash collisions.
*/
struct rp_key {
uint64_t hash;
rp_key(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd);
/* Further salt the hash to distinguish between multiple instances of the same RP within a single command buffer. */
rp_key(const rp_key &key, uint32_t duplicates);
/* Equality operator, used in unordered_map. */
constexpr bool operator==(const rp_key &other) const noexcept
{
return hash == other.hash;
}
};
/* A thin wrapper to satisfy C++'s Hash named requirement for rp_key.
*
* Note: This should *NEVER* be used to calculate the hash itself, as that would lead to the hash being calculated
* multiple times rather than being calculated once and reused across multiple successive lookups, such as
* find_or_create_rp_history() followed by providing the hash to the rp_history constructor.
*/
struct rp_hash {
constexpr size_t operator()(const rp_key &key) const noexcept
{
/* Note: This will throw away the upper 32-bits on 32-bit architectures. */
return static_cast<size_t>(key.hash);
}
};
/* A map between the hash of an RP and the historical state of the RP. Synchronized by rp_mutex. */
using rp_histories_t = std::unordered_map<rp_key, rp_history, rp_hash>;
rp_histories_t rp_histories;
std::shared_mutex rp_mutex;
uint64_t last_reap_ts = 0;
/* Note: These lock rp_mutex internally; there is no need to lock it at the call site. */
rp_history_handle find_rp_history(const rp_key &key);
rp_history_handle find_or_create_rp_history(const rp_key &key);
void reap_old_rp_histories();
/** Debug Performance Counters **/
#if TU_AUTOTUNE_DEBUG_PERFCTR
const fd_perfcntr_group *cp_group;
const fd_perfcntr_countable *preemption_reaction_delay, *num_preemptions, *always_count;
#endif
public:
tu_autotune(struct tu_device *device, VkResult &result);
~tu_autotune();
/* Opaque pointer to internal structure with RP context that needs to be preserved across begin/end calls. */
using rp_ctx_t = rp_entry *;
/* An internal structure that needs to be held by tu_cmd_buffer to track the state of the autotuner for a given CB.
*
* Note: tu_cmd_buffer is only responsible for the lifetime of this object; all access to the context state is
* done through tu_autotune.
*/
struct cmd_buf_ctx {
private:
/* A batch of all entries from RPs within this CB. */
std::shared_ptr<rp_entry_batch> batch;
/* Creates a new RP entry attached to this CB. */
rp_entry *
attach_rp_entry(struct tu_device *device, rp_history_handle &&history, config_t config, uint32_t draw_count);
rp_entry *find_rp_entry(const rp_key &key);
friend struct tu_autotune;
public:
cmd_buf_ctx();
~cmd_buf_ctx();
/* Resets the internal context, should be called when tu_cmd_buffer state has been reset. */
void reset();
};
enum class render_mode {
SYSMEM,
GMEM,
};
render_mode get_optimal_mode(struct tu_cmd_buffer *cmd_buffer, rp_ctx_t *rp_ctx);
/* Returns the optimal tile size divisor for the given CB state. */
uint32_t get_tile_size_divisor(struct tu_cmd_buffer *cmd_buffer);
/* Disables the preemption latency optimization within the autotuner. This is used when high-priority queues are
* present, to ensure that the autotuner does not interfere with the high-priority queue's performance.
*
* Note: This should be called before any renderpass is started; otherwise it may lead to undefined behavior.
*/
void disable_preempt_optimize();
void
begin_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, bool sysmem, uint32_t tile_count);
void end_renderpass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx);
void begin_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
void end_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, rp_ctx_t rp_ctx, uint32_t tile_idx);
/* The submit-time hook for the autotuner. This may return a CS (can be NULL) which must be added to the
* submission for autotune tracking to function correctly.
*
* Note: This must be called from a single-threaded context. There should never be multiple threads calling this
* function at the same time.
*/
struct tu_cs *on_submit(struct tu_cmd_buffer **cmd_buffers, uint32_t cmd_buffer_count);
};
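/* A minimal sketch of the bit-packing pattern behind atomic_config_t above: a config struct that fits in 32 bits
 * is stored in a single std::atomic<uint32_t>, so readers never take a lock and writers use compare-and-swap. The
 * field layout shown here is hypothetical; the real config_t is defined elsewhere and differs. Requires <atomic>,
 * <cstdint> and <cstring>.
 */
struct example_config {
   uint8_t algorithm;  /* which autotune algorithm is active */
   uint8_t mod_flags;  /* behaviour-modifying flags */
   uint16_t reserved;
};
static_assert(sizeof(example_config) == sizeof(uint32_t), "must fit in one atomic word");

struct example_atomic_config {
   std::atomic<uint32_t> bits{0};

   example_config load() const
   {
      uint32_t v = bits.load(std::memory_order_relaxed);
      example_config c;
      std::memcpy(&c, &v, sizeof(c));
      return c;
   }

   /* Returns true if the stored value still equalled `expected` and was replaced by `updated`. */
   bool compare_and_store(example_config updated, example_config expected)
   {
      uint32_t u, e;
      std::memcpy(&u, &updated, sizeof(u));
      std::memcpy(&e, &expected, sizeof(e));
      return bits.compare_exchange_strong(e, u, std::memory_order_relaxed);
   }
};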
#endif /* TU_AUTOTUNE_H */
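To make the fence-based CS lifetime management declared above more concrete, here is a minimal sketch of how a
pool of per-submission entries can be recycled once the GPU has written a monotonically increasing fence value.
This is illustrative only, not the actual driver code; ``example_entry``, ``example_pool`` and their members are
hypothetical.

#include <cstdint>
#include <deque>

struct example_entry {
   bool active = false; /* submitted and waiting for the GPU */
   uint32_t fence = 0;  /* value the GPU writes when this submission completes */
};

struct example_pool {
   std::deque<example_entry> entries;
   uint32_t next_fence = 1;

   /* Called at submit time: reuse a finished entry if possible, otherwise grow the pool. */
   example_entry &acquire()
   {
      for (example_entry &e : entries) {
         if (!e.active) {
            e.active = true;
            e.fence = next_fence++;
            return e;
         }
      }
      entries.push_back({true, next_fence++});
      return entries.back();
   }

   /* Called after reading back the global fence written by the GPU: every active entry whose fence value has been
    * reached is finished, so its results can be consumed and the entry returned to the pool. */
   void process(uint32_t completed_fence)
   {
      for (example_entry &e : entries) {
         if (e.active && e.fence <= completed_fence) {
            /* ...read back and consume this entry's results here... */
            e.active = false;
         }
      }
   }
};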

View file

@ -5466,7 +5466,10 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
}
}
cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
cmd->state.gmem_layout_divisor = cmd->device->autotune->get_tile_size_divisor(cmd);
cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
}
struct apply_store_coords_state {

View file

@ -14,6 +14,7 @@
#include "vk_render_pass.h"
#include "vk_util.h"
#include "tu_autotune.h"
#include "tu_buffer.h"
#include "tu_clear_blit.h"
#include "tu_cs.h"
@ -1262,8 +1263,9 @@ tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling =
tu_framebuffer_get_tiling_config(fb, cmd->device, cmd->state.pass, cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
/* XFB commands are emitted for BINNING || SYSMEM, which makes it
@ -1288,12 +1290,12 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
return true;
}
return vsc->binning;
return vsc->binning_possible && vsc->binning_useful;
}
static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result **autotune_result)
tu_autotune::rp_ctx_t *rp_ctx)
{
if (TU_DEBUG(SYSMEM)) {
cmd->state.rp.gmem_disable_reason = "TU_DEBUG(SYSMEM)";
@ -1343,18 +1345,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return true;
}
if (TU_DEBUG(GMEM))
if (TU_DEBUG(GMEM)) {
cmd->state.rp.gmem_disable_reason = "TU_DEBUG(GMEM)";
return false;
bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
cmd, autotune_result);
if (*autotune_result) {
list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
}
if (use_sysmem) {
/* This is a case where it's better to avoid GMEM: too many tiles but no HW binning possible. */
if (!vsc->binning_possible && vsc->binning_useful) {
cmd->state.rp.gmem_disable_reason = "Too many tiles and HW binning is not possible";
return true;
}
bool use_sysmem = cmd->device->autotune->get_optimal_mode(cmd, rp_ctx) == tu_autotune::render_mode::SYSMEM;
if (use_sysmem)
cmd->state.rp.gmem_disable_reason = "Autotune selected sysmem";
}
return use_sysmem;
}
@ -3035,7 +3039,7 @@ tu7_emit_concurrent_binning_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
@ -3089,7 +3093,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
}
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, true, 0);
tu_cs_sanity_check(cs);
}
@ -3097,10 +3101,8 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
/* Do any resolves of the last subpass. These are handled in the
* tile_store_cs in the gmem path.
*/
@ -3127,6 +3129,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit(cs, 0); /* value */
}
cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
tu_cs_sanity_check(cs);
}
@ -3275,7 +3279,7 @@ tu7_emit_concurrent_binning_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result,
tu_autotune::rp_ctx_t rp_ctx,
const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
@ -3462,7 +3466,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
if (use_cb)
tu_trace_start_render_pass(cmd);
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
uint32_t tile_count = vsc->tile_count.width * vsc->tile_count.height;
cmd->device->autotune->begin_renderpass(cmd, cs, rp_ctx, false, tile_count);
tu_cs_sanity_check(cs);
}
@ -3471,13 +3476,18 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_tile_config *tile,
bool fdm, const VkOffset2D *fdm_offsets)
bool fdm, const VkOffset2D *fdm_offsets,
tu_autotune::rp_ctx_t rp_ctx,
const struct tu_vsc_config *vsc)
{
uint32_t tile_idx = (tile->pos.y * vsc->tile_count.width) + tile->pos.x;
tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);
trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs, cmd);
cmd->device->autotune->begin_tile(cmd, cs, rp_ctx, tile_idx);
/* Primitives that passed all tests are still counted in each
* tile even with HW binning beforehand. Do not permit it.
*/
@ -3489,6 +3499,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
if (cmd->state.prim_generated_query_running_before_rp)
tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
cmd->device->autotune->end_tile(cmd, cs, rp_ctx, tile_idx);
if (use_hw_binning(cmd)) {
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
@ -3528,10 +3540,8 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
tu_lrz_tiling_end<CHIP>(cmd, cs);
@ -3560,6 +3570,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
cmd->device->autotune->end_renderpass(cmd, cs, rp_ctx);
tu_cs_sanity_check(cs);
}
@ -3796,7 +3808,9 @@ void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
const struct tu_image_view *fdm,
const VkOffset2D *fdm_offsets)
const VkOffset2D *fdm_offsets,
tu_autotune::rp_ctx_t rp_ctx,
const struct tu_vsc_config *vsc)
{
uint32_t width = tx2 - tx1;
uint32_t height = ty2 - ty1;
@ -3859,7 +3873,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
continue;
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
true, fdm_offsets);
true, fdm_offsets,
rp_ctx, vsc);
}
}
}
@ -3892,7 +3907,7 @@ tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem)
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result,
tu_autotune::rp_ctx_t rp_ctx,
const VkOffset2D *fdm_offsets)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
@ -3926,7 +3941,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu6_emit_tile_store_cs<CHIP>(cmd, &cmd->tile_store_cs);
tu_cs_end(&cmd->tile_store_cs);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx, fdm_offsets);
/* Note: we reverse the order of walking the pipes and tiles on every
* other row, to improve texture cache locality compared to raster order.
@ -3947,7 +3962,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
if (merge_tiles) {
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
fdm_offsets);
fdm_offsets, rp_ctx, vsc);
continue;
}
@ -3971,14 +3986,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
fdm_offsets);
fdm_offsets,
rp_ctx, vsc);
}
slot_row += tile_row_stride;
}
}
}
tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_tile_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
tu_trace_end_render_pass<CHIP>(cmd, true);
@ -3998,7 +4014,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
template <chip CHIP>
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result)
tu_autotune::rp_ctx_t rp_ctx)
{
VkResult result = tu_allocate_transient_attachments(cmd, true);
@ -4009,7 +4025,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
tu_trace_start_render_pass(cmd);
tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, rp_ctx);
trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs, cmd);
@ -4017,7 +4033,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, rp_ctx);
tu_clone_trace_range(cmd, &cmd->cs, &cmd->trace,
cmd->trace_renderpass_start,
@ -4034,11 +4050,11 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
tu_autotune::rp_ctx_t rp_ctx = NULL;
if (use_sysmem_rendering(cmd_buffer, &rp_ctx))
tu_cmd_render_sysmem<CHIP>(cmd_buffer, rp_ctx);
else
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
tu_cmd_render_tiles<CHIP>(cmd_buffer, rp_ctx, fdm_offsets);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4063,6 +4079,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->state.attachments = NULL;
cmd_buffer->state.clear_values = NULL;
cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
cmd_buffer->state.gmem_layout_divisor = 0;
cmd_buffer->state.renderpass_cb_disabled = false;
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
@ -4111,7 +4128,7 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
u_trace_init(&cmd_buffer->rp_trace, &device->trace_context);
cmd_buffer->trace_renderpass_start =
u_trace_begin_iterator(&cmd_buffer->rp_trace);
list_inithead(&cmd_buffer->renderpass_autotune_results);
new (&cmd_buffer->autotune_ctx) tu_autotune::cmd_buf_ctx();
if (TU_DEBUG_START(CHECK_CMD_BUFFER_STATUS)) {
cmd_buffer->status_bo = tu_cmd_buffer_setup_status_tracking(device);
@ -4160,7 +4177,7 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
u_trace_fini(&cmd_buffer->trace);
u_trace_fini(&cmd_buffer->rp_trace);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
cmd_buffer->autotune_ctx.~cmd_buf_ctx();
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
if (cmd_buffer->descriptors[i].push_set.layout)
@ -4238,7 +4255,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
cmd_buffer->autotune_ctx.reset();
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
@ -6100,7 +6117,9 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
cmd->state.clear_values = suspended->state.suspended_pass.clear_values;
cmd->state.render_area = suspended->state.suspended_pass.render_area;
cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
cmd->state.gmem_layout_divisor = suspended->state.suspended_pass.gmem_layout_divisor;
cmd->state.tiling = tu_framebuffer_get_tiling_config(cmd->state.framebuffer, cmd->device, cmd->state.pass,
cmd->state.gmem_layout, cmd->state.gmem_layout_divisor);
cmd->state.lrz = suspended->state.suspended_pass.lrz;
}
@ -6483,7 +6502,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
* (perf queries), then we can't do this optimization since the
* start-of-the-CS geometry condition will have been overwritten.
*/
bool cond_load_allowed = vsc->binning &&
bool cond_load_allowed = vsc->binning_possible &&
cmd->state.pass->has_cond_load_store &&
!cmd->state.rp.draw_cs_writes_to_cond_pred;
@ -7051,6 +7070,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
cmd->state.suspended_pass.attachments = cmd->state.attachments;
cmd->state.suspended_pass.clear_values = cmd->state.clear_values;
cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
cmd->state.suspended_pass.gmem_layout_divisor = cmd->state.gmem_layout_divisor;
}
tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);

View file

@ -524,11 +524,12 @@ struct tu_cmd_state
/* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
* might get used by tu_store_gmem_attachment().
*/
enum tu_gmem_layout gmem_layout;
tu_gmem_layout gmem_layout;
uint32_t gmem_layout_divisor;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
struct tu_framebuffer *framebuffer;
const struct tu_tiling_config *tiling;
VkRect2D render_area;
@ -543,9 +544,10 @@ struct tu_cmd_state
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
struct tu_framebuffer *framebuffer;
VkRect2D render_area;
enum tu_gmem_layout gmem_layout;
uint32_t gmem_layout_divisor;
const struct tu_image_view **attachments;
VkClearValue *clear_values;
@ -644,8 +646,7 @@ struct tu_cmd_buffer
struct u_trace_iterator trace_renderpass_start;
struct u_trace trace, rp_trace;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
tu_autotune::cmd_buf_ctx autotune_ctx;
void *patchpoints_ctx;
struct util_dynarray fdm_bin_patchpoints;

View file

@ -1795,6 +1795,7 @@ static const driOptionDescription tu_dri_options[] = {
DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false)
DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false)
DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
DRI_CONF_TU_AUTOTUNE_ALGORITHM()
DRI_CONF_SECTION_END
};
@ -1825,6 +1826,8 @@ tu_init_dri_options(struct tu_instance *instance)
driQueryOptionb(&instance->dri_options, "tu_ignore_frag_depth_direction");
instance->enable_softfloat32 =
driQueryOptionb(&instance->dri_options, "tu_enable_softfloat32");
instance->autotune_algo =
driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
}
static uint32_t instance_count = 0;
@ -2633,7 +2636,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
{
mtx_destroy(&device->bo_mutex);
mtx_destroy(&device->pipeline_mutex);
mtx_destroy(&device->autotune_mutex);
mtx_destroy(&device->kgsl_profiling_mutex);
mtx_destroy(&device->event_mutex);
mtx_destroy(&device->trace_mutex);
@ -2667,6 +2669,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
VkResult result;
struct tu_device *device;
bool border_color_without_format = false;
bool autotune_disable_preempt_optimize = false;
vk_foreach_struct_const (ext, pCreateInfo->pNext) {
switch (ext->sType) {
@ -2743,7 +2746,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
mtx_init(&device->bo_mutex, mtx_plain);
mtx_init(&device->pipeline_mutex, mtx_plain);
mtx_init(&device->autotune_mutex, mtx_plain);
mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
mtx_init(&device->event_mutex, mtx_plain);
mtx_init(&device->trace_mutex, mtx_plain);
@ -2789,6 +2791,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queue_create =
&pCreateInfo->pQueueCreateInfos[i];
const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
vk_find_struct_const(queue_create->pNext,
DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
const VkQueueGlobalPriorityKHR global_priority = priority_info ?
priority_info->globalPriority :
(TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
uint32_t qfi = queue_create->queueFamilyIndex;
enum tu_queue_type type = physical_device->queue_families[qfi].type;
device->queues[qfi] = (struct tu_queue *) vk_alloc(
@ -2808,13 +2817,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->queue_count[qfi] = queue_create->queueCount;
for (unsigned q = 0; q < queue_create->queueCount; q++) {
result = tu_queue_init(device, &device->queues[qfi][q], type, q,
queue_create);
result = tu_queue_init(device, &device->queues[qfi][q], type,
global_priority, q, queue_create);
if (result != VK_SUCCESS) {
device->queue_count[qfi] = q;
goto fail_queues;
}
}
autotune_disable_preempt_optimize |=
(global_priority == VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR);
}
result = vk_meta_device_init(&device->vk, &device->meta);
@ -2868,9 +2880,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
TU_BO_ALLOC_ALLOW_DUMP |
TU_BO_ALLOC_INTERNAL_RESOURCE),
"pipeline_suballoc");
tu_bo_suballocator_init(&device->autotune_suballoc, device,
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
"autotune_suballoc");
if (is_kgsl(physical_device->instance)) {
tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE,
@ -3019,10 +3028,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
}
pthread_condattr_destroy(&condattr);
result = tu_autotune_init(&device->autotune, device);
if (result != VK_SUCCESS) {
device->autotune = new tu_autotune(device, result);
if (result != VK_SUCCESS)
goto fail_timeline_cond;
}
if (autotune_disable_preempt_optimize)
device->autotune->disable_preempt_optimize();
device->use_z24uint_s8uint =
physical_device->info->props.has_z24uint_s8uint &&
@ -3180,10 +3191,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
free(device->dbg_renderpass_stomp_cs);
}
tu_autotune_fini(&device->autotune, device);
delete device->autotune;
tu_bo_suballocator_finish(&device->pipeline_suballoc);
tu_bo_suballocator_finish(&device->autotune_suballoc);
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
tu_bo_suballocator_finish(&device->event_suballoc);
tu_bo_suballocator_finish(&device->vis_stream_suballocator);
@ -4009,7 +4019,7 @@ tu_CreateFramebuffer(VkDevice _device,
}
}
tu_framebuffer_tiling_config(framebuffer, device, pass);
tu_framebuffer_init_tiling_config(framebuffer, device, pass);
/* For MSRTSS, allocate extra images that are tied to the VkFramebuffer */
if (msrtss_attachment_count > 0) {
@ -4071,7 +4081,7 @@ tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
view->image->max_tile_h_constraint_fdm;
}
tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass);
tu_framebuffer_init_tiling_config(framebuffer, cmd_buffer->device, pass);
}
VkResult

View file

@ -28,6 +28,7 @@
#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"
#include "util/rwlock.h"
/* queue types */
#define TU_QUEUE_GENERAL 0
@ -233,6 +234,9 @@ struct tu_instance
* However we don't want native Vulkan apps using this.
*/
bool enable_softfloat32;
/* Configuration option to use a specific autotune algorithm by default. */
const char *autotune_algo;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
VK_OBJECT_TYPE_INSTANCE)
@ -265,7 +269,12 @@ struct tu6_global
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
/* A fence with a monotonically increasing value that is
* incremented by the GPU on each submission that includes
* a tu_autotune::submission_entry CS. This is used to track
* which submissions the GPU has finished before their
* autotune results are processed on the CPU.
*/
volatile uint32_t autotune_fence;
/* For recycling command buffers for dynamic suspend/resume commands */
@ -355,12 +364,6 @@ struct tu_device
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* Device-global BO suballocator for reducing BO management for small
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
*/
struct tu_suballocator autotune_suballoc;
mtx_t autotune_mutex;
/* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
* each submission.
*/
@ -462,7 +465,7 @@ struct tu_device
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct tu_autotune autotune;
struct tu_autotune *autotune;
struct breadcrumbs_context *breadcrumbs_ctx;
@ -547,8 +550,11 @@ struct tu_vsc_config {
/* Whether binning could be used for gmem rendering using this framebuffer. */
bool binning_possible;
/* Whether binning should be used for gmem rendering using this framebuffer. */
bool binning;
/* Whether binning is useful for GMEM rendering performance using this framebuffer. This is independent of whether
* binning is possible, and is determined by the tile count. Not binning when it's useful would be a performance
* hazard, and GMEM rendering should be avoided in the case where it's useful to bin but not possible to do so.
*/
bool binning_useful;
/* pipe register values */
uint32_t pipe_config[MAX_VSC_PIPES];
@ -577,7 +583,8 @@ struct tu_framebuffer
uint32_t max_tile_w_constraint;
uint32_t max_tile_h_constraint;
struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
uint32_t initd_divisor; /* Tile divisors up to and including this value have been initialized (lazy init). */
struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT * TU_GMEM_LAYOUT_DIVISOR_MAX];
uint32_t attachment_count;
const struct tu_image_view *attachments[0];

View file

@ -22,6 +22,8 @@ enum tu_gmem_layout
TU_GMEM_LAYOUT_COUNT,
};
constexpr uint32_t TU_GMEM_LAYOUT_DIVISOR_MAX = 6; /* 1x (no divisor), 2 (1/2), 3 (1/3) */
struct tu_subpass_barrier {
VkPipelineStageFlags2 src_stage_mask;
VkPipelineStageFlags2 dst_stage_mask;

View file

@ -418,6 +418,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
struct tu_device *device = queue->device;
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
struct util_dynarray dump_cmds;
struct tu_cs *autotune_cs = NULL;
if (vk_submit->buffer_bind_count ||
vk_submit->image_bind_count ||
@ -495,9 +496,8 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
}
}
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
struct tu_cs *autotune_cs = tu_autotune_on_submit(
device, &device->autotune, cmd_buffers, cmdbuf_count);
autotune_cs = device->autotune->on_submit(cmd_buffers, cmdbuf_count);
if (autotune_cs) {
submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
autotune_cs->entry_count);
}
@ -605,17 +605,10 @@ VkResult
tu_queue_init(struct tu_device *device,
struct tu_queue *queue,
enum tu_queue_type type,
const VkQueueGlobalPriorityKHR global_priority,
int idx,
const VkDeviceQueueCreateInfo *create_info)
{
const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
vk_find_struct_const(create_info->pNext,
DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
const VkQueueGlobalPriorityKHR global_priority = priority_info ?
priority_info->globalPriority :
(TU_DEBUG(HIPRIO) ? VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR :
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR);
const int priority = tu_get_submitqueue_priority(
device->physical_device, global_priority, type,
device->vk.enabled_features.globalPriorityQuery);

View file

@ -43,6 +43,7 @@ VkResult
tu_queue_init(struct tu_device *device,
struct tu_queue *queue,
enum tu_queue_type type,
const VkQueueGlobalPriorityKHR global_priority,
int idx,
const VkDeviceQueueCreateInfo *create_info);

View file

@ -365,6 +365,51 @@ is_hw_binning_possible(const struct tu_vsc_config *vsc)
return tiles_per_pipe <= 32;
}
static void
tu_tiling_config_divide_tile(const struct tu_device *dev,
const struct tu_render_pass *pass,
const struct tu_framebuffer *fb,
const struct tu_tiling_config *tiling,
struct tu_tiling_config *new_tiling,
uint32_t divisor)
{
assert(divisor > 0);
*new_tiling = *tiling;
if (divisor == 1 || !tiling->possible || tiling->tile0.width == ~0) {
/* If the divisor is 1, or if the tiling is not possible, or if the
* tiling is invalid, just return the original tiling. */
return;
}
/* Get the hardware-specified alignment values. */
const uint32_t tile_align_w = pass->tile_align_w;
const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
/* Divide the current tile dimensions by the divisor. */
uint32_t new_tile_width = tiling->tile0.width / divisor;
uint32_t new_tile_height = tiling->tile0.height / divisor;
/* Clamp to the minimum alignment if necessary and align down. */
if (new_tile_width < tile_align_w)
new_tile_width = tile_align_w;
else
new_tile_width = ROUND_DOWN_TO_NPOT(new_tile_width, tile_align_w);
if (new_tile_height < tile_align_h)
new_tile_height = tile_align_h;
else
new_tile_height = ROUND_DOWN_TO_NPOT(new_tile_height, tile_align_h);
new_tiling->tile0.width = new_tile_width;
new_tiling->tile0.height = new_tile_height;
/* Recalculate the tile count from the framebuffer dimensions to ensure
* full coverage. */
new_tiling->vsc.tile_count.width = DIV_ROUND_UP(fb->width, new_tile_width);
new_tiling->vsc.tile_count.height = DIV_ROUND_UP(fb->height, new_tile_height);
}
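/* Worked example with illustrative numbers (the actual alignment requirements vary per GPU): for a 1920x1080
 * framebuffer with tile0 = 384x288, tile_align_w = 96, tile_align_h = 16 and divisor = 2, the division gives
 * 192x144; both values are already multiples of their alignment, so the tile count becomes
 * DIV_ROUND_UP(1920, 192) x DIV_ROUND_UP(1080, 144) = 10 x 8 tiles.
 */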
static void
tu_tiling_config_update_pipe_layout(struct tu_vsc_config *vsc,
const struct tu_device *dev,
@ -460,20 +505,16 @@ tu_tiling_config_update_pipes(struct tu_vsc_config *vsc,
static void
tu_tiling_config_update_binning(struct tu_vsc_config *vsc, const struct tu_device *device)
{
if (vsc->binning_possible) {
vsc->binning = (vsc->tile_count.width * vsc->tile_count.height) > 2;
vsc->binning_useful = (vsc->tile_count.width * vsc->tile_count.height) > 2;
if (TU_DEBUG(FORCEBIN))
vsc->binning = true;
vsc->binning_useful = true;
if (TU_DEBUG(NOBIN))
vsc->binning = false;
} else {
vsc->binning = false;
}
vsc->binning_useful = false;
}
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass)
{
@ -499,6 +540,49 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_tiling_config_update_binning(fdm_offset_vsc, device);
}
}
fb->initd_divisor = 1;
}
const struct tu_tiling_config *
tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass,
int gmem_layout,
uint32_t divisor)
{
assert(divisor >= 1 && divisor <= TU_GMEM_LAYOUT_DIVISOR_MAX);
assert(divisor == 1 || !pass->has_fdm); /* For FDM, it's expected that FDM alone will be sufficient to
appropriately size the tiles for the framebuffer.*/
struct tu_tiling_config *tiling = &fb->tiling[(TU_GMEM_LAYOUT_COUNT * (divisor - 1)) + gmem_layout];
if (divisor > fb->initd_divisor) {
const struct tu_tiling_config *base_tiling =
tu_framebuffer_get_tiling_config(fb, device, pass, gmem_layout, divisor - 1);
tu_tiling_config_divide_tile(device, pass, fb, base_tiling, tiling, divisor);
struct tu_vsc_config *vsc = &tiling->vsc;
if (tiling->possible) {
tu_tiling_config_update_pipe_layout(vsc, device, false);
tu_tiling_config_update_pipes(vsc, device);
tu_tiling_config_update_binning(vsc, device);
struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
fdm_offset_vsc->tile_count = (VkExtent2D) { ~1, ~1 };
}
if (!tiling->possible || /* If tiling is no longer possible, this is pointless. */
(vsc->binning_useful && !vsc->binning_possible) || /* Dividing further without HW binning is a bad idea. */
(vsc->tile_count.width * vsc->tile_count.height > 100) /* 100 tiles are too many, even with HW binning. */
) {
/* Revert to the previous level's tiling configuration. */
*tiling = *base_tiling;
}
fb->initd_divisor = divisor;
}
return tiling;
}
void

View file

@ -136,10 +136,17 @@ __tu_finishme(const char *file, int line, const char *format, ...)
} while (0)
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_framebuffer_init_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass);
const struct tu_tiling_config *
tu_framebuffer_get_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass,
int gmem_layout,
uint32_t divisor);
#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
#define tu_foreach_stage(stage, stage_bits) \

View file

@ -657,6 +657,10 @@
DRI_CONF_OPT_B(tu_enable_softfloat32, def, \
"Enable softfloat emulation for float32 denormals")
#define DRI_CONF_TU_AUTOTUNE_ALGORITHM() \
DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
"Set the preferred autotune algorithm")
/**
* \brief Honeykrisp specific configuration options
*/

View file

@ -28,10 +28,18 @@
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
uint64_t
rand_xorshift128plus(uint64_t seed[2]);
void
s_rand_xorshift128plus(uint64_t seed[2], bool randomised_seed);
#ifdef __cplusplus
} /* end of extern "C" */
#endif
#endif /* RAND_XOR_H */

View file

@ -674,6 +674,12 @@ ROUND_DOWN_TO(uint64_t value, uint32_t alignment)
return ((value) & ~(uint64_t)(alignment - 1));
}
static inline uint64_t
ROUND_DOWN_TO_NPOT(uint64_t value, uint32_t alignment)
{
return value - (value % alignment);
}
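/* Unlike ROUND_DOWN_TO, the alignment does not need to be a power of two,
 * e.g. ROUND_DOWN_TO_NPOT(200, 96) == 192 and ROUND_DOWN_TO_NPOT(150, 96) == 96.
 */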
/**
* Align a value; only works for power-of-two alignments.
*/