mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 13:20:14 +01:00
We're already looking at this data to calculate the per-component vars_from_vgrf[] and vgrf_from_vars[] mappings, so just record the largest VGRF size while we're here. This will allow passes to size arrays based on the actual size needed, rather than hardcoding some fixed size. In many cases, MAX_VGRF_SIZE(devinfo) is larger than necessary, because e.g. vec5 sparse sampling results aren't used. Not hardcoding this means we can also temporarily handle very large VGRFs which we know will be split eventually, without having to increase the maximum which is ultimately used for RA classes. Cc: mesa-stable Reviewed-by: Matt Turner <mattst88@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34461>
560 lines
14 KiB
C++
560 lines
14 KiB
C++
/*
|
|
* Copyright © 2010-2020 Intel Corporation
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "brw_cfg.h"
|
|
#include "brw_inst.h"
|
|
#include "util/bitset.h"
|
|
|
|
struct brw_shader;
|
|
|
|
/**
|
|
* Bitset of state categories that can influence the result of IR analysis
|
|
* passes.
|
|
*/
|
|
enum brw_analysis_dependency_class {
|
|
/**
|
|
* The analysis doesn't depend on the IR, its result is effectively a
|
|
* constant during the compilation.
|
|
*/
|
|
BRW_DEPENDENCY_NOTHING = 0,
|
|
/**
|
|
* The analysis depends on the set of instructions in the program and
|
|
* their naming. Note that because instructions are named sequentially
|
|
* by IP this implies a dependency on the control flow edges between
|
|
* instructions. This will be signaled whenever instructions are
|
|
* inserted, removed or reordered in the program.
|
|
*/
|
|
BRW_DEPENDENCY_INSTRUCTION_IDENTITY = 0x1,
|
|
/**
|
|
* The analysis is sensitive to the detailed semantics of instructions
|
|
* in the program, where "detailed" means any change in the instruction
|
|
* data structures other than the linked-list pointers (which are
|
|
* already covered by DEPENDENCY_INSTRUCTION_IDENTITY). E.g. changing
|
|
* the negate or abs flags of an instruction source would signal this
|
|
* flag alone because it would preserve all other instruction dependency
|
|
* classes.
|
|
*/
|
|
BRW_DEPENDENCY_INSTRUCTION_DETAIL = 0x2,
|
|
/**
|
|
* The analysis depends on the set of data flow edges between
|
|
* instructions. This will be signaled whenever the dataflow relation
|
|
* between instructions has potentially changed, e.g. when the VGRF
|
|
* index of an instruction source or destination changes (in which case
|
|
* it will appear in combination with DEPENDENCY_INSTRUCTION_DETAIL), or
|
|
* when data-dependent instructions are reordered (in which case it will
|
|
* appear in combination with DEPENDENCY_INSTRUCTION_IDENTITY).
|
|
*/
|
|
BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW = 0x4,
|
|
/**
|
|
* The analysis depends on all instruction dependency classes. These
|
|
* will typically be signaled simultaneously when inserting or removing
|
|
* instructions in the program (or if you're feeling too lazy to read
|
|
* through your optimization pass to figure out which of the instruction
|
|
* dependency classes above it invalidates).
|
|
*/
|
|
BRW_DEPENDENCY_INSTRUCTIONS = 0x7,
|
|
/**
|
|
* The analysis depends on the set of VGRFs in the program and their
|
|
* naming. This will be signaled when VGRFs are allocated or released.
|
|
*/
|
|
BRW_DEPENDENCY_VARIABLES = 0x8,
|
|
/**
|
|
* The analysis depends on the set of basic blocks in the program, their
|
|
* control flow edges and naming.
|
|
*/
|
|
BRW_DEPENDENCY_BLOCKS = 0x10,
|
|
/**
|
|
* The analysis depends on the program being literally the same (good
|
|
* luck...), any change in the input invalidates previous analysis
|
|
* computations.
|
|
*/
|
|
BRW_DEPENDENCY_EVERYTHING = ~0
|
|
};
|
|
|
|
inline brw_analysis_dependency_class
|
|
operator|(brw_analysis_dependency_class x, brw_analysis_dependency_class y)
|
|
{
|
|
return static_cast<brw_analysis_dependency_class>(
|
|
static_cast<unsigned>(x) | static_cast<unsigned>(y));
|
|
}
|
|
|
|
/**
|
|
* Instantiate a program analysis class \p L which can calculate an object of
|
|
* type \p T as result. \p C is a closure that encapsulates whatever
|
|
* information is required as argument to run the analysis pass. The purpose
|
|
* of this class is to make sure that:
|
|
*
|
|
* - The analysis pass is executed lazily whenever it's needed and multiple
|
|
* executions are optimized out as long as the cached result remains marked
|
|
* up-to-date.
|
|
*
|
|
* - There is no way to access the cached analysis result without first
|
|
* calling L::require(), which makes sure that the analysis pass is rerun
|
|
* if necessary.
|
|
*
|
|
* - The cached result doesn't become inconsistent with the program for as
|
|
* long as it remains marked up-to-date. (This is only enforced in debug
|
|
* builds for performance reasons)
|
|
*
|
|
* The requirements on \p T are the following:
|
|
*
|
|
* - Constructible with a single argument, as in 'x = T(c)' for \p c of type
|
|
* \p C.
|
|
*
|
|
* - 'x.dependency_class()' on const \p x returns a bitset of
|
|
* brw::analysis_dependency_class specifying the set of IR objects that are
|
|
* required to remain invariant for the cached analysis result to be
|
|
* considered valid.
|
|
*
|
|
* - 'x.validate(c)' on const \p x returns a boolean result specifying
|
|
* whether the analysis result \p x is consistent with the input IR. This
|
|
* is currently only used for validation in debug builds.
|
|
*/
|
|
template<class T, class C>
|
|
class brw_analysis {
|
|
public:
|
|
/**
|
|
* Construct a program analysis. \p c is an arbitrary object
|
|
* passed as argument to the constructor of the analysis result
|
|
* object of type \p T.
|
|
*/
|
|
brw_analysis(const C *c) : c(c), p(NULL) {}
|
|
|
|
/**
|
|
* Destroy a program analysis.
|
|
*/
|
|
~brw_analysis()
|
|
{
|
|
delete p;
|
|
}
|
|
|
|
/**
|
|
* Obtain the result of a program analysis. This gives a
|
|
* guaranteed up-to-date result, the analysis pass will be
|
|
* rerun implicitly if it has become stale.
|
|
*/
|
|
T &
|
|
require()
|
|
{
|
|
if (p)
|
|
assert(p->validate(c));
|
|
else
|
|
p = new T(c);
|
|
|
|
return *p;
|
|
}
|
|
|
|
const T &
|
|
require() const
|
|
{
|
|
return const_cast<brw_analysis<T, C> *>(this)->require();
|
|
}
|
|
|
|
/**
|
|
* Report that dependencies of the analysis pass may have changed
|
|
* since the last calculation and the cached analysis result may
|
|
* have to be discarded.
|
|
*/
|
|
void
|
|
invalidate(brw_analysis_dependency_class c)
|
|
{
|
|
if (p && (c & p->dependency_class())) {
|
|
delete p;
|
|
p = NULL;
|
|
}
|
|
}
|
|
|
|
private:
|
|
const C *c;
|
|
T *p;
|
|
};
|
|
|
|
/**
|
|
* Immediate dominator tree analysis of a shader.
|
|
*/
|
|
struct brw_idom_tree {
|
|
brw_idom_tree(const brw_shader *s);
|
|
~brw_idom_tree();
|
|
|
|
bool
|
|
validate(const brw_shader *) const
|
|
{
|
|
/* FINISHME */
|
|
return true;
|
|
}
|
|
|
|
brw_analysis_dependency_class
|
|
dependency_class() const
|
|
{
|
|
return BRW_DEPENDENCY_BLOCKS;
|
|
}
|
|
|
|
const bblock_t *
|
|
parent(const bblock_t *b) const
|
|
{
|
|
assert(unsigned(b->num) < num_parents);
|
|
return parents[b->num];
|
|
}
|
|
|
|
bblock_t *
|
|
parent(bblock_t *b) const
|
|
{
|
|
assert(unsigned(b->num) < num_parents);
|
|
return parents[b->num];
|
|
}
|
|
|
|
bblock_t *
|
|
intersect(bblock_t *b1, bblock_t *b2) const;
|
|
|
|
/**
|
|
* Returns true if block `a` dominates block `b`.
|
|
*/
|
|
bool
|
|
dominates(const bblock_t *a, const bblock_t *b) const
|
|
{
|
|
while (a != b) {
|
|
if (b->num == 0)
|
|
return false;
|
|
|
|
b = parent(b);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void dump(FILE *file = stderr) const;
|
|
|
|
private:
|
|
unsigned num_parents;
|
|
bblock_t **parents;
|
|
};
|
|
|
|
struct brw_range {
|
|
int start;
|
|
int end;
|
|
|
|
/* If range not empty, this is the last value inside the range. */
|
|
inline int last() const
|
|
{
|
|
return end - 1;
|
|
}
|
|
|
|
inline bool is_empty() const
|
|
{
|
|
return end <= start;
|
|
}
|
|
|
|
inline int len() const
|
|
{
|
|
return end - start;
|
|
}
|
|
|
|
inline bool contains(int x) const
|
|
{
|
|
return start <= x && x < end;
|
|
}
|
|
|
|
inline bool contains(brw_range r) const
|
|
{
|
|
return start <= r.start && r.end <= end;
|
|
}
|
|
};
|
|
|
|
inline brw_range
|
|
merge(brw_range a, brw_range b)
|
|
{
|
|
if (a.is_empty())
|
|
return b;
|
|
if (b.is_empty())
|
|
return a;
|
|
return { MIN2(a.start, b.start), MAX2(a.end, b.end) };
|
|
}
|
|
|
|
inline brw_range
|
|
merge(brw_range r, int x)
|
|
{
|
|
if (r.is_empty())
|
|
return { x, x + 1 };
|
|
return { MIN2(r.start, x), MAX2(r.end, x + 1) };
|
|
}
|
|
|
|
inline bool
|
|
overlaps(brw_range a, brw_range b)
|
|
{
|
|
return a.start < b.end &&
|
|
b.start < a.end;
|
|
}
|
|
|
|
inline brw_range
|
|
intersect(brw_range a, brw_range b)
|
|
{
|
|
if (overlaps(a, b))
|
|
return { MAX2(a.start, b.start),
|
|
MIN2(a.end, b.end) };
|
|
else
|
|
return { 0, 0 };
|
|
}
|
|
|
|
inline brw_range
|
|
clip_end(brw_range r, int n)
|
|
{
|
|
assert(n >= 0);
|
|
return { r.start, r.end - n };
|
|
}
|
|
|
|
struct brw_ip_ranges {
|
|
brw_ip_ranges(const brw_shader *s);
|
|
~brw_ip_ranges();
|
|
|
|
bool validate(const brw_shader *) const;
|
|
|
|
brw_analysis_dependency_class
|
|
dependency_class() const
|
|
{
|
|
return BRW_DEPENDENCY_INSTRUCTION_IDENTITY |
|
|
BRW_DEPENDENCY_BLOCKS;
|
|
}
|
|
|
|
brw_range range(const bblock_t *block) const {
|
|
int start = start_ip[block->num];
|
|
return { start, start + (int)block->num_instructions };
|
|
}
|
|
|
|
private:
|
|
int num_blocks;
|
|
int *start_ip;
|
|
};
|
|
|
|
/**
|
|
* Register pressure analysis of a shader. Estimates how many registers
|
|
* are live at any point of the program in GRF units.
|
|
*/
|
|
struct brw_register_pressure {
|
|
brw_register_pressure(const brw_shader *v);
|
|
~brw_register_pressure();
|
|
|
|
brw_analysis_dependency_class
|
|
dependency_class() const
|
|
{
|
|
return (BRW_DEPENDENCY_INSTRUCTION_IDENTITY |
|
|
BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
|
BRW_DEPENDENCY_VARIABLES);
|
|
}
|
|
|
|
bool
|
|
validate(const brw_shader *) const
|
|
{
|
|
/* FINISHME */
|
|
return true;
|
|
}
|
|
|
|
unsigned *regs_live_at_ip;
|
|
};
|
|
|
|
class brw_def_analysis {
|
|
public:
|
|
brw_def_analysis(const brw_shader *v);
|
|
~brw_def_analysis();
|
|
|
|
brw_inst *
|
|
get(const brw_reg ®) const
|
|
{
|
|
return reg.file == VGRF && reg.nr < def_count ?
|
|
def_insts[reg.nr] : NULL;
|
|
}
|
|
|
|
uint32_t
|
|
get_use_count(const brw_reg ®) const
|
|
{
|
|
return reg.file == VGRF && reg.nr < def_count ?
|
|
def_use_counts[reg.nr] : 0;
|
|
}
|
|
|
|
unsigned count() const { return def_count; }
|
|
unsigned ssa_count() const;
|
|
|
|
void print_stats(const brw_shader *) const;
|
|
|
|
brw_analysis_dependency_class
|
|
dependency_class() const
|
|
{
|
|
return BRW_DEPENDENCY_INSTRUCTION_IDENTITY |
|
|
BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
|
BRW_DEPENDENCY_VARIABLES |
|
|
BRW_DEPENDENCY_BLOCKS;
|
|
}
|
|
|
|
bool validate(const brw_shader *) const;
|
|
|
|
private:
|
|
void mark_invalid(int);
|
|
bool fully_defines(const brw_shader *v, brw_inst *);
|
|
void update_for_reads(const brw_idom_tree &idom, brw_inst *);
|
|
void update_for_write(const brw_shader *v, brw_inst *);
|
|
|
|
brw_inst **def_insts;
|
|
uint32_t *def_use_counts;
|
|
unsigned def_count;
|
|
};
|
|
|
|
class brw_live_variables {
|
|
public:
|
|
struct block_data {
|
|
/**
|
|
* Which variables are defined before being used in the block.
|
|
*
|
|
* Note that for our purposes, "defined" means unconditionally, completely
|
|
* defined.
|
|
*/
|
|
BITSET_WORD *def;
|
|
|
|
/**
|
|
* Which variables are used before being defined in the block.
|
|
*/
|
|
BITSET_WORD *use;
|
|
|
|
/** Which defs reach the entry point of the block. */
|
|
BITSET_WORD *livein;
|
|
|
|
/** Which defs reach the exit point of the block. */
|
|
BITSET_WORD *liveout;
|
|
|
|
/**
|
|
* Variables such that the entry point of the block may be reached from any
|
|
* of their definitions.
|
|
*/
|
|
BITSET_WORD *defin;
|
|
|
|
/**
|
|
* Variables such that the exit point of the block may be reached from any
|
|
* of their definitions.
|
|
*/
|
|
BITSET_WORD *defout;
|
|
|
|
BITSET_WORD flag_def[1];
|
|
BITSET_WORD flag_use[1];
|
|
BITSET_WORD flag_livein[1];
|
|
BITSET_WORD flag_liveout[1];
|
|
|
|
brw_range ip_range;
|
|
};
|
|
|
|
brw_live_variables(const brw_shader *s);
|
|
~brw_live_variables();
|
|
|
|
bool validate(const brw_shader *s) const;
|
|
|
|
brw_analysis_dependency_class
|
|
dependency_class() const
|
|
{
|
|
return (BRW_DEPENDENCY_INSTRUCTION_IDENTITY |
|
|
BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
|
BRW_DEPENDENCY_VARIABLES);
|
|
}
|
|
|
|
bool vars_interfere(int a, int b) const;
|
|
bool vgrfs_interfere(int a, int b) const;
|
|
int var_from_reg(const brw_reg ®) const
|
|
{
|
|
return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE;
|
|
}
|
|
|
|
/** Map from virtual GRF number to index in block_data arrays. */
|
|
int *var_from_vgrf;
|
|
|
|
/**
|
|
* Map from any index in block_data to the virtual GRF containing it.
|
|
*
|
|
* For alloc.sizes of [1, 2, 3], vgrf_from_var would contain
|
|
* [0, 1, 1, 2, 2, 2].
|
|
*/
|
|
int *vgrf_from_var;
|
|
|
|
int num_vars;
|
|
int num_vgrfs;
|
|
int bitset_words;
|
|
|
|
unsigned max_vgrf_size;
|
|
|
|
/** @{
|
|
* Final computed live ranges for each var (each component of each virtual
|
|
* GRF).
|
|
*/
|
|
brw_range *vars_range;
|
|
/** @} */
|
|
|
|
/** @{
|
|
* Final computed live ranges for each VGRF.
|
|
*/
|
|
brw_range *vgrf_range;
|
|
/** @} */
|
|
|
|
/** Per-basic-block information on live variables */
|
|
struct block_data *block_data;
|
|
|
|
protected:
|
|
void setup_def_use();
|
|
void setup_one_read(struct block_data *bd, int ip, const brw_reg ®);
|
|
void setup_one_write(struct block_data *bd, brw_inst *inst, int ip,
|
|
const brw_reg ®);
|
|
void compute_live_variables();
|
|
void compute_start_end();
|
|
|
|
const struct intel_device_info *devinfo;
|
|
const cfg_t *cfg;
|
|
void *mem_ctx;
|
|
};
|
|
|
|
/**
|
|
* Various estimates of the performance of a shader based on static
|
|
* analysis.
|
|
*/
|
|
struct brw_performance {
|
|
brw_performance(const brw_shader *v);
|
|
~brw_performance();
|
|
|
|
brw_analysis_dependency_class
|
|
dependency_class() const
|
|
{
|
|
return (BRW_DEPENDENCY_INSTRUCTIONS |
|
|
BRW_DEPENDENCY_BLOCKS);
|
|
}
|
|
|
|
bool
|
|
validate(const brw_shader *) const
|
|
{
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Array containing estimates of the runtime of each basic block of the
|
|
* program in cycle units.
|
|
*/
|
|
unsigned *block_latency;
|
|
|
|
/**
|
|
* Estimate of the runtime of the whole program in cycle units assuming
|
|
* uncontended execution.
|
|
*/
|
|
unsigned latency;
|
|
|
|
/**
|
|
* Estimate of the throughput of the whole program in
|
|
* invocations-per-cycle units.
|
|
*
|
|
* Note that this might be lower than the ratio between the dispatch
|
|
* width of the program and its latency estimate in cases where
|
|
* performance doesn't scale without limits as a function of its thread
|
|
* parallelism, e.g. due to the existence of a bottleneck in a shared
|
|
* function.
|
|
*/
|
|
float throughput;
|
|
|
|
private:
|
|
brw_performance(const brw_performance &perf);
|
|
brw_performance &
|
|
operator=(brw_performance u);
|
|
};
|