mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-22 15:40:11 +01:00
ir3: Rewrite register allocation
Switch to the new SSA-based register allocator. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9842>
This commit is contained in:
parent
df9f41cc02
commit
0ffcb19b9d
20 changed files with 3851 additions and 2414 deletions
|
|
@ -435,11 +435,11 @@ traces:
|
||||||
- path: minetest/minetest.trace
|
- path: minetest/minetest.trace
|
||||||
expectations:
|
expectations:
|
||||||
- device: freedreno-a306
|
- device: freedreno-a306
|
||||||
checksum: 9227cc8d4e6445f2323438340f2a5d9b
|
checksum: daedbc987cc1b1f934364ce6b633bc54
|
||||||
- device: freedreno-a530
|
- device: freedreno-a530
|
||||||
checksum: d0d655d81fabeb4087bf7c4837301f2a
|
checksum: 0054f0ba67ace5d2defe17b74b5364e9
|
||||||
- device: freedreno-a630
|
- device: freedreno-a630
|
||||||
checksum: c7349124612a8760ddd825b903561ec4
|
checksum: b47f8151d4310d87070deea9059d001b
|
||||||
- path: neverball/neverball.trace
|
- path: neverball/neverball.trace
|
||||||
expectations:
|
expectations:
|
||||||
# Skipped since it's long on a530.
|
# Skipped since it's long on a530.
|
||||||
|
|
|
||||||
|
|
@ -83,6 +83,17 @@ struct ir3_info {
|
||||||
uint16_t instrs_per_cat[8];
|
uint16_t instrs_per_cat[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ir3_merge_set {
|
||||||
|
uint16_t preferred_reg;
|
||||||
|
uint16_t size;
|
||||||
|
uint16_t alignment;
|
||||||
|
|
||||||
|
unsigned interval_start;
|
||||||
|
|
||||||
|
unsigned regs_count;
|
||||||
|
struct ir3_register **regs;
|
||||||
|
};
|
||||||
|
|
||||||
struct ir3_register {
|
struct ir3_register {
|
||||||
enum {
|
enum {
|
||||||
IR3_REG_CONST = 0x001,
|
IR3_REG_CONST = 0x001,
|
||||||
|
|
@ -119,6 +130,9 @@ struct ir3_register {
|
||||||
IR3_REG_ARRAY = 0x8000,
|
IR3_REG_ARRAY = 0x8000,
|
||||||
|
|
||||||
IR3_REG_DEST = 0x10000,
|
IR3_REG_DEST = 0x10000,
|
||||||
|
IR3_REG_KILL = 0x20000,
|
||||||
|
IR3_REG_FIRST_KILL = 0x40000,
|
||||||
|
IR3_REG_UNUSED = 0x80000,
|
||||||
} flags;
|
} flags;
|
||||||
|
|
||||||
/* used for cat5 instructions, but also for internal/IR level
|
/* used for cat5 instructions, but also for internal/IR level
|
||||||
|
|
@ -142,6 +156,7 @@ struct ir3_register {
|
||||||
* rN.x becomes: (N << 2) | x
|
* rN.x becomes: (N << 2) | x
|
||||||
*/
|
*/
|
||||||
uint16_t num;
|
uint16_t num;
|
||||||
|
uint16_t name;
|
||||||
union {
|
union {
|
||||||
/* immediate: */
|
/* immediate: */
|
||||||
int32_t iim_val;
|
int32_t iim_val;
|
||||||
|
|
@ -169,6 +184,10 @@ struct ir3_register {
|
||||||
* back to a previous instruction that we depend on).
|
* back to a previous instruction that we depend on).
|
||||||
*/
|
*/
|
||||||
struct ir3_register *def;
|
struct ir3_register *def;
|
||||||
|
|
||||||
|
unsigned merge_set_offset;
|
||||||
|
struct ir3_merge_set *merge_set;
|
||||||
|
unsigned interval_start, interval_end;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -178,12 +197,12 @@ struct ir3_register {
|
||||||
unsigned name ## _count, name ## _sz; \
|
unsigned name ## _count, name ## _sz; \
|
||||||
type * name;
|
type * name;
|
||||||
|
|
||||||
#define array_insert(ctx, arr, val) do { \
|
#define array_insert(ctx, arr, ...) do { \
|
||||||
if (arr ## _count == arr ## _sz) { \
|
if (arr ## _count == arr ## _sz) { \
|
||||||
arr ## _sz = MAX2(2 * arr ## _sz, 16); \
|
arr ## _sz = MAX2(2 * arr ## _sz, 16); \
|
||||||
arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
|
arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
|
||||||
} \
|
} \
|
||||||
arr[arr ##_count++] = val; \
|
arr[arr ##_count++] = __VA_ARGS__; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
struct ir3_instruction {
|
struct ir3_instruction {
|
||||||
|
|
@ -696,12 +715,12 @@ bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
|
||||||
set_foreach ((__instr)->uses, __entry) \
|
set_foreach ((__instr)->uses, __entry) \
|
||||||
if ((__use = (void *)__entry->key))
|
if ((__use = (void *)__entry->key))
|
||||||
|
|
||||||
static inline uint32_t reg_num(struct ir3_register *reg)
|
static inline uint32_t reg_num(const struct ir3_register *reg)
|
||||||
{
|
{
|
||||||
return reg->num >> 2;
|
return reg->num >> 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uint32_t reg_comp(struct ir3_register *reg)
|
static inline uint32_t reg_comp(const struct ir3_register *reg)
|
||||||
{
|
{
|
||||||
return reg->num & 0x3;
|
return reg->num & 0x3;
|
||||||
}
|
}
|
||||||
|
|
@ -1479,9 +1498,6 @@ bool ir3_cp_postsched(struct ir3 *ir);
|
||||||
/* Make arrays SSA */
|
/* Make arrays SSA */
|
||||||
bool ir3_array_to_ssa(struct ir3 *ir);
|
bool ir3_array_to_ssa(struct ir3 *ir);
|
||||||
|
|
||||||
/* group neighbors and insert mov's to resolve conflicts: */
|
|
||||||
bool ir3_group(struct ir3 *ir);
|
|
||||||
|
|
||||||
/* scheduling: */
|
/* scheduling: */
|
||||||
bool ir3_sched_add_deps(struct ir3 *ir);
|
bool ir3_sched_add_deps(struct ir3 *ir);
|
||||||
int ir3_sched(struct ir3 *ir);
|
int ir3_sched(struct ir3 *ir);
|
||||||
|
|
@ -1489,11 +1505,8 @@ int ir3_sched(struct ir3 *ir);
|
||||||
struct ir3_context;
|
struct ir3_context;
|
||||||
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
|
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
|
||||||
|
|
||||||
bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
|
|
||||||
|
|
||||||
/* register assignment: */
|
/* register assignment: */
|
||||||
struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler, bool mergedregs);
|
int ir3_ra(struct ir3_shader_variant *v);
|
||||||
int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor);
|
|
||||||
|
|
||||||
/* legalize: */
|
/* legalize: */
|
||||||
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
|
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
|
||||||
|
|
|
||||||
|
|
@ -125,9 +125,9 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||||
* src1.z - is 'data' for cmpxchg
|
* src1.z - is 'data' for cmpxchg
|
||||||
*
|
*
|
||||||
* The combining src and dest kinda doesn't work out so well with how
|
* The combining src and dest kinda doesn't work out so well with how
|
||||||
* scheduling and RA work. So for now we create a dummy src2.x, and
|
* scheduling and RA work. So we create a dummy src2 which is tied to the
|
||||||
* then in a later fixup path, insert an extra MOV out of src1.x.
|
* destination in RA (i.e. must be allocated to the same vec2/vec3
|
||||||
* See ir3_a6xx_fixup_atomic_dests().
|
* register) and then immediately extract the first component.
|
||||||
*
|
*
|
||||||
* Note that nir already multiplies the offset by four
|
* Note that nir already multiplies the offset by four
|
||||||
*/
|
*/
|
||||||
|
|
@ -193,7 +193,10 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||||
array_insert(b, b->keeps, atomic);
|
array_insert(b, b->keeps, atomic);
|
||||||
|
|
||||||
return atomic;
|
atomic->regs[0]->wrmask = src1->regs[0]->wrmask;
|
||||||
|
struct ir3_instruction *split;
|
||||||
|
ir3_split_dest(b, &split, atomic, 0, 1);
|
||||||
|
return split;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* src[] = { deref, coord, sample_index }. const_index[] = {} */
|
/* src[] = { deref, coord, sample_index }. const_index[] = {} */
|
||||||
|
|
@ -270,9 +273,9 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||||
* src1.z - is 'value' for cmpxchg
|
* src1.z - is 'value' for cmpxchg
|
||||||
*
|
*
|
||||||
* The combining src and dest kinda doesn't work out so well with how
|
* The combining src and dest kinda doesn't work out so well with how
|
||||||
* scheduling and RA work. So for now we create a dummy src2.x, and
|
* scheduling and RA work. So we create a dummy src2 which is tied to the
|
||||||
* then in a later fixup path, insert an extra MOV out of src1.x.
|
* destination in RA (i.e. must be allocated to the same vec2/vec3
|
||||||
* See ir3_a6xx_fixup_atomic_dests().
|
* register) and then immediately extract the first component.
|
||||||
*/
|
*/
|
||||||
dummy = create_immed(b, 0);
|
dummy = create_immed(b, 0);
|
||||||
src0 = ir3_create_collect(ctx, coords, ncoords);
|
src0 = ir3_create_collect(ctx, coords, ncoords);
|
||||||
|
|
@ -341,7 +344,10 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||||
array_insert(b, b->keeps, atomic);
|
array_insert(b, b->keeps, atomic);
|
||||||
|
|
||||||
return atomic;
|
atomic->regs[0]->wrmask = src1->regs[0]->wrmask;
|
||||||
|
struct ir3_instruction *split;
|
||||||
|
ir3_split_dest(b, &split, atomic, 0, 1);
|
||||||
|
return split;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
@ -373,77 +379,3 @@ const struct ir3_context_funcs ir3_a6xx_funcs = {
|
||||||
.emit_intrinsic_image_size = emit_intrinsic_image_size,
|
.emit_intrinsic_image_size = emit_intrinsic_image_size,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
* Special pass to run after instruction scheduling to insert an
|
|
||||||
* extra mov from src1.x to dst. This way the other compiler passes
|
|
||||||
* can ignore this quirk of the new instruction encoding.
|
|
||||||
*
|
|
||||||
* This should run after RA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
static struct ir3_instruction *
|
|
||||||
get_atomic_dest_mov(struct ir3_instruction *atomic)
|
|
||||||
{
|
|
||||||
struct ir3_instruction *mov;
|
|
||||||
|
|
||||||
/* if we've already created the mov-out, then re-use it: */
|
|
||||||
if (atomic->data)
|
|
||||||
return atomic->data;
|
|
||||||
|
|
||||||
/* We are already out of SSA here, so we can't use the nice builders: */
|
|
||||||
mov = ir3_instr_create(atomic->block, OPC_MOV, 2);
|
|
||||||
ir3_reg_create(mov, 0, 0); /* dst */
|
|
||||||
ir3_reg_create(mov, 0, 0); /* src */
|
|
||||||
|
|
||||||
mov->cat1.src_type = TYPE_U32;
|
|
||||||
mov->cat1.dst_type = TYPE_U32;
|
|
||||||
|
|
||||||
/* extract back out the 'dummy' which serves as stand-in for dest: */
|
|
||||||
struct ir3_instruction *src = atomic->regs[3]->instr;
|
|
||||||
debug_assert(src->opc == OPC_META_COLLECT);
|
|
||||||
|
|
||||||
*mov->regs[0] = *atomic->regs[0];
|
|
||||||
*mov->regs[1] = *src->regs[1]->instr->regs[0];
|
|
||||||
|
|
||||||
mov->flags |= IR3_INSTR_SY;
|
|
||||||
|
|
||||||
/* it will have already been appended to the end of the block, which
|
|
||||||
* isn't where we want it, so fix-up the location:
|
|
||||||
*/
|
|
||||||
ir3_instr_move_after(mov, atomic);
|
|
||||||
|
|
||||||
return atomic->data = mov;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so)
|
|
||||||
{
|
|
||||||
bool progress = false;
|
|
||||||
|
|
||||||
if (ir3_shader_nibo(so) == 0 && !so->bindless_ibo)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
foreach_block (block, &ir->block_list) {
|
|
||||||
foreach_instr (instr, &block->instr_list) {
|
|
||||||
instr->data = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach_block (block, &ir->block_list) {
|
|
||||||
foreach_instr_safe (instr, &block->instr_list) {
|
|
||||||
foreach_src (reg, instr) {
|
|
||||||
struct ir3_instruction *src = reg->instr;
|
|
||||||
|
|
||||||
if (!src)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (is_atomic(src->opc) && (src->flags & IR3_INSTR_G)) {
|
|
||||||
reg->instr = get_atomic_dest_mov(src);
|
|
||||||
progress = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return progress;
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -78,7 +78,6 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_acce
|
||||||
compiler->dev = dev;
|
compiler->dev = dev;
|
||||||
compiler->gpu_id = gpu_id;
|
compiler->gpu_id = gpu_id;
|
||||||
compiler->robust_ubo_access = robust_ubo_access;
|
compiler->robust_ubo_access = robust_ubo_access;
|
||||||
compiler->set = ir3_ra_alloc_reg_set(compiler, false);
|
|
||||||
|
|
||||||
/* All known GPU's have 32k local memory (aka shared) */
|
/* All known GPU's have 32k local memory (aka shared) */
|
||||||
compiler->local_mem_size = 32 * 1024;
|
compiler->local_mem_size = 32 * 1024;
|
||||||
|
|
@ -88,7 +87,6 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_acce
|
||||||
compiler->max_waves = 16;
|
compiler->max_waves = 16;
|
||||||
|
|
||||||
if (compiler->gpu_id >= 600) {
|
if (compiler->gpu_id >= 600) {
|
||||||
compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true);
|
|
||||||
compiler->samgq_workaround = true;
|
compiler->samgq_workaround = true;
|
||||||
/* a6xx split the pipeline state into geometry and fragment state, in
|
/* a6xx split the pipeline state into geometry and fragment state, in
|
||||||
* order to let the VS run ahead of the FS. As a result there are now
|
* order to let the VS run ahead of the FS. As a result there are now
|
||||||
|
|
|
||||||
|
|
@ -38,8 +38,6 @@ struct ir3_shader;
|
||||||
struct ir3_compiler {
|
struct ir3_compiler {
|
||||||
struct fd_device *dev;
|
struct fd_device *dev;
|
||||||
uint32_t gpu_id;
|
uint32_t gpu_id;
|
||||||
struct ir3_ra_reg_set *set;
|
|
||||||
struct ir3_ra_reg_set *mergedregs_set;
|
|
||||||
uint32_t shader_count;
|
uint32_t shader_count;
|
||||||
|
|
||||||
struct disk_cache *disk_cache;
|
struct disk_cache *disk_cache;
|
||||||
|
|
|
||||||
|
|
@ -3811,13 +3811,17 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
/* Vertex shaders in a tessellation or geometry pipeline treat END as a
|
/* Vertex shaders in a tessellation or geometry pipeline treat END as a
|
||||||
* NOP and has an epilogue that writes the VS outputs to local storage, to
|
* NOP and has an epilogue that writes the VS outputs to local storage, to
|
||||||
* be read by the HS. Then it resets execution mask (chmask) and chains
|
* be read by the HS. Then it resets execution mask (chmask) and chains
|
||||||
* to the next shader (chsh).
|
* to the next shader (chsh). There are also a few output values which we
|
||||||
|
* must send to the next stage via registers, and in order for both stages
|
||||||
|
* to agree on the register used we must force these to be in specific
|
||||||
|
* registers.
|
||||||
*/
|
*/
|
||||||
if ((so->type == MESA_SHADER_VERTEX &&
|
if ((so->type == MESA_SHADER_VERTEX &&
|
||||||
(so->key.has_gs || so->key.tessellation)) ||
|
(so->key.has_gs || so->key.tessellation)) ||
|
||||||
(so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
|
(so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
|
||||||
struct ir3_instruction *outputs[3];
|
struct ir3_instruction *outputs[3];
|
||||||
unsigned outidxs[3];
|
unsigned outidxs[3];
|
||||||
|
unsigned regids[3];
|
||||||
unsigned outputs_count = 0;
|
unsigned outputs_count = 0;
|
||||||
|
|
||||||
if (ctx->primitive_id) {
|
if (ctx->primitive_id) {
|
||||||
|
|
@ -3828,6 +3832,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
ir3_create_collect(ctx, &ctx->primitive_id, 1);
|
ir3_create_collect(ctx, &ctx->primitive_id, 1);
|
||||||
outputs[outputs_count] = out;
|
outputs[outputs_count] = out;
|
||||||
outidxs[outputs_count] = n;
|
outidxs[outputs_count] = n;
|
||||||
|
regids[outputs_count] = regid(0, 1);
|
||||||
outputs_count++;
|
outputs_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3838,6 +3843,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
ir3_create_collect(ctx, &ctx->gs_header, 1);
|
ir3_create_collect(ctx, &ctx->gs_header, 1);
|
||||||
outputs[outputs_count] = out;
|
outputs[outputs_count] = out;
|
||||||
outidxs[outputs_count] = n;
|
outidxs[outputs_count] = n;
|
||||||
|
regids[outputs_count] = regid(0, 0);
|
||||||
outputs_count++;
|
outputs_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3848,6 +3854,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
ir3_create_collect(ctx, &ctx->tcs_header, 1);
|
ir3_create_collect(ctx, &ctx->tcs_header, 1);
|
||||||
outputs[outputs_count] = out;
|
outputs[outputs_count] = out;
|
||||||
outidxs[outputs_count] = n;
|
outidxs[outputs_count] = n;
|
||||||
|
regids[outputs_count] = regid(0, 0);
|
||||||
outputs_count++;
|
outputs_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3858,7 +3865,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
|
|
||||||
__ssa_dst(chmask);
|
__ssa_dst(chmask);
|
||||||
for (unsigned i = 0; i < outputs_count; i++)
|
for (unsigned i = 0; i < outputs_count; i++)
|
||||||
__ssa_src(chmask, outputs[i], 0);
|
__ssa_src(chmask, outputs[i], 0)->num = regids[i];
|
||||||
|
|
||||||
chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
|
chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
|
||||||
memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
|
memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
|
||||||
|
|
@ -3959,6 +3966,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
ir3_debug_print(ir, "AFTER: nir->ir3");
|
ir3_debug_print(ir, "AFTER: nir->ir3");
|
||||||
ir3_validate(ir);
|
ir3_validate(ir);
|
||||||
|
|
||||||
|
IR3_PASS(ir, ir3_array_to_ssa);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
progress = false;
|
progress = false;
|
||||||
|
|
||||||
|
|
@ -3980,11 +3989,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
|
|
||||||
IR3_PASS(ir, ir3_sched_add_deps);
|
IR3_PASS(ir, ir3_sched_add_deps);
|
||||||
|
|
||||||
/* Group left/right neighbors, inserting mov's where needed to
|
|
||||||
* solve conflicts:
|
|
||||||
*/
|
|
||||||
IR3_PASS(ir, ir3_group);
|
|
||||||
|
|
||||||
/* At this point, all the dead code should be long gone: */
|
/* At this point, all the dead code should be long gone: */
|
||||||
assert(!IR3_PASS(ir, ir3_dce, so));
|
assert(!IR3_PASS(ir, ir3_dce, so));
|
||||||
|
|
||||||
|
|
@ -4012,20 +4016,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
so->binning_pass;
|
so->binning_pass;
|
||||||
|
|
||||||
if (pre_assign_inputs) {
|
if (pre_assign_inputs) {
|
||||||
for (unsigned i = 0; i < ctx->ninputs; i++) {
|
foreach_input (in, ir) {
|
||||||
struct ir3_instruction *instr = ctx->inputs[i];
|
assert(in->opc == OPC_META_INPUT);
|
||||||
|
unsigned inidx = in->input.inidx;
|
||||||
|
|
||||||
if (!instr)
|
in->regs[0]->num = so->nonbinning->inputs[inidx].regid;
|
||||||
continue;
|
|
||||||
|
|
||||||
unsigned n = i / 4;
|
|
||||||
unsigned c = i % 4;
|
|
||||||
unsigned regid = so->nonbinning->inputs[n].regid + c;
|
|
||||||
|
|
||||||
instr->regs[0]->num = regid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = ir3_ra(so, ctx->inputs, ctx->ninputs);
|
|
||||||
} else if (ctx->tcs_header) {
|
} else if (ctx->tcs_header) {
|
||||||
/* We need to have these values in the same registers between VS and TCS
|
/* We need to have these values in the same registers between VS and TCS
|
||||||
* since the VS chains to TCS and doesn't get the sysvals redelivered.
|
* since the VS chains to TCS and doesn't get the sysvals redelivered.
|
||||||
|
|
@ -4033,8 +4029,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
|
|
||||||
ctx->tcs_header->regs[0]->num = regid(0, 0);
|
ctx->tcs_header->regs[0]->num = regid(0, 0);
|
||||||
ctx->primitive_id->regs[0]->num = regid(0, 1);
|
ctx->primitive_id->regs[0]->num = regid(0, 1);
|
||||||
struct ir3_instruction *precolor[] = { ctx->tcs_header, ctx->primitive_id };
|
|
||||||
ret = ir3_ra(so, precolor, ARRAY_SIZE(precolor));
|
|
||||||
} else if (ctx->gs_header) {
|
} else if (ctx->gs_header) {
|
||||||
/* We need to have these values in the same registers between producer
|
/* We need to have these values in the same registers between producer
|
||||||
* (VS or DS) and GS since the producer chains to GS and doesn't get
|
* (VS or DS) and GS since the producer chains to GS and doesn't get
|
||||||
|
|
@ -4043,29 +4037,22 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
|
|
||||||
ctx->gs_header->regs[0]->num = regid(0, 0);
|
ctx->gs_header->regs[0]->num = regid(0, 0);
|
||||||
ctx->primitive_id->regs[0]->num = regid(0, 1);
|
ctx->primitive_id->regs[0]->num = regid(0, 1);
|
||||||
struct ir3_instruction *precolor[] = { ctx->gs_header, ctx->primitive_id };
|
|
||||||
ret = ir3_ra(so, precolor, ARRAY_SIZE(precolor));
|
|
||||||
} else if (so->num_sampler_prefetch) {
|
} else if (so->num_sampler_prefetch) {
|
||||||
assert(so->type == MESA_SHADER_FRAGMENT);
|
assert(so->type == MESA_SHADER_FRAGMENT);
|
||||||
struct ir3_instruction *precolor[2];
|
|
||||||
int idx = 0;
|
int idx = 0;
|
||||||
|
|
||||||
foreach_input (instr, ir) {
|
foreach_input (instr, ir) {
|
||||||
if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
|
if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
assert(idx < ARRAY_SIZE(precolor));
|
assert(idx < 2);
|
||||||
|
|
||||||
precolor[idx] = instr;
|
|
||||||
instr->regs[0]->num = idx;
|
instr->regs[0]->num = idx;
|
||||||
|
|
||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
ret = ir3_ra(so, precolor, idx);
|
|
||||||
} else {
|
|
||||||
ret = ir3_ra(so, NULL, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = ir3_ra(so);
|
||||||
|
|
||||||
if (ret) {
|
if (ret) {
|
||||||
DBG("RA failed!");
|
DBG("RA failed!");
|
||||||
goto out;
|
goto out;
|
||||||
|
|
@ -4073,10 +4060,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||||
|
|
||||||
IR3_PASS(ir, ir3_postsched, so);
|
IR3_PASS(ir, ir3_postsched, so);
|
||||||
|
|
||||||
if (compiler->gpu_id >= 600) {
|
|
||||||
IR3_PASS(ir, ir3_a6xx_fixup_atomic_dests, so);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (so->type == MESA_SHADER_FRAGMENT)
|
if (so->type == MESA_SHADER_FRAGMENT)
|
||||||
pack_inlocs(ctx);
|
pack_inlocs(ctx);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -111,7 +111,7 @@ ir3_context_init(struct ir3_compiler *compiler,
|
||||||
if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
|
if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
|
||||||
NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
|
NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
|
||||||
|
|
||||||
NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
|
NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
|
||||||
|
|
||||||
/* Super crude heuristic to limit # of tex prefetch in small
|
/* Super crude heuristic to limit # of tex prefetch in small
|
||||||
* shaders. This completely ignores loops.. but that's really
|
* shaders. This completely ignores loops.. but that's really
|
||||||
|
|
|
||||||
|
|
@ -1,187 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
* copy of this software and associated documentation files (the "Software"),
|
|
||||||
* to deal in the Software without restriction, including without limitation
|
|
||||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
* and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice (including the next
|
|
||||||
* paragraph) shall be included in all copies or substantial portions of the
|
|
||||||
* Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
||||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*
|
|
||||||
* Authors:
|
|
||||||
* Rob Clark <robclark@freedesktop.org>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "ir3.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Find/group instruction neighbors:
|
|
||||||
*/
|
|
||||||
|
|
||||||
static void
|
|
||||||
insert_mov(struct ir3_instruction *collect, int idx)
|
|
||||||
{
|
|
||||||
struct ir3_instruction *src = ssa(collect->regs[idx+1]);
|
|
||||||
struct ir3_instruction *mov = ir3_MOV(src->block, src,
|
|
||||||
(collect->regs[idx+1]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32);
|
|
||||||
|
|
||||||
collect->regs[idx+1]->def = mov->regs[0];
|
|
||||||
|
|
||||||
/* if collect and src are in the same block, move the inserted mov
|
|
||||||
* to just before the collect to avoid a use-before-def. Otherwise
|
|
||||||
* it should be safe to leave at the end of the block it is in:
|
|
||||||
*/
|
|
||||||
if (src->block == collect->block) {
|
|
||||||
ir3_instr_move_before(mov, collect);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* verify that cur != instr, but cur is also not in instr's neighbor-list: */
|
|
||||||
static bool
|
|
||||||
in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos)
|
|
||||||
{
|
|
||||||
int idx = 0;
|
|
||||||
|
|
||||||
if (!instr)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (instr == cur)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right)
|
|
||||||
if ((idx++ != pos) && (instr == cur))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
group_collect(struct ir3_instruction *collect)
|
|
||||||
{
|
|
||||||
struct ir3_register **regs = &collect->regs[1];
|
|
||||||
unsigned n = collect->regs_count - 1;
|
|
||||||
|
|
||||||
/* first pass, figure out what has conflicts and needs a mov
|
|
||||||
* inserted. Do this up front, before starting to setup
|
|
||||||
* left/right neighbor pointers. Trying to do it in a single
|
|
||||||
* pass could result in a situation where we can't even setup
|
|
||||||
* the mov's right neighbor ptr if the next instr also needs
|
|
||||||
* a mov.
|
|
||||||
*/
|
|
||||||
restart:
|
|
||||||
for (unsigned i = 0; i < n; i++) {
|
|
||||||
struct ir3_instruction *instr = ssa(regs[i]);
|
|
||||||
if (instr) {
|
|
||||||
struct ir3_instruction *left = (i > 0) ? ssa(regs[i - 1]) : NULL;
|
|
||||||
struct ir3_instruction *right = (i < (n-1)) ? ssa(regs[i + 1]) : NULL;
|
|
||||||
bool conflict;
|
|
||||||
|
|
||||||
/* check for left/right neighbor conflicts: */
|
|
||||||
conflict = conflicts(instr->cp.left, left) ||
|
|
||||||
conflicts(instr->cp.right, right);
|
|
||||||
|
|
||||||
/* Mixing array elements and higher register classes
|
|
||||||
* (ie. groups) doesn't really work out in RA. See:
|
|
||||||
*
|
|
||||||
* https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag
|
|
||||||
*/
|
|
||||||
if (instr->regs[0]->flags & IR3_REG_ARRAY)
|
|
||||||
conflict = true;
|
|
||||||
|
|
||||||
/* we also can't have an instr twice in the group: */
|
|
||||||
for (unsigned j = i + 1; (j < n) && !conflict; j++)
|
|
||||||
if (in_neighbor_list(ssa(regs[j]), instr, i))
|
|
||||||
conflict = true;
|
|
||||||
|
|
||||||
if (conflict) {
|
|
||||||
insert_mov(collect, i);
|
|
||||||
/* inserting the mov may have caused a conflict
|
|
||||||
* against the previous:
|
|
||||||
*/
|
|
||||||
goto restart;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* second pass, now that we've inserted mov's, fixup left/right
|
|
||||||
* neighbors. This is guaranteed to succeed, since by definition
|
|
||||||
* the newly inserted mov's cannot conflict with anything.
|
|
||||||
*/
|
|
||||||
for (unsigned i = 0; i < n; i++) {
|
|
||||||
struct ir3_instruction *instr = ssa(regs[i]);
|
|
||||||
if (instr) {
|
|
||||||
struct ir3_instruction *left = (i > 0) ? ssa(regs[i - 1]) : NULL;
|
|
||||||
struct ir3_instruction *right = (i < (n-1)) ? ssa(regs[i + 1]) : NULL;
|
|
||||||
|
|
||||||
debug_assert(!conflicts(instr->cp.left, left));
|
|
||||||
if (left) {
|
|
||||||
instr->cp.left_cnt++;
|
|
||||||
instr->cp.left = left;
|
|
||||||
}
|
|
||||||
|
|
||||||
debug_assert(!conflicts(instr->cp.right, right));
|
|
||||||
if (right) {
|
|
||||||
instr->cp.right_cnt++;
|
|
||||||
instr->cp.right = right;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
instr_find_neighbors(struct ir3_instruction *instr)
|
|
||||||
{
|
|
||||||
bool progress = false;
|
|
||||||
|
|
||||||
if (ir3_instr_check_mark(instr))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (instr->opc == OPC_META_COLLECT) {
|
|
||||||
group_collect(instr);
|
|
||||||
progress = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach_ssa_src (src, instr)
|
|
||||||
progress |= instr_find_neighbors(src);
|
|
||||||
|
|
||||||
return progress;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
find_neighbors(struct ir3 *ir)
|
|
||||||
{
|
|
||||||
bool progress = false;
|
|
||||||
unsigned i;
|
|
||||||
|
|
||||||
foreach_block (block, &ir->block_list) {
|
|
||||||
for (i = 0; i < block->keeps_count; i++) {
|
|
||||||
struct ir3_instruction *instr = block->keeps[i];
|
|
||||||
progress |= instr_find_neighbors(instr);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* We also need to account for if-condition: */
|
|
||||||
if (block->condition)
|
|
||||||
progress |= instr_find_neighbors(block->condition);
|
|
||||||
}
|
|
||||||
|
|
||||||
return progress;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
ir3_group(struct ir3 *ir)
|
|
||||||
{
|
|
||||||
ir3_clear_mark(ir);
|
|
||||||
return find_neighbors(ir);
|
|
||||||
}
|
|
||||||
184
src/freedreno/ir3/ir3_liveness.c
Normal file
184
src/freedreno/ir3/ir3_liveness.c
Normal file
|
|
@ -0,0 +1,184 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2021 Valve Corporation
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ir3_ra.h"
|
||||||
|
#include "ir3_shader.h"
|
||||||
|
#include "ralloc.h"
|
||||||
|
|
||||||
|
/* A note on how phi node uses are handled:
|
||||||
|
*
|
||||||
|
* - Phi node sources are considered to happen after the end of the
|
||||||
|
* predecessor block, so the live_out for that block contains phi sources.
|
||||||
|
* - On the other hand, phi destinations are considered to happen at the start
|
||||||
|
* of the block, so that live_in does *not* contain phi destinations. This
|
||||||
|
* is mainly because phi destinations and live-through values have to be
|
||||||
|
* treated very differently by RA at the beginning of a block.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static bool
|
||||||
|
compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block,
|
||||||
|
BITSET_WORD *tmp_live, unsigned bitset_words)
|
||||||
|
{
|
||||||
|
memcpy(tmp_live, live->live_out[block->index], bitset_words *
|
||||||
|
sizeof(BITSET_WORD));
|
||||||
|
|
||||||
|
/* Process instructions */
|
||||||
|
foreach_instr_rev (instr, &block->instr_list) {
|
||||||
|
ra_foreach_dst(dst, instr) {
|
||||||
|
if (BITSET_TEST(tmp_live, dst->name))
|
||||||
|
dst->flags &= ~IR3_REG_UNUSED;
|
||||||
|
else
|
||||||
|
dst->flags |= IR3_REG_UNUSED;
|
||||||
|
BITSET_CLEAR(tmp_live, dst->name);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Phi node uses occur after the predecessor block */
|
||||||
|
if (instr->opc != OPC_META_PHI) {
|
||||||
|
ra_foreach_src(src, instr) {
|
||||||
|
if (BITSET_TEST(tmp_live, src->def->name))
|
||||||
|
src->flags &= ~IR3_REG_KILL;
|
||||||
|
else
|
||||||
|
src->flags |= IR3_REG_KILL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ra_foreach_src(src, instr) {
|
||||||
|
if (BITSET_TEST(tmp_live, src->def->name))
|
||||||
|
src->flags &= ~IR3_REG_FIRST_KILL;
|
||||||
|
else
|
||||||
|
src->flags |= IR3_REG_FIRST_KILL;
|
||||||
|
BITSET_SET(tmp_live, src->def->name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(live->live_in[block->index], tmp_live,
|
||||||
|
bitset_words * sizeof(BITSET_WORD));
|
||||||
|
|
||||||
|
bool progress = false;
|
||||||
|
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||||
|
const struct ir3_block *pred = block->predecessors[i];
|
||||||
|
for (unsigned j = 0; j < bitset_words; j++) {
|
||||||
|
if (tmp_live[j] & ~live->live_out[pred->index][j])
|
||||||
|
progress = true;
|
||||||
|
live->live_out[pred->index][j] |= tmp_live[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Process phi sources. */
|
||||||
|
foreach_instr (phi, &block->instr_list) {
|
||||||
|
if (phi->opc != OPC_META_PHI)
|
||||||
|
break;
|
||||||
|
if (!phi->regs[1 + i]->def)
|
||||||
|
continue;
|
||||||
|
unsigned name = phi->regs[1 + i]->def->name;
|
||||||
|
if (!BITSET_TEST(live->live_out[pred->index], name)) {
|
||||||
|
progress = true;
|
||||||
|
BITSET_SET(live->live_out[pred->index], name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
|
||||||
|
{
|
||||||
|
struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
|
||||||
|
|
||||||
|
/* Reserve name 0 to mean "doesn't have a name yet" to make the debug
|
||||||
|
* output nicer.
|
||||||
|
*/
|
||||||
|
array_insert(live, live->definitions, NULL);
|
||||||
|
|
||||||
|
/* Build definition <-> name mapping */
|
||||||
|
unsigned block_count = 0;
|
||||||
|
foreach_block (block, &v->ir->block_list) {
|
||||||
|
block->index = block_count++;
|
||||||
|
foreach_instr (instr, &block->instr_list) {
|
||||||
|
ra_foreach_dst(dst, instr) {
|
||||||
|
dst->name = live->definitions_count;
|
||||||
|
array_insert(live, live->definitions, dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
live->block_count = block_count;
|
||||||
|
|
||||||
|
unsigned bitset_words = BITSET_WORDS(live->definitions_count);
|
||||||
|
BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
|
||||||
|
live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
|
||||||
|
live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
|
||||||
|
unsigned i = 0;
|
||||||
|
foreach_block (block, &v->ir->block_list) {
|
||||||
|
block->index = i++;
|
||||||
|
live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
|
||||||
|
live->live_out[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool progress = true;
|
||||||
|
while (progress) {
|
||||||
|
progress = false;
|
||||||
|
foreach_block_rev (block, &v->ir->block_list) {
|
||||||
|
progress |=
|
||||||
|
compute_block_liveness(live, block, tmp_live, bitset_words);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return live;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return true if "def" is live after "instr". It's assumed that "def"
|
||||||
|
* dominates "instr".
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
|
||||||
|
struct ir3_instruction *instr)
|
||||||
|
{
|
||||||
|
/* If it's live out then it's definitely live at the instruction. */
|
||||||
|
if (BITSET_TEST(live->live_out[instr->block->index], def->name))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
/* If it's not live in and not defined in the same block then the live
|
||||||
|
* range can't extend to the instruction.
|
||||||
|
*/
|
||||||
|
if (def->instr->block != instr->block &&
|
||||||
|
!BITSET_TEST(live->live_in[instr->block->index], def->name))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Ok, now comes the tricky case, where "def" is killed somewhere in
|
||||||
|
* "instr"'s block and we have to check if it's before or after.
|
||||||
|
*/
|
||||||
|
foreach_instr_rev (test_instr, &instr->block->instr_list) {
|
||||||
|
if (test_instr == instr)
|
||||||
|
break;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < test_instr->regs_count; i++) {
|
||||||
|
if (test_instr->regs[i]->flags & IR3_REG_DEST)
|
||||||
|
continue;
|
||||||
|
if (test_instr->regs[i]->def == def)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
514
src/freedreno/ir3/ir3_lower_parallelcopy.c
Normal file
514
src/freedreno/ir3/ir3_lower_parallelcopy.c
Normal file
|
|
@ -0,0 +1,514 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2021 Valve Corporation
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ir3_ra.h"
|
||||||
|
#include "ir3_shader.h"
|
||||||
|
|
||||||
|
struct copy_src {
|
||||||
|
unsigned flags;
|
||||||
|
union {
|
||||||
|
uint32_t imm;
|
||||||
|
physreg_t reg;
|
||||||
|
unsigned const_num;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
struct copy_entry {
|
||||||
|
physreg_t dst;
|
||||||
|
unsigned flags;
|
||||||
|
bool done;
|
||||||
|
|
||||||
|
struct copy_src src;
|
||||||
|
};
|
||||||
|
|
||||||
|
static unsigned
|
||||||
|
copy_entry_size(const struct copy_entry *entry)
|
||||||
|
{
|
||||||
|
return (entry->flags & IR3_REG_HALF) ? 1 : 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct copy_src
|
||||||
|
get_copy_src(const struct ir3_register *reg, unsigned offset)
|
||||||
|
{
|
||||||
|
if (reg->flags & IR3_REG_IMMED) {
|
||||||
|
return (struct copy_src) {
|
||||||
|
.flags = IR3_REG_IMMED,
|
||||||
|
.imm = reg->uim_val,
|
||||||
|
};
|
||||||
|
} else if (reg->flags & IR3_REG_CONST) {
|
||||||
|
return (struct copy_src) {
|
||||||
|
.flags = IR3_REG_CONST,
|
||||||
|
.const_num = reg->num,
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return (struct copy_src) {
|
||||||
|
.flags = 0,
|
||||||
|
.reg = ra_reg_get_physreg(reg) + offset,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsigned src2_num, unsigned flags)
|
||||||
|
{
|
||||||
|
struct ir3_instruction *xor = ir3_instr_create(instr->block, OPC_XOR_B, 3);
|
||||||
|
struct ir3_register *dst = ir3_reg_create(xor, dst_num, flags | IR3_REG_DEST);
|
||||||
|
dst->wrmask = 1;
|
||||||
|
struct ir3_register *src1 = ir3_reg_create(xor, src1_num, flags);
|
||||||
|
src1->wrmask = 1;
|
||||||
|
struct ir3_register *src2 = ir3_reg_create(xor, src2_num, flags);
|
||||||
|
src2->wrmask = 1;
|
||||||
|
|
||||||
|
ir3_instr_move_before(xor, instr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
|
||||||
|
{
|
||||||
|
assert(!entry->src.flags);
|
||||||
|
/* TODO implement shared swaps */
|
||||||
|
assert(!(entry->flags & IR3_REG_SHARED));
|
||||||
|
|
||||||
|
if (entry->flags & IR3_REG_HALF) {
|
||||||
|
/* We currently make sure to never emit parallel copies where the
|
||||||
|
* source/destination is a half-reg above the range accessable to half
|
||||||
|
* registers. However, when a full-reg source overlaps a half-reg
|
||||||
|
* destination or vice versa, it can be very, very complicated to come
|
||||||
|
* up with a series of "legal" swaps and copies to resolve the
|
||||||
|
* parallel copy. So here we provide a fallback to implement the
|
||||||
|
* "illegal" swap instead. This may also be useful for implementing
|
||||||
|
* "spilling" half-regs to the inaccessable space.
|
||||||
|
*/
|
||||||
|
if (entry->src.reg >= RA_HALF_SIZE) {
|
||||||
|
/* Choose a temporary that doesn't overlap src or dst */
|
||||||
|
physreg_t tmp = entry->dst < 2 ? 2 : 0;
|
||||||
|
|
||||||
|
/* Swap src and the temporary */
|
||||||
|
do_swap(instr, &(struct copy_entry) {
|
||||||
|
.src = { .reg = entry->src.reg & ~1u },
|
||||||
|
.dst = tmp,
|
||||||
|
.flags = entry->flags & ~IR3_REG_HALF,
|
||||||
|
});
|
||||||
|
|
||||||
|
/* Do the original swap with src replaced with tmp */
|
||||||
|
do_swap(instr, &(struct copy_entry) {
|
||||||
|
.src = { .reg = tmp + (entry->src.reg & 1) },
|
||||||
|
.dst = entry->dst,
|
||||||
|
.flags = entry->flags,
|
||||||
|
});
|
||||||
|
|
||||||
|
/* Swap src and the temporary back */
|
||||||
|
do_swap(instr, &(struct copy_entry) {
|
||||||
|
.src = { .reg = entry->src.reg & ~1u },
|
||||||
|
.dst = tmp,
|
||||||
|
.flags = entry->flags & ~IR3_REG_HALF,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If dst is not addressable, we only need to swap the arguments and
|
||||||
|
* let the case above handle it.
|
||||||
|
*/
|
||||||
|
if (entry->dst >= RA_HALF_SIZE) {
|
||||||
|
do_swap(instr, &(struct copy_entry) {
|
||||||
|
.src = { .reg = entry->dst },
|
||||||
|
.dst = entry->src.reg,
|
||||||
|
.flags = entry->flags,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||||
|
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||||
|
|
||||||
|
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||||
|
do_xor(instr, src_num, src_num, dst_num, entry->flags);
|
||||||
|
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
|
||||||
|
{
|
||||||
|
/* TODO implement shared copies */
|
||||||
|
assert(!(entry->flags & IR3_REG_SHARED));
|
||||||
|
|
||||||
|
if (entry->flags & IR3_REG_HALF) {
|
||||||
|
/* See do_swap() for why this is here. */
|
||||||
|
if (entry->dst >= RA_HALF_SIZE) {
|
||||||
|
/* TODO: is there a hw instruction we can use for this case? */
|
||||||
|
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
|
||||||
|
|
||||||
|
do_swap(instr, &(struct copy_entry) {
|
||||||
|
.src = { .reg = entry->dst & ~1u },
|
||||||
|
.dst = tmp,
|
||||||
|
.flags = entry->flags & ~IR3_REG_HALF,
|
||||||
|
});
|
||||||
|
|
||||||
|
do_copy(instr, &(struct copy_entry) {
|
||||||
|
.src = entry->src,
|
||||||
|
.dst = tmp + (entry->dst & 1),
|
||||||
|
.flags = entry->flags,
|
||||||
|
});
|
||||||
|
|
||||||
|
do_swap(instr, &(struct copy_entry) {
|
||||||
|
.src = { .reg = entry->dst & ~1u },
|
||||||
|
.dst = tmp,
|
||||||
|
.flags = entry->flags & ~IR3_REG_HALF,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
|
||||||
|
unsigned src_num =
|
||||||
|
ra_physreg_to_num(entry->src.reg & ~1u, entry->flags & ~IR3_REG_HALF);
|
||||||
|
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||||
|
|
||||||
|
if (entry->src.reg % 2 == 0) {
|
||||||
|
/* cov.u32u16 dst, src */
|
||||||
|
struct ir3_instruction *cov = ir3_instr_create(instr->block, OPC_MOV, 2);
|
||||||
|
ir3_reg_create(cov, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1;
|
||||||
|
ir3_reg_create(cov, src_num, entry->flags & ~IR3_REG_HALF)->wrmask = 1;
|
||||||
|
cov->cat1.dst_type = TYPE_U16;
|
||||||
|
cov->cat1.src_type = TYPE_U32;
|
||||||
|
ir3_instr_move_before(cov, instr);
|
||||||
|
} else {
|
||||||
|
/* shr.b dst, src, h(16) */
|
||||||
|
struct ir3_instruction *shr = ir3_instr_create(instr->block, OPC_SHR_B, 3);
|
||||||
|
ir3_reg_create(shr, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1;
|
||||||
|
ir3_reg_create(shr, src_num, entry->flags & ~IR3_REG_HALF)->wrmask = 1;
|
||||||
|
ir3_reg_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
|
||||||
|
ir3_instr_move_before(shr, instr);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||||
|
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||||
|
|
||||||
|
struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_MOV, 2);
|
||||||
|
ir3_reg_create(mov, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1;
|
||||||
|
ir3_reg_create(mov, src_num, entry->flags | entry->src.flags)->wrmask = 1;
|
||||||
|
mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||||
|
mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||||
|
if (entry->src.flags & IR3_REG_IMMED)
|
||||||
|
mov->regs[1]->uim_val = entry->src.imm;
|
||||||
|
else if (entry->src.flags & IR3_REG_CONST)
|
||||||
|
mov->regs[1]->num = entry->src.const_num;
|
||||||
|
ir3_instr_move_before(mov, instr);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct copy_ctx {
|
||||||
|
/* For each physreg, the number of pending copy entries that use it as a
|
||||||
|
* source. Once this drops to zero, then the physreg is unblocked and can
|
||||||
|
* be moved to.
|
||||||
|
*/
|
||||||
|
unsigned physreg_use_count[RA_MAX_FILE_SIZE];
|
||||||
|
|
||||||
|
/* For each physreg, the pending copy_entry that uses it as a dest. */
|
||||||
|
struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
|
||||||
|
|
||||||
|
struct copy_entry entries[RA_MAX_FILE_SIZE];
|
||||||
|
unsigned entry_count;
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool
|
||||||
|
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
|
||||||
|
{
|
||||||
|
for (unsigned i = 0; i < copy_entry_size(entry); i++) {
|
||||||
|
if (ctx->physreg_use_count[entry->dst + i] != 0)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
|
||||||
|
{
|
||||||
|
assert(!entry->done);
|
||||||
|
assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
|
||||||
|
assert(copy_entry_size(entry) == 2);
|
||||||
|
struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
|
||||||
|
|
||||||
|
new_entry->dst = entry->dst + 1;
|
||||||
|
new_entry->src.flags = entry->src.flags;
|
||||||
|
new_entry->src.reg = entry->src.reg + 1;
|
||||||
|
new_entry->done = false;
|
||||||
|
entry->flags |= IR3_REG_HALF;
|
||||||
|
new_entry->flags = entry->flags;
|
||||||
|
ctx->physreg_dst[entry->dst + 1] = new_entry;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
|
||||||
|
{
|
||||||
|
/* Set up the bookkeeping */
|
||||||
|
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
|
||||||
|
memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||||
|
struct copy_entry *entry = &ctx->entries[i];
|
||||||
|
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||||
|
if (!entry->src.flags)
|
||||||
|
ctx->physreg_use_count[entry->src.reg + j]++;
|
||||||
|
|
||||||
|
/* Copies should not have overlapping destinations. */
|
||||||
|
assert(!ctx->physreg_dst[entry->dst + j]);
|
||||||
|
ctx->physreg_dst[entry->dst + j] = entry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool progress = true;
|
||||||
|
while (progress) {
|
||||||
|
progress = false;
|
||||||
|
|
||||||
|
/* Step 1: resolve paths in the transfer graph. This means finding
|
||||||
|
* copies whose destination aren't blocked by something else and then
|
||||||
|
* emitting them, continuing this process until every copy is blocked
|
||||||
|
* and there are only cycles left.
|
||||||
|
*
|
||||||
|
* TODO: We should note that src is also available in dst to unblock
|
||||||
|
* cycles that src is involved in.
|
||||||
|
*/
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||||
|
struct copy_entry *entry = &ctx->entries[i];
|
||||||
|
if (!entry->done && !entry_blocked(entry, ctx)) {
|
||||||
|
entry->done = true;
|
||||||
|
progress = true;
|
||||||
|
do_copy(instr, entry);
|
||||||
|
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||||
|
if (!entry->src.flags)
|
||||||
|
ctx->physreg_use_count[entry->src.reg + j]--;
|
||||||
|
ctx->physreg_dst[entry->dst + j] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (progress)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Step 2: Find partially blocked copies and split them. In the
|
||||||
|
* mergedregs case, we can 32-bit copies which are only blocked on one
|
||||||
|
* 16-bit half, and splitting them helps get things moving.
|
||||||
|
*
|
||||||
|
* We can skip splitting copies if the source isn't a register,
|
||||||
|
* however, because it does not unblock anything and therefore doesn't
|
||||||
|
* contribute to making forward progress with step 1. These copies
|
||||||
|
* should still be resolved eventually in step 1 because they can't be
|
||||||
|
* part of a cycle.
|
||||||
|
*/
|
||||||
|
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||||
|
struct copy_entry *entry = &ctx->entries[i];
|
||||||
|
if (entry->done || entry->flags & IR3_REG_HALF)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (((ctx->physreg_use_count[entry->dst] == 0 ||
|
||||||
|
ctx->physreg_use_count[entry->dst + 1] == 0)) &&
|
||||||
|
!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
|
||||||
|
split_32bit_copy(ctx, entry);
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Step 3: resolve cycles through swapping.
|
||||||
|
*
|
||||||
|
* At this point, the transfer graph should consist of only cycles.
|
||||||
|
* The reason is that, given any physreg n_1 that's the source of a
|
||||||
|
* remaining entry, it has a destination n_2, which (because every
|
||||||
|
* copy is blocked) is the source of some other copy whose destination
|
||||||
|
* is n_3, and so we can follow the chain until we get a cycle. If we
|
||||||
|
* reached some other node than n_1:
|
||||||
|
*
|
||||||
|
* n_1 -> n_2 -> ... -> n_i
|
||||||
|
* ^ |
|
||||||
|
* |-------------|
|
||||||
|
*
|
||||||
|
* then n_2 would be the destination of 2 copies, which is illegal
|
||||||
|
* (checked above in an assert). So n_1 must be part of a cycle:
|
||||||
|
*
|
||||||
|
* n_1 -> n_2 -> ... -> n_i
|
||||||
|
* ^ |
|
||||||
|
* |---------------------|
|
||||||
|
*
|
||||||
|
* and this must be only cycle n_1 is involved in, because any other
|
||||||
|
* path starting from n_1 would also have to end in n_1, resulting in
|
||||||
|
* a node somewhere along the way being the destination of 2 copies
|
||||||
|
* when the 2 paths merge.
|
||||||
|
*
|
||||||
|
* The way we resolve the cycle is through picking a copy (n_1, n_2)
|
||||||
|
* and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
|
||||||
|
* out of the cycle:
|
||||||
|
*
|
||||||
|
* n_1 -> ... -> n_i
|
||||||
|
* ^ |
|
||||||
|
* |--------------|
|
||||||
|
*
|
||||||
|
* and we can keep repeating this until the cycle is empty.
|
||||||
|
*/
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||||
|
struct copy_entry *entry = &ctx->entries[i];
|
||||||
|
if (entry->done)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
assert(!entry->src.flags);
|
||||||
|
|
||||||
|
/* catch trivial copies */
|
||||||
|
if (entry->dst == entry->src.reg) {
|
||||||
|
entry->done = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
do_swap(instr, entry);
|
||||||
|
|
||||||
|
/* Split any blocking copies whose sources are only partially
|
||||||
|
* contained within our destination.
|
||||||
|
*/
|
||||||
|
if (entry->flags & IR3_REG_HALF) {
|
||||||
|
for (unsigned j = 0; j < ctx->entry_count; j++) {
|
||||||
|
struct copy_entry *blocking = &ctx->entries[j];
|
||||||
|
|
||||||
|
if (blocking->done)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (blocking->src.reg <= entry->dst &&
|
||||||
|
blocking->src.reg + 1 >= entry->dst &&
|
||||||
|
!(blocking->flags & IR3_REG_HALF)) {
|
||||||
|
split_32bit_copy(ctx, blocking);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Update sources of blocking copies.
|
||||||
|
*
|
||||||
|
* Note: at this point, every blocking copy's source should be
|
||||||
|
* contained within our destination.
|
||||||
|
*/
|
||||||
|
for (unsigned j = 0; j < ctx->entry_count; j++) {
|
||||||
|
struct copy_entry *blocking = &ctx->entries[j];
|
||||||
|
if (blocking->src.reg >= entry->dst &&
|
||||||
|
blocking->src.reg < entry->dst + copy_entry_size(entry)) {
|
||||||
|
blocking->src.reg = entry->src.reg + (blocking->src.reg - entry->dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
|
||||||
|
unsigned entry_count, bool mergedregs)
|
||||||
|
{
|
||||||
|
struct copy_ctx ctx;
|
||||||
|
|
||||||
|
if (mergedregs) {
|
||||||
|
/* Half regs and full regs are in the same file, so handle everything
|
||||||
|
* at once.
|
||||||
|
*/
|
||||||
|
memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count);
|
||||||
|
ctx.entry_count = entry_count;
|
||||||
|
_handle_copies(instr, &ctx);
|
||||||
|
} else {
|
||||||
|
/* There may be both half copies and full copies, so we have to split
|
||||||
|
* them up since they don't interfere.
|
||||||
|
*/
|
||||||
|
ctx.entry_count = 0;
|
||||||
|
for (unsigned i = 0; i < entry_count; i++) {
|
||||||
|
if (entries[i].flags & IR3_REG_HALF)
|
||||||
|
ctx.entries[ctx.entry_count++] = entries[i];
|
||||||
|
}
|
||||||
|
_handle_copies(instr, &ctx);
|
||||||
|
|
||||||
|
ctx.entry_count = 0;
|
||||||
|
for (unsigned i = 0; i < entry_count; i++) {
|
||||||
|
if (!(entries[i].flags & IR3_REG_HALF))
|
||||||
|
ctx.entries[ctx.entry_count++] = entries[i];
|
||||||
|
}
|
||||||
|
_handle_copies(instr, &ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ir3_lower_copies(struct ir3_shader_variant *v)
|
||||||
|
{
|
||||||
|
DECLARE_ARRAY(struct copy_entry, copies);
|
||||||
|
copies_count = copies_sz = 0;
|
||||||
|
copies = NULL;
|
||||||
|
|
||||||
|
foreach_block (block, &v->ir->block_list) {
|
||||||
|
foreach_instr_safe (instr, &block->instr_list) {
|
||||||
|
if (instr->opc == OPC_META_PARALLEL_COPY) {
|
||||||
|
copies_count = 0;
|
||||||
|
for (unsigned i = 0; i < instr->regs_count / 2; i++) {
|
||||||
|
struct ir3_register *dst = instr->regs[i];
|
||||||
|
struct ir3_register *src = instr->regs[i + instr->regs_count / 2];
|
||||||
|
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||||
|
for (unsigned j = 0; j < reg_elems(dst); j++) {
|
||||||
|
array_insert(NULL, copies, (struct copy_entry) {
|
||||||
|
.dst = ra_num_to_physreg(dst->num + j, flags),
|
||||||
|
.src = get_copy_src(src, j * reg_elem_size(dst)),
|
||||||
|
.flags = flags,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
handle_copies(instr, copies, copies_count, v->mergedregs);
|
||||||
|
list_del(&instr->node);
|
||||||
|
} else if (instr->opc == OPC_META_COLLECT) {
|
||||||
|
copies_count = 0;
|
||||||
|
struct ir3_register *dst = instr->regs[0];
|
||||||
|
unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||||
|
for (unsigned i = 1; i < instr->regs_count; i++) {
|
||||||
|
struct ir3_register *src = instr->regs[i];
|
||||||
|
array_insert(NULL, copies, (struct copy_entry) {
|
||||||
|
.dst = ra_num_to_physreg(dst->num + i - 1, flags),
|
||||||
|
.src = get_copy_src(src, 0),
|
||||||
|
.flags = flags,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
handle_copies(instr, copies, copies_count, v->mergedregs);
|
||||||
|
list_del(&instr->node);
|
||||||
|
} else if (instr->opc == OPC_META_SPLIT) {
|
||||||
|
copies_count = 0;
|
||||||
|
struct ir3_register *dst = instr->regs[0];
|
||||||
|
struct ir3_register *src = instr->regs[1];
|
||||||
|
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||||
|
array_insert(NULL, copies, (struct copy_entry) {
|
||||||
|
.dst = ra_reg_get_physreg(dst),
|
||||||
|
.src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
|
||||||
|
.flags = flags,
|
||||||
|
});
|
||||||
|
handle_copies(instr, copies, copies_count, v->mergedregs);
|
||||||
|
list_del(&instr->node);
|
||||||
|
} else if (instr->opc == OPC_META_PHI) {
|
||||||
|
list_del(&instr->node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (copies)
|
||||||
|
ralloc_free(copies);
|
||||||
|
}
|
||||||
|
|
||||||
574
src/freedreno/ir3/ir3_merge_regs.c
Normal file
574
src/freedreno/ir3/ir3_merge_regs.c
Normal file
|
|
@ -0,0 +1,574 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2021 Valve Corporation
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ir3_ra.h"
|
||||||
|
#include "ir3_compiler.h"
|
||||||
|
#include "ralloc.h"
|
||||||
|
|
||||||
|
/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
|
||||||
|
* of parallelcopy's to trivially turn the program into CSSA form. Then we
|
||||||
|
* try to "merge" SSA def's into "merge sets" which could be allocated to a
|
||||||
|
* single register in order to eliminate copies. First we merge phi nodes,
|
||||||
|
* which should always succeed because of the parallelcopy's we inserted, and
|
||||||
|
* then we try to coalesce the copies we introduced.
|
||||||
|
*
|
||||||
|
* The merged registers are used for three purposes:
|
||||||
|
*
|
||||||
|
* 1. We always use the same pvtmem slot for spilling all SSA defs in each
|
||||||
|
* merge set. This prevents us from having to insert memory-to-memory copies
|
||||||
|
* in the spiller and makes sure we don't insert unecessary copies.
|
||||||
|
* 2. When two values are live at the same time, part of the same merge
|
||||||
|
* set, and they overlap each other in the merge set, they always occupy
|
||||||
|
* overlapping physical registers in RA. This reduces register pressure and
|
||||||
|
* copies in several important scenarios:
|
||||||
|
* - When sources of a collect are used later by something else, we don't
|
||||||
|
* have to introduce copies.
|
||||||
|
* - We can handle sequences of extracts that "explode" a vector into its
|
||||||
|
* components without any additional copying.
|
||||||
|
* 3. We use the merge sets for affinities in register allocation: That is, we
|
||||||
|
* try to allocate all the definitions in the same merge set to the
|
||||||
|
* same/compatible registers. This helps us e.g. allocate sources of a collect
|
||||||
|
* to contiguous registers without too much special code in RA.
|
||||||
|
*
|
||||||
|
* In a "normal" register allocator, or when spilling, we'd just merge
|
||||||
|
* registers in the same merge set to the same register, but with SSA-based
|
||||||
|
* register allocation we may have to split the live interval.
|
||||||
|
*
|
||||||
|
* The implementation is based on "Revisiting Out-of-SSA Translation for
|
||||||
|
* Correctness, CodeQuality, and Efficiency," and is broadly similar to the
|
||||||
|
* implementation in nir_from_ssa, with the twist that we also try to coalesce
|
||||||
|
* META_SPLIT and META_COLLECT. This makes this pass more complicated but
|
||||||
|
* prevents us from needing to handle these specially in RA and the spiller,
|
||||||
|
* which are already complicated enough. This also forces us to implement that
|
||||||
|
* value-comparison optimization they explain, as without it we wouldn't be
|
||||||
|
* able to coalesce META_SPLIT even in the simplest of cases.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* In order to dynamically reconstruct the dominance forest, we need the
|
||||||
|
* instructions ordered by a preorder traversal of the dominance tree:
|
||||||
|
*/
|
||||||
|
|
||||||
|
static unsigned
|
||||||
|
index_instrs(struct ir3_block *block, unsigned index)
|
||||||
|
{
|
||||||
|
foreach_instr (instr, &block->instr_list)
|
||||||
|
instr->ip = index++;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < block->dom_children_count; i++)
|
||||||
|
index = index_instrs(block->dom_children[i], index);
|
||||||
|
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Definitions within a merge set are ordered by instr->ip as set above: */
|
||||||
|
|
||||||
|
static bool
|
||||||
|
def_after(struct ir3_register *a, struct ir3_register *b)
|
||||||
|
{
|
||||||
|
return a->instr->ip > b->instr->ip;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
def_dominates(struct ir3_register *a, struct ir3_register *b)
|
||||||
|
{
|
||||||
|
if (def_after(a, b)) {
|
||||||
|
return false;
|
||||||
|
} else if (a->instr->block == b->instr->block) {
|
||||||
|
return def_after(b, a);
|
||||||
|
} else {
|
||||||
|
return ir3_block_dominates(a->instr->block, b->instr->block);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   /* The defining register this value is a region of. */
   struct ir3_register *reg;
   /* Region within reg, in the same units as reg_size() returns. */
   unsigned offset, size;
};
|
||||||
|
|
||||||
|
/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         /* A split extracts a subregion of its source: translate our offset
          * into the source's coordinate space and keep chasing.
          */
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->regs[1]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         /* We can only chase through a collect when the region is aligned
          * to and fits entirely within a single source element; otherwise
          * the value is stitched together from several sources and has no
          * single origin.
          */
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src = instr->regs[1 + value.offset / reg_elem_size(value.reg)];
         /* Non-SSA sources (e.g. immediates) have no def to chase into. */
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}
|
||||||
|
|
||||||
|
/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   /* The definition belonging to the merge set. */
   struct ir3_register *reg;
   /* Position of reg relative to the start of the merge set. */
   unsigned offset;
};
|
||||||
|
|
||||||
|
/* Check whether the overlapping portion of a and b is provably the same
 * value, in which case the (dominance-based) interference test between them
 * can be skipped. This is the value-comparison optimization from the paper,
 * extended to sub-register regions.
 */
static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value =
      chase_copies((struct def_value) {
            .reg = a->reg,
            .offset = start - a_start,
            .size = end - start,
      });
   struct def_value b_value =
      chase_copies((struct def_value) {
            .reg = b->reg,
            .offset = start - b_start,
            .size = end - start,
      });
   /* Same source register at the same offset => same value. */
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
|
||||||
|
|
||||||
|
static struct ir3_merge_set *
|
||||||
|
get_merge_set(struct ir3_register *def)
|
||||||
|
{
|
||||||
|
if (def->merge_set)
|
||||||
|
return def->merge_set;
|
||||||
|
|
||||||
|
struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
|
||||||
|
set->preferred_reg = ~0;
|
||||||
|
set->interval_start = ~0;
|
||||||
|
set->size = reg_size(def);
|
||||||
|
set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
|
||||||
|
set->regs_count = 1;
|
||||||
|
set->regs = ralloc(set, struct ir3_register *);
|
||||||
|
set->regs[0] = def;
|
||||||
|
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Merges b into a, with b's registers placed b_offset units after the start
 * of a. The combined register list stays sorted by ip (dominance preorder).
 */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b,
                 int b_offset)
{
   /* Normalize so the offset is non-negative by swapping the operands. */
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   /* Merge the two ip-sorted lists, keeping the result sorted, rebasing b's
    * entries by b_offset and repointing everything at set a.
    */
   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only 1 or
    * 2 so far this should be ok.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
|
||||||
|
|
||||||
|
/* Check whether merging b into a (at b_offset) would create interference,
 * i.e. whether two definitions that would overlap in the combined set are
 * simultaneously live with different values. Walks the combined ip-sorted
 * definition list while maintaining a stack of dominating definitions
 * (the dominance-forest technique from the paper).
 */
static bool
merge_sets_interfere(struct ir3_liveness *live,
                     struct ir3_merge_set *a, struct ir3_merge_set *b,
                     int b_offset)
{
   /* Normalize to a non-negative offset by swapping the sets. */
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   /* Stack of defs dominating the current def; worst case holds them all. */
   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   /* Merge-walk both ip-sorted lists, visiting definitions in dominance
    * preorder.
    */
   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      /* Pop stack entries that don't dominate the current def. */
      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack then dom[dom_index]. In the paper there's a
       * description of a way to do less interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
try_merge_defs(struct ir3_liveness *live,
|
||||||
|
struct ir3_register *a, struct ir3_register *b,
|
||||||
|
unsigned b_offset)
|
||||||
|
{
|
||||||
|
struct ir3_merge_set *a_set = get_merge_set(a);
|
||||||
|
struct ir3_merge_set *b_set = get_merge_set(b);
|
||||||
|
|
||||||
|
if (a_set == b_set) {
|
||||||
|
/* Note: Even in this case we may not always successfully be able to
|
||||||
|
* coalesce this copy, if the offsets don't line up. But in any
|
||||||
|
* case, we can't do anything.
|
||||||
|
*/
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
|
||||||
|
|
||||||
|
if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
|
||||||
|
merge_merge_sets(a_set, b_set, b_set_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
coalesce_phi(struct ir3_liveness *live,
|
||||||
|
struct ir3_instruction *phi)
|
||||||
|
{
|
||||||
|
for (unsigned i = 1; i < phi->regs_count; i++) {
|
||||||
|
if (phi->regs[i]->def)
|
||||||
|
try_merge_defs(live, phi->regs[0], phi->regs[i]->def, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
|
||||||
|
struct ir3_instruction *pcopy)
|
||||||
|
{
|
||||||
|
unsigned copies = pcopy->regs_count / 2;
|
||||||
|
for (unsigned i = 0; i < copies; i++) {
|
||||||
|
if (!(pcopy->regs[copies + i]->flags & IR3_REG_SSA))
|
||||||
|
continue;
|
||||||
|
try_merge_defs(live, pcopy->regs[i], pcopy->regs[copies + i]->def, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Coalesce a split destination with its source, placing the destination at
 * the offset of the extracted element within the source vector.
 */
static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   try_merge_defs(live, split->regs[1]->def, split->regs[0],
                  split->split.off * reg_elem_size(split->regs[0]));
}
|
||||||
|
|
||||||
|
/* Coalesce a collect destination with each of its sources, placing each
 * source at its running element offset inside the destination vector.
 */
static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   /* offset tracks the accumulated size of the sources seen so far. */
   for (unsigned i = 1, offset = 0; i < collect->regs_count;
        offset += reg_elem_size(collect->regs[i]), i++) {
      /* Only SSA sources have a def to merge with. */
      if (!(collect->regs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->regs[0], collect->regs[i]->def, offset);
   }
}
|
||||||
|
|
||||||
|
/* For each successor with phis, insert a parallel copy at the end of this
 * block that copies every phi source, then rewrite the phi sources to read
 * the parallel-copy destinations. This isolates phis so that later passes
 * can treat them uniformly.
 */
static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      /* First pass: count the phi sources this edge contributes. Phis are
       * assumed to be at the start of the successor's instruction list.
       */
      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) &&
             !phi->regs[1 + pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      /* Second pass: collect the phi source registers for this edge. */
      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) &&
             !phi->regs[1 + pred_idx]->def)
            continue;
         src[j++] = phi->regs[pred_idx + 1];
      }
      assert(j == phi_count);

      /* Build the parallel copy: phi_count destinations followed by
       * phi_count cloned sources.
       */
      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, 2 * phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         /* Destinations inherit half/array-ness and size from the source. */
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = reg_elems(src[j]);
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->regs[pcopy->regs_count++] = ir3_reg_clone(block->shader, src[j]);
      }

      /* Third pass: repoint each phi source at the matching parallel-copy
       * destination (clearing the DEST flag since it's now read as a src).
       */
      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) &&
             !phi->regs[1 + pred_idx]->def)
            continue;
         phi->regs[1 + pred_idx]->def = pcopy->regs[j];
         phi->regs[1 + pred_idx]->flags = pcopy->regs[j]->flags & ~IR3_REG_DEST;
         j++;
      }
      assert(j == phi_count);
   }
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ir3_create_parallel_copies(struct ir3 *ir)
|
||||||
|
{
|
||||||
|
foreach_block (block, &ir->block_list) {
|
||||||
|
create_parallel_copy(block);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Assign each destination a global [interval_start, interval_end) range:
 * definitions in the same merge set get sub-ranges of one shared interval
 * (reserved when the set's first definition is seen), while unmerged
 * definitions each get their own fresh interval.
 */
static void
index_merge_sets(struct ir3 *ir)
{
   /* Running end of the global interval space allocated so far. */
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->regs_count; i++) {
            struct ir3_register *dst = instr->regs[i];
            if (!(dst->flags & IR3_REG_DEST))
               continue;

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               /* Reserve the whole set's interval on first encounter. */
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }
}
|
||||||
|
|
||||||
|
#define RESET "\x1b[0m"
|
||||||
|
#define BLUE "\x1b[0;34m"
|
||||||
|
#define SYN_SSA(x) BLUE x RESET
|
||||||
|
|
||||||
|
/* Debug dump of all merge sets, printing each set once (tracked via a
 * pointer set) along with its member registers and their offsets.
 */
static void
dump_merge_sets(struct ir3 *ir)
{
   printf("merge sets:\n");
   /* Remembers which sets were already printed. */
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->regs_count; i++) {
            struct ir3_register *dst = instr->regs[i];
            if (!(dst->flags & IR3_REG_DEST))
               continue;

            struct ir3_merge_set *merge_set = dst->merge_set;
            /* Skip unmerged defs and sets we've already dumped. */
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            printf("merge set, size %u, align %u:\n", merge_set->size, merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               printf("\t"SYN_SSA("ssa_%u")":%u, offset %u\n", reg->instr->serialno,
                      reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}
|
||||||
|
|
||||||
|
/* Entry point: coalesce SSA definitions into merge sets ahead of register
 * allocation. Numbers instructions by dominance preorder, coalesces phi
 * webs (mandatory grouping), then aggressively coalesces copies inserted
 * by SSA construction, and finally assigns interval indices to the sets.
 */
void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   /* Set up instr->ip so def_after()/def_dominates() work. */
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         /* Phis are at the start of the block; stop at the first non-phi. */
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   /* Assign interval_start/interval_end based on the final sets. */
   index_merge_sets(ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}
|
||||||
|
|
||||||
|
|
@ -728,23 +728,14 @@ cleanup_self_movs(struct ir3 *ir)
|
||||||
{
|
{
|
||||||
foreach_block (block, &ir->block_list) {
|
foreach_block (block, &ir->block_list) {
|
||||||
foreach_instr_safe (instr, &block->instr_list) {
|
foreach_instr_safe (instr, &block->instr_list) {
|
||||||
|
|
||||||
foreach_src (reg, instr) {
|
|
||||||
if (!reg->def)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (is_self_mov(reg->def->instr)) {
|
|
||||||
list_delinit(®->def->instr->node);
|
|
||||||
reg->def = reg->def->instr->regs[1]->def;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < instr->deps_count; i++) {
|
for (unsigned i = 0; i < instr->deps_count; i++) {
|
||||||
if (instr->deps[i] && is_self_mov(instr->deps[i])) {
|
if (instr->deps[i] && is_self_mov(instr->deps[i])) {
|
||||||
list_delinit(&instr->deps[i]->node);
|
instr->deps[i] = NULL;
|
||||||
instr->deps[i] = instr->deps[i]->regs[1]->def->instr;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (is_self_mov(instr))
|
||||||
|
list_delinit(&instr->node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -165,6 +165,8 @@ static void print_instr_name(struct ir3_instruction *instr, bool flags)
|
||||||
static void print_ssa_def_name(struct ir3_register *reg)
|
static void print_ssa_def_name(struct ir3_register *reg)
|
||||||
{
|
{
|
||||||
printf(SYN_SSA("ssa_%u"), reg->instr->serialno);
|
printf(SYN_SSA("ssa_%u"), reg->instr->serialno);
|
||||||
|
if (reg->name != 0)
|
||||||
|
printf(":%u", reg->name);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_ssa_name(struct ir3_register *reg, bool dst)
|
static void print_ssa_name(struct ir3_register *reg, bool dst)
|
||||||
|
|
@ -177,6 +179,9 @@ static void print_ssa_name(struct ir3_register *reg, bool dst)
|
||||||
} else {
|
} else {
|
||||||
print_ssa_def_name(reg);
|
print_ssa_def_name(reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (reg->num != INVALID_REG)
|
||||||
|
printf("("SYN_REG("r%u.%c")")", reg_num(reg), "xyzw"[reg_comp(reg)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *reg)
|
static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *reg)
|
||||||
|
|
@ -189,6 +194,11 @@ static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *r
|
||||||
else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
|
else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
|
||||||
printf("(abs)");
|
printf("(abs)");
|
||||||
|
|
||||||
|
if (reg->flags & IR3_REG_FIRST_KILL)
|
||||||
|
printf("(kill)");
|
||||||
|
if (reg->flags & IR3_REG_UNUSED)
|
||||||
|
printf("(unused)");
|
||||||
|
|
||||||
if (reg->flags & IR3_REG_R)
|
if (reg->flags & IR3_REG_R)
|
||||||
printf("(r)");
|
printf("(r)");
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
|
* Copyright (C) 2021 Valve Corporation
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
* copy of this software and associated documentation files (the "Software"),
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
|
@ -19,361 +19,282 @@
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
* SOFTWARE.
|
* SOFTWARE.
|
||||||
*
|
|
||||||
* Authors:
|
|
||||||
* Rob Clark <robclark@freedesktop.org>
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef IR3_RA_H_
|
#ifndef _IR3_RA_H
|
||||||
#define IR3_RA_H_
|
#define _IR3_RA_H
|
||||||
|
|
||||||
#include <setjmp.h>
|
#include "ir3.h"
|
||||||
|
#include "ir3_compiler.h"
|
||||||
|
#include "util/rb_tree.h"
|
||||||
|
|
||||||
#include "util/bitset.h"
|
#ifdef DEBUG
|
||||||
|
#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS)
|
||||||
|
#else
|
||||||
|
#define RA_DEBUG 0
|
||||||
|
#endif
|
||||||
|
#define d(fmt, ...) do { if (RA_DEBUG) { \
|
||||||
|
printf("RA: "fmt"\n", ##__VA_ARGS__); \
|
||||||
|
} } while (0)
|
||||||
|
|
||||||
|
#define di(instr, fmt, ...) do { if (RA_DEBUG) { \
|
||||||
|
printf("RA: "fmt": ", ##__VA_ARGS__); \
|
||||||
|
ir3_print_instr(instr); \
|
||||||
|
} } while (0)
|
||||||
|
|
||||||
static const unsigned class_sizes[] = {
|
typedef uint16_t physreg_t;
|
||||||
1, 2, 3, 4,
|
|
||||||
4 + 4, /* txd + 1d/2d */
|
|
||||||
4 + 6, /* txd + 3d */
|
|
||||||
};
|
|
||||||
#define class_count ARRAY_SIZE(class_sizes)
|
|
||||||
|
|
||||||
static const unsigned half_class_sizes[] = {
|
static inline unsigned
|
||||||
1, 2, 3, 4,
|
ra_physreg_to_num(physreg_t physreg, unsigned flags)
|
||||||
};
|
{
|
||||||
#define half_class_count ARRAY_SIZE(half_class_sizes)
|
if (!(flags & IR3_REG_HALF))
|
||||||
|
physreg /= 2;
|
||||||
|
if (flags & IR3_REG_SHARED)
|
||||||
|
physreg += 48 * 4;
|
||||||
|
return physreg;
|
||||||
|
}
|
||||||
|
|
||||||
/* seems to just be used for compute shaders? Seems like vec1 and vec3
|
static inline physreg_t
|
||||||
* are sufficient (for now?)
|
ra_num_to_physreg(unsigned num, unsigned flags)
|
||||||
|
{
|
||||||
|
if (flags & IR3_REG_SHARED)
|
||||||
|
num -= 48 * 4;
|
||||||
|
if (!(flags & IR3_REG_HALF))
|
||||||
|
num *= 2;
|
||||||
|
return num;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline unsigned
|
||||||
|
ra_reg_get_num(const struct ir3_register *reg)
|
||||||
|
{
|
||||||
|
return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline physreg_t
|
||||||
|
ra_reg_get_physreg(const struct ir3_register *reg)
|
||||||
|
{
|
||||||
|
return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool
|
||||||
|
def_is_gpr(const struct ir3_register *reg)
|
||||||
|
{
|
||||||
|
return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Note: don't count undef as a source.
|
||||||
*/
|
*/
|
||||||
static const unsigned shared_class_sizes[] = {
|
static inline bool
|
||||||
1, 3,
|
ra_reg_is_src(const struct ir3_register *reg)
|
||||||
};
|
|
||||||
#define shared_class_count ARRAY_SIZE(shared_class_sizes)
|
|
||||||
|
|
||||||
#define total_class_count (class_count + half_class_count + shared_class_count)
|
|
||||||
|
|
||||||
/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */
|
|
||||||
#define NUM_REGS (4 * 48) /* r0 to r47 */
|
|
||||||
#define NUM_SHARED_REGS (4 * 8) /* r48 to r55 */
|
|
||||||
#define FIRST_SHARED_REG (4 * 48)
|
|
||||||
/* Number of virtual regs in a given class: */
|
|
||||||
|
|
||||||
static inline unsigned CLASS_REGS(unsigned i)
|
|
||||||
{
|
{
|
||||||
assert(i < class_count);
|
return (reg->flags & IR3_REG_SSA) && reg->def &&
|
||||||
|
def_is_gpr(reg->def);
|
||||||
return (NUM_REGS - (class_sizes[i] - 1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned HALF_CLASS_REGS(unsigned i)
|
/* Array destinations can act as a source, reading the previous array and then
|
||||||
|
* modifying it. Return true when the register is an array destination that
|
||||||
|
* acts like a source.
|
||||||
|
*/
|
||||||
|
static inline bool
|
||||||
|
ra_reg_is_array_rmw(const struct ir3_register *reg)
|
||||||
{
|
{
|
||||||
assert(i < half_class_count);
|
return ((reg->flags & IR3_REG_ARRAY) && (reg->flags & IR3_REG_DEST) && reg->def);
|
||||||
|
|
||||||
return (NUM_REGS - (half_class_sizes[i] - 1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned SHARED_CLASS_REGS(unsigned i)
|
static inline bool
|
||||||
|
ra_reg_is_dst(const struct ir3_register *reg)
|
||||||
{
|
{
|
||||||
assert(i < shared_class_count);
|
return (reg->flags & IR3_REG_SSA) && (reg->flags & IR3_REG_DEST) &&
|
||||||
|
def_is_gpr(reg) &&
|
||||||
return (NUM_SHARED_REGS - (shared_class_sizes[i] - 1));
|
((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HALF_OFFSET (class_count)
|
static inline struct ir3_register *
|
||||||
#define SHARED_OFFSET (class_count + half_class_count)
|
ra_dst_get_tied_src(const struct ir3_compiler *compiler, struct ir3_register *dst)
|
||||||
|
|
||||||
/* register-set, created one time, used for all shaders: */
|
|
||||||
struct ir3_ra_reg_set {
|
|
||||||
struct ra_regs *regs;
|
|
||||||
struct ra_class *classes[class_count];
|
|
||||||
struct ra_class *half_classes[half_class_count];
|
|
||||||
struct ra_class *shared_classes[shared_class_count];
|
|
||||||
|
|
||||||
/* pre-fetched tex dst is limited, on current gens to regs
|
|
||||||
* 0x3f and below. An additional register class, with one
|
|
||||||
* vreg, that is setup to conflict with any regs above that
|
|
||||||
* limit.
|
|
||||||
*/
|
|
||||||
struct ra_class *prefetch_exclude_class;
|
|
||||||
unsigned prefetch_exclude_reg;
|
|
||||||
|
|
||||||
/* The virtual register space flattens out all the classes,
|
|
||||||
* starting with full, followed by half and then shared, ie:
|
|
||||||
*
|
|
||||||
* scalar full (starting at zero)
|
|
||||||
* vec2 full
|
|
||||||
* vec3 full
|
|
||||||
* ...
|
|
||||||
* vecN full
|
|
||||||
* scalar half (starting at first_half_reg)
|
|
||||||
* vec2 half
|
|
||||||
* ...
|
|
||||||
* vecN half
|
|
||||||
* scalar shared (starting at first_shared_reg)
|
|
||||||
* ...
|
|
||||||
* vecN shared
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
unsigned first_half_reg, first_shared_reg;
|
|
||||||
|
|
||||||
/* maps flat virtual register space to base gpr: */
|
|
||||||
uint16_t *ra_reg_to_gpr;
|
|
||||||
/* maps cls,gpr to flat virtual register space: */
|
|
||||||
uint16_t **gpr_to_ra_reg;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* additional block-data (per-block) */
|
|
||||||
struct ir3_ra_block_data {
|
|
||||||
BITSET_WORD *def; /* variables defined before used in block */
|
|
||||||
BITSET_WORD *use; /* variables used before defined in block */
|
|
||||||
BITSET_WORD *livein; /* which defs reach entry point of block */
|
|
||||||
BITSET_WORD *liveout; /* which defs reach exit point of block */
|
|
||||||
};
|
|
||||||
|
|
||||||
/* additional instruction-data (per-instruction) */
|
|
||||||
struct ir3_ra_instr_data {
|
|
||||||
/* cached instruction 'definer' info: */
|
|
||||||
struct ir3_instruction *defn;
|
|
||||||
int off, sz, cls;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* register-assign context, per-shader */
|
|
||||||
struct ir3_ra_ctx {
|
|
||||||
struct ir3_shader_variant *v;
|
|
||||||
struct ir3 *ir;
|
|
||||||
|
|
||||||
struct ir3_ra_reg_set *set;
|
|
||||||
struct ra_graph *g;
|
|
||||||
|
|
||||||
/* Are we in the scalar assignment pass? In this pass, all larger-
|
|
||||||
* than-vec1 vales have already been assigned and pre-colored, so
|
|
||||||
* we only consider scalar values.
|
|
||||||
*/
|
|
||||||
bool scalar_pass;
|
|
||||||
|
|
||||||
unsigned alloc_count;
|
|
||||||
unsigned r0_xyz_nodes; /* ra node numbers for r0.[xyz] precolors */
|
|
||||||
unsigned hr0_xyz_nodes; /* ra node numbers for hr0.[xyz] precolors */
|
|
||||||
unsigned prefetch_exclude_node;
|
|
||||||
/* one per class, plus one slot for arrays: */
|
|
||||||
unsigned class_alloc_count[total_class_count + 1];
|
|
||||||
unsigned class_base[total_class_count + 1];
|
|
||||||
unsigned instr_cnt;
|
|
||||||
unsigned *def, *use; /* def/use table */
|
|
||||||
struct ir3_ra_instr_data *instrd;
|
|
||||||
|
|
||||||
/* Mapping vreg name back to instruction, used select reg callback: */
|
|
||||||
struct hash_table *name_to_instr;
|
|
||||||
|
|
||||||
/* Tracking for select_reg callback */
|
|
||||||
unsigned start_search_reg;
|
|
||||||
unsigned max_target;
|
|
||||||
|
|
||||||
/* Temporary buffer for def/use iterators
|
|
||||||
*
|
|
||||||
* The worst case should probably be an array w/ relative access (ie.
|
|
||||||
* all elements are def'd or use'd), and that can't be larger than
|
|
||||||
* the number of registers.
|
|
||||||
*
|
|
||||||
* NOTE we could declare this on the stack if needed, but I don't
|
|
||||||
* think there is a need for nested iterators.
|
|
||||||
*/
|
|
||||||
unsigned namebuf[NUM_REGS];
|
|
||||||
unsigned namecnt, nameidx;
|
|
||||||
|
|
||||||
/* Error handling: */
|
|
||||||
jmp_buf jmp_env;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define ra_assert(ctx, expr) do { \
|
|
||||||
if (!(expr)) { \
|
|
||||||
_debug_printf("RA: %s:%u: %s: Assertion `%s' failed.\n", __FILE__, __LINE__, __func__, #expr); \
|
|
||||||
longjmp((ctx)->jmp_env, -1); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#define ra_unreachable(ctx, str) ra_assert(ctx, !str)
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
|
|
||||||
{
|
{
|
||||||
unsigned name;
|
/* With the a6xx new cat6 encoding, the same register is used for the
|
||||||
debug_assert(id->cls >= 0);
|
* value and destination of atomic operations.
|
||||||
debug_assert(id->cls < total_class_count); /* we shouldn't get arrays here.. */
|
*/
|
||||||
name = ctx->class_base[id->cls] + id->defn->name;
|
if (compiler->gpu_id >= 600 && is_atomic(dst->instr->opc) &&
|
||||||
debug_assert(name < ctx->alloc_count);
|
(dst->instr->flags & IR3_INSTR_G)) {
|
||||||
return name;
|
return dst->instr->regs[3];
|
||||||
}
|
|
||||||
|
|
||||||
/* Get the scalar name of the n'th component of an instruction dst: */
|
|
||||||
static inline int
|
|
||||||
scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n)
|
|
||||||
{
|
|
||||||
if (ctx->scalar_pass) {
|
|
||||||
if (instr->opc == OPC_META_SPLIT) {
|
|
||||||
debug_assert(n == 0); /* split results in a scalar */
|
|
||||||
struct ir3_instruction *src = instr->regs[1]->def->instr;
|
|
||||||
return scalar_name(ctx, src, instr->split.off);
|
|
||||||
} else if (instr->opc == OPC_META_COLLECT) {
|
|
||||||
debug_assert(n < (instr->regs_count + 1));
|
|
||||||
struct ir3_instruction *src = instr->regs[n + 1]->def->instr;
|
|
||||||
return scalar_name(ctx, src, 0);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
debug_assert(n == 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ra_name(ctx, &ctx->instrd[instr->ip]) + n;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define NO_NAME ~0
|
/* Iterators for sources and destinations which:
|
||||||
|
* - Don't include fake sources (irrelevant for RA)
|
||||||
/*
|
* - Don't include non-SSA sources (immediates and constants, also irrelevant)
|
||||||
* Iterators to iterate the vreg names of an instructions def's and use's
|
* - Consider array destinations as both a source and a destination
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static inline unsigned
|
#define ra_foreach_src(__srcreg, __instr) \
|
||||||
__ra_name_cnt(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
|
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
|
||||||
|
for (unsigned __cnt = (__instr)->regs_count, __i = 0; __i < __cnt; __i++) \
|
||||||
|
if (ra_reg_is_src((__srcreg = (__instr)->regs[__i])))
|
||||||
|
|
||||||
|
#define ra_foreach_src_rev(__srcreg, __instr) \
|
||||||
|
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
|
||||||
|
for (int __cnt = (__instr)->regs_count, __i = __cnt - 1; __i >= 0; __i--) \
|
||||||
|
if (ra_reg_is_src((__srcreg = (__instr)->regs[__i])))
|
||||||
|
|
||||||
|
#define ra_foreach_dst(__srcreg, __instr) \
|
||||||
|
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
|
||||||
|
for (unsigned __cnt = (__instr)->regs_count, __i = 0; __i < __cnt; __i++) \
|
||||||
|
if (ra_reg_is_dst((__srcreg = (__instr)->regs[__i])))
|
||||||
|
|
||||||
|
static inline struct ir3_register *
|
||||||
|
ra_src_get_tied_dst(const struct ir3_compiler *compiler,
|
||||||
|
struct ir3_instruction *instr,
|
||||||
|
struct ir3_register *src)
|
||||||
{
|
{
|
||||||
if (!instr)
|
if (compiler->gpu_id >= 600 && is_atomic(instr->opc) &&
|
||||||
return 0;
|
(instr->flags & IR3_INSTR_G) && src == instr->regs[3]) {
|
||||||
|
return instr->regs[0];
|
||||||
|
}
|
||||||
|
|
||||||
/* Filter special cases, ie. writes to a0.x or p0.x, or non-ssa: */
|
return NULL;
|
||||||
if (!writes_gpr(instr) || (instr->regs[0]->flags & IR3_REG_ARRAY))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
/* in scalar pass, we aren't considering virtual register classes, ie.
|
|
||||||
* if an instruction writes a vec2, then it defines two different scalar
|
|
||||||
* register names.
|
|
||||||
*/
|
|
||||||
if (ctx->scalar_pass)
|
|
||||||
return dest_regs(instr);
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define foreach_name_n(__name, __n, __ctx, __instr) \
|
|
||||||
for (unsigned __cnt = __ra_name_cnt(__ctx, __instr), __n = 0, __name; \
|
|
||||||
(__n < __cnt) && ({__name = scalar_name(__ctx, __instr, __n); 1;}); __n++)
|
|
||||||
|
|
||||||
#define foreach_name(__name, __ctx, __instr) \
|
#define RA_HALF_SIZE (4 * 48)
|
||||||
foreach_name_n(__name, __n, __ctx, __instr)
|
#define RA_FULL_SIZE (4 * 48 * 2)
|
||||||
|
#define RA_SHARED_SIZE (2 * 4 * 8)
|
||||||
|
#define RA_MAX_FILE_SIZE RA_FULL_SIZE
|
||||||
|
|
||||||
static inline unsigned
|
struct ir3_liveness {
|
||||||
__ra_itr_pop(struct ir3_ra_ctx *ctx)
|
unsigned block_count;
|
||||||
|
DECLARE_ARRAY(struct ir3_register *, definitions);
|
||||||
|
DECLARE_ARRAY(BITSET_WORD *, live_out);
|
||||||
|
DECLARE_ARRAY(BITSET_WORD *, live_in);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v);
|
||||||
|
|
||||||
|
bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
|
||||||
|
struct ir3_instruction *instr);
|
||||||
|
|
||||||
|
void ir3_create_parallel_copies(struct ir3 *ir);
|
||||||
|
|
||||||
|
void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir);
|
||||||
|
|
||||||
|
struct ir3_pressure {
|
||||||
|
unsigned full, half, shared;
|
||||||
|
};
|
||||||
|
|
||||||
|
void ir3_calc_pressure(struct ir3_shader_variant *v,
|
||||||
|
struct ir3_liveness *live,
|
||||||
|
struct ir3_pressure *max_pressure);
|
||||||
|
|
||||||
|
void ir3_lower_copies(struct ir3_shader_variant *v);
|
||||||
|
|
||||||
|
/* Register interval datastructure
|
||||||
|
*
|
||||||
|
* ir3_reg_ctx is used to track which registers are live. The tricky part is
|
||||||
|
* that some registers may overlap each other, when registers with overlapping
|
||||||
|
* live ranges get coalesced. For example, splits will overlap with their
|
||||||
|
* parent vector and sometimes collect sources will also overlap with the
|
||||||
|
* collect'ed vector. ir3_merge_regs guarantees for us that none of the
|
||||||
|
* registers in a merge set that are live at any given point partially
|
||||||
|
* overlap, which means that we can organize them into a forest. While each
|
||||||
|
* register has a per-merge-set offset, ir3_merge_regs also computes a
|
||||||
|
* "global" offset which allows us to throw away the original merge sets and
|
||||||
|
* think of registers as just intervals in a forest of live intervals. When a
|
||||||
|
* register becomes live, we insert it into the forest, and when it dies we
|
||||||
|
* remove it from the forest (and then its children get moved up a level). We
|
||||||
|
* use red-black trees to keep track of each level of the forest, so insertion
|
||||||
|
* and deletion should be fast operations. ir3_reg_ctx handles all the
|
||||||
|
* internal bookkeeping for this, so that it can be shared between RA,
|
||||||
|
* spilling, and register pressure tracking.
|
||||||
|
*/
|
||||||
|
|
||||||
|
struct ir3_reg_interval {
|
||||||
|
struct rb_node node;
|
||||||
|
|
||||||
|
struct rb_tree children;
|
||||||
|
|
||||||
|
struct ir3_reg_interval *parent;
|
||||||
|
|
||||||
|
struct ir3_register *reg;
|
||||||
|
|
||||||
|
bool inserted;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ir3_reg_ctx {
|
||||||
|
/* The tree of top-level intervals in the forest. */
|
||||||
|
struct rb_tree intervals;
|
||||||
|
|
||||||
|
/* Users of ir3_reg_ctx need to keep around additional state that is
|
||||||
|
* modified when top-level intervals are added or removed. For register
|
||||||
|
* pressure tracking, this is just the register pressure, but for RA we
|
||||||
|
* need to keep track of the physreg of each top-level interval. These
|
||||||
|
* callbacks provide a place to let users deriving from ir3_reg_ctx update
|
||||||
|
* their state when top-level intervals are inserted/removed.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Called when an interval is added and it turns out to be at the top
|
||||||
|
* level.
|
||||||
|
*/
|
||||||
|
void (*interval_add)(struct ir3_reg_ctx *ctx,
|
||||||
|
struct ir3_reg_interval *interval);
|
||||||
|
|
||||||
|
/* Called when an interval is deleted from the top level. */
|
||||||
|
void (*interval_delete)(struct ir3_reg_ctx *ctx,
|
||||||
|
struct ir3_reg_interval *interval);
|
||||||
|
|
||||||
|
/* Called when an interval is deleted and its child becomes top-level.
|
||||||
|
*/
|
||||||
|
void (*interval_readd)(struct ir3_reg_ctx *ctx,
|
||||||
|
struct ir3_reg_interval *parent,
|
||||||
|
struct ir3_reg_interval *child);
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline struct ir3_reg_interval *
|
||||||
|
ir3_rb_node_to_interval(struct rb_node *node)
|
||||||
{
|
{
|
||||||
if (ctx->nameidx < ctx->namecnt)
|
return rb_node_data(struct ir3_reg_interval, node, node);
|
||||||
return ctx->namebuf[ctx->nameidx++];
|
}
|
||||||
return NO_NAME;
|
|
||||||
|
static inline const struct ir3_reg_interval *
|
||||||
|
ir3_rb_node_to_interval_const(const struct rb_node *node)
|
||||||
|
{
|
||||||
|
return rb_node_data(struct ir3_reg_interval, node, node);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline struct ir3_reg_interval *
|
||||||
|
ir3_reg_interval_next(struct ir3_reg_interval *interval)
|
||||||
|
{
|
||||||
|
struct rb_node *next = rb_node_next(&interval->node);
|
||||||
|
return next ? ir3_rb_node_to_interval(next) : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline struct ir3_reg_interval *
|
||||||
|
ir3_reg_interval_next_or_null(struct ir3_reg_interval *interval)
|
||||||
|
{
|
||||||
|
return interval ? ir3_reg_interval_next(interval) : NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
__ra_itr_push(struct ir3_ra_ctx *ctx, unsigned name)
|
ir3_reg_interval_init(struct ir3_reg_interval *interval, struct ir3_register *reg)
|
||||||
{
|
{
|
||||||
assert(ctx->namecnt < ARRAY_SIZE(ctx->namebuf));
|
rb_tree_init(&interval->children);
|
||||||
ctx->namebuf[ctx->namecnt++] = name;
|
interval->reg = reg;
|
||||||
|
interval->parent = NULL;
|
||||||
|
interval->inserted = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned
|
void
|
||||||
__ra_init_def_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
|
ir3_reg_interval_dump(struct ir3_reg_interval *interval);
|
||||||
{
|
|
||||||
/* nested use is not supported: */
|
|
||||||
assert(ctx->namecnt == ctx->nameidx);
|
|
||||||
|
|
||||||
ctx->namecnt = ctx->nameidx = 0;
|
void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
|
||||||
|
struct ir3_reg_interval *interval);
|
||||||
|
|
||||||
if (!writes_gpr(instr))
|
void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
|
||||||
return NO_NAME;
|
struct ir3_reg_interval *interval);
|
||||||
|
|
||||||
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
|
void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
|
||||||
struct ir3_register *dst = instr->regs[0];
|
struct ir3_reg_interval *interval);
|
||||||
|
|
||||||
if (dst->flags & IR3_REG_ARRAY) {
|
#endif
|
||||||
struct ir3_array *arr = ir3_lookup_array(ctx->ir, dst->array.id);
|
|
||||||
|
|
||||||
/* indirect write is treated like a write to all array
|
|
||||||
* elements, since we don't know which one is actually
|
|
||||||
* written:
|
|
||||||
*/
|
|
||||||
if (dst->flags & IR3_REG_RELATIV) {
|
|
||||||
for (unsigned i = 0; i < arr->length; i++) {
|
|
||||||
__ra_itr_push(ctx, arr->base + i);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
__ra_itr_push(ctx, arr->base + dst->array.offset);
|
|
||||||
debug_assert(dst->array.offset < arr->length);
|
|
||||||
}
|
|
||||||
} else if (id->defn == instr) {
|
|
||||||
foreach_name_n (name, i, ctx, instr) {
|
|
||||||
/* tex instructions actually have a wrmask, and
|
|
||||||
* don't touch masked out components. We can't do
|
|
||||||
* anything useful about that in the first pass,
|
|
||||||
* but in the scalar pass we can realize these
|
|
||||||
* registers are available:
|
|
||||||
*/
|
|
||||||
if (ctx->scalar_pass && is_tex_or_prefetch(instr) &&
|
|
||||||
!(instr->regs[0]->wrmask & (1 << i)))
|
|
||||||
continue;
|
|
||||||
__ra_itr_push(ctx, name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return __ra_itr_pop(ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline unsigned
|
|
||||||
__ra_init_use_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
|
|
||||||
{
|
|
||||||
/* nested use is not supported: */
|
|
||||||
assert(ctx->namecnt == ctx->nameidx);
|
|
||||||
|
|
||||||
ctx->namecnt = ctx->nameidx = 0;
|
|
||||||
|
|
||||||
foreach_src (reg, instr) {
|
|
||||||
if (reg->flags & IR3_REG_ARRAY) {
|
|
||||||
struct ir3_array *arr =
|
|
||||||
ir3_lookup_array(ctx->ir, reg->array.id);
|
|
||||||
|
|
||||||
/* indirect read is treated like a read from all array
|
|
||||||
* elements, since we don't know which one is actually
|
|
||||||
* read:
|
|
||||||
*/
|
|
||||||
if (reg->flags & IR3_REG_RELATIV) {
|
|
||||||
for (unsigned i = 0; i < arr->length; i++) {
|
|
||||||
__ra_itr_push(ctx, arr->base + i);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
__ra_itr_push(ctx, arr->base + reg->array.offset);
|
|
||||||
debug_assert(reg->array.offset < arr->length);
|
|
||||||
}
|
|
||||||
} else if (reg->def) {
|
|
||||||
foreach_name_n (name, i, ctx, reg->def->instr) {
|
|
||||||
/* split takes a src w/ wrmask potentially greater
|
|
||||||
* than 0x1, but it really only cares about a single
|
|
||||||
* component. This shows up in splits coming out of
|
|
||||||
* a tex instruction w/ wrmask=.z, for example.
|
|
||||||
*/
|
|
||||||
if (ctx->scalar_pass && (instr->opc == OPC_META_SPLIT) &&
|
|
||||||
!(i == instr->split.off))
|
|
||||||
continue;
|
|
||||||
__ra_itr_push(ctx, name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return __ra_itr_pop(ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define foreach_def(__name, __ctx, __instr) \
|
|
||||||
for (unsigned __name = __ra_init_def_itr(__ctx, __instr); \
|
|
||||||
__name != NO_NAME; __name = __ra_itr_pop(__ctx))
|
|
||||||
|
|
||||||
#define foreach_use(__name, __ctx, __instr) \
|
|
||||||
for (unsigned __name = __ra_init_use_itr(__ctx, __instr); \
|
|
||||||
__name != NO_NAME; __name = __ra_itr_pop(__ctx))
|
|
||||||
|
|
||||||
int ra_size_to_class(unsigned sz, bool half, bool shared);
|
|
||||||
int ra_class_to_size(unsigned class, bool *half, bool *shared);
|
|
||||||
|
|
||||||
#endif /* IR3_RA_H_ */
|
|
||||||
|
|
|
||||||
|
|
@ -1,249 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
* copy of this software and associated documentation files (the "Software"),
|
|
||||||
* to deal in the Software without restriction, including without limitation
|
|
||||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
* and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice (including the next
|
|
||||||
* paragraph) shall be included in all copies or substantial portions of the
|
|
||||||
* Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
||||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*
|
|
||||||
* Authors:
|
|
||||||
* Rob Clark <robclark@freedesktop.org>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "util/u_math.h"
|
|
||||||
#include "util/register_allocate.h"
|
|
||||||
#include "util/ralloc.h"
|
|
||||||
#include "util/bitset.h"
|
|
||||||
|
|
||||||
#include "ir3.h"
|
|
||||||
#include "ir3_compiler.h"
|
|
||||||
#include "ir3_ra.h"
|
|
||||||
|
|
||||||
static void
|
|
||||||
setup_conflicts(struct ir3_ra_reg_set *set)
|
|
||||||
{
|
|
||||||
unsigned reg;
|
|
||||||
|
|
||||||
reg = 0;
|
|
||||||
for (unsigned i = 0; i < class_count; i++) {
|
|
||||||
for (unsigned j = 0; j < CLASS_REGS(i); j++) {
|
|
||||||
for (unsigned br = j; br < j + class_sizes[i]; br++) {
|
|
||||||
ra_add_transitive_reg_conflict(set->regs, br, reg);
|
|
||||||
}
|
|
||||||
|
|
||||||
reg++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < half_class_count; i++) {
|
|
||||||
for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
|
|
||||||
for (unsigned br = j; br < j + half_class_sizes[i]; br++) {
|
|
||||||
ra_add_transitive_reg_conflict(set->regs,
|
|
||||||
br + set->first_half_reg, reg);
|
|
||||||
}
|
|
||||||
|
|
||||||
reg++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < shared_class_count; i++) {
|
|
||||||
for (unsigned j = 0; j < SHARED_CLASS_REGS(i); j++) {
|
|
||||||
for (unsigned br = j; br < j + shared_class_sizes[i]; br++) {
|
|
||||||
ra_add_transitive_reg_conflict(set->regs,
|
|
||||||
br + set->first_shared_reg, reg);
|
|
||||||
}
|
|
||||||
|
|
||||||
reg++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Setup conflicts with registers over 0x3f for the special vreg
|
|
||||||
* that exists to use as interference for tex-prefetch:
|
|
||||||
*/
|
|
||||||
|
|
||||||
for (unsigned i = 0x40; i < CLASS_REGS(0); i++) {
|
|
||||||
ra_add_transitive_reg_conflict(set->regs, i,
|
|
||||||
set->prefetch_exclude_reg);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (unsigned i = 0x40; i < HALF_CLASS_REGS(0); i++) {
|
|
||||||
ra_add_transitive_reg_conflict(set->regs, i + set->first_half_reg,
|
|
||||||
set->prefetch_exclude_reg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* One-time setup of RA register-set, which describes all the possible
|
|
||||||
* "virtual" registers and their interferences. Ie. double register
|
|
||||||
* occupies (and conflicts with) two single registers, and so forth.
|
|
||||||
* Since registers do not need to be aligned to their class size, they
|
|
||||||
* can conflict with other registers in the same class too. Ie:
|
|
||||||
*
|
|
||||||
* Single (base) | Double
|
|
||||||
* --------------+---------------
|
|
||||||
* R0 | D0
|
|
||||||
* R1 | D0 D1
|
|
||||||
* R2 | D1 D2
|
|
||||||
* R3 | D2
|
|
||||||
* .. and so on..
|
|
||||||
*
|
|
||||||
* (NOTE the disassembler uses notation like r0.x/y/z/w but those are
|
|
||||||
* really just four scalar registers. Don't let that confuse you.)
|
|
||||||
*/
|
|
||||||
struct ir3_ra_reg_set *
|
|
||||||
ir3_ra_alloc_reg_set(struct ir3_compiler *compiler, bool mergedregs)
|
|
||||||
{
|
|
||||||
struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
|
|
||||||
unsigned ra_reg_count, reg, base;
|
|
||||||
|
|
||||||
/* calculate # of regs across all classes: */
|
|
||||||
ra_reg_count = 0;
|
|
||||||
for (unsigned i = 0; i < class_count; i++)
|
|
||||||
ra_reg_count += CLASS_REGS(i);
|
|
||||||
for (unsigned i = 0; i < half_class_count; i++)
|
|
||||||
ra_reg_count += HALF_CLASS_REGS(i);
|
|
||||||
for (unsigned i = 0; i < shared_class_count; i++)
|
|
||||||
ra_reg_count += SHARED_CLASS_REGS(i);
|
|
||||||
|
|
||||||
ra_reg_count += 1; /* for tex-prefetch excludes */
|
|
||||||
|
|
||||||
/* allocate the reg-set.. */
|
|
||||||
set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
|
|
||||||
set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
|
|
||||||
set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
|
|
||||||
|
|
||||||
/* .. and classes */
|
|
||||||
reg = 0;
|
|
||||||
for (unsigned i = 0; i < class_count; i++) {
|
|
||||||
set->classes[i] = ra_alloc_reg_class(set->regs);
|
|
||||||
|
|
||||||
set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
|
|
||||||
|
|
||||||
for (unsigned j = 0; j < CLASS_REGS(i); j++) {
|
|
||||||
ra_class_add_reg(set->classes[i], reg);
|
|
||||||
|
|
||||||
set->ra_reg_to_gpr[reg] = j;
|
|
||||||
set->gpr_to_ra_reg[i][j] = reg;
|
|
||||||
|
|
||||||
reg++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
set->first_half_reg = reg;
|
|
||||||
base = HALF_OFFSET;
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < half_class_count; i++) {
|
|
||||||
set->half_classes[i] = ra_alloc_reg_class(set->regs);
|
|
||||||
|
|
||||||
set->gpr_to_ra_reg[base + i] =
|
|
||||||
ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
|
|
||||||
|
|
||||||
for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
|
|
||||||
ra_class_add_reg(set->half_classes[i], reg);
|
|
||||||
|
|
||||||
set->ra_reg_to_gpr[reg] = j;
|
|
||||||
set->gpr_to_ra_reg[base + i][j] = reg;
|
|
||||||
|
|
||||||
reg++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
set->first_shared_reg = reg;
|
|
||||||
base = SHARED_OFFSET;
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < shared_class_count; i++) {
|
|
||||||
set->shared_classes[i] = ra_alloc_reg_class(set->regs);
|
|
||||||
|
|
||||||
set->gpr_to_ra_reg[base + i] =
|
|
||||||
ralloc_array(set, uint16_t, SHARED_CLASS_REGS(i));
|
|
||||||
|
|
||||||
for (unsigned j = 0; j < SHARED_CLASS_REGS(i); j++) {
|
|
||||||
ra_class_add_reg(set->shared_classes[i], reg);
|
|
||||||
|
|
||||||
set->ra_reg_to_gpr[reg] = j;
|
|
||||||
set->gpr_to_ra_reg[base + i][j] = reg;
|
|
||||||
|
|
||||||
reg++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Setup an additional class, with one vreg, to simply conflict
|
|
||||||
* with registers that are too high to encode tex-prefetch. This
|
|
||||||
* vreg is only used to setup additional conflicts so that RA
|
|
||||||
* knows to allocate prefetch dst regs below the limit:
|
|
||||||
*/
|
|
||||||
set->prefetch_exclude_class = ra_alloc_reg_class(set->regs);
|
|
||||||
ra_class_add_reg(set->prefetch_exclude_class, reg);
|
|
||||||
set->prefetch_exclude_reg = reg++;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* And finally setup conflicts. Starting a6xx, half precision regs
|
|
||||||
* conflict w/ full precision regs (when using MERGEDREGS):
|
|
||||||
*/
|
|
||||||
if (mergedregs) {
|
|
||||||
for (unsigned i = 0; i < CLASS_REGS(0) / 2; i++) {
|
|
||||||
unsigned freg = set->gpr_to_ra_reg[0][i];
|
|
||||||
unsigned hreg0 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 0];
|
|
||||||
unsigned hreg1 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 1];
|
|
||||||
|
|
||||||
ra_add_transitive_reg_pair_conflict(set->regs, freg, hreg0, hreg1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
setup_conflicts(set);
|
|
||||||
|
|
||||||
ra_set_finalize(set->regs, NULL);
|
|
||||||
|
|
||||||
return set;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
ra_size_to_class(unsigned sz, bool half, bool shared)
|
|
||||||
{
|
|
||||||
if (shared) {
|
|
||||||
for (unsigned i = 0; i < shared_class_count; i++)
|
|
||||||
if (shared_class_sizes[i] >= sz)
|
|
||||||
return i + SHARED_OFFSET;
|
|
||||||
} else if (half) {
|
|
||||||
for (unsigned i = 0; i < half_class_count; i++)
|
|
||||||
if (half_class_sizes[i] >= sz)
|
|
||||||
return i + HALF_OFFSET;
|
|
||||||
} else {
|
|
||||||
for (unsigned i = 0; i < class_count; i++)
|
|
||||||
if (class_sizes[i] >= sz)
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
debug_assert(0);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
ra_class_to_size(unsigned class, bool *half, bool *shared)
|
|
||||||
{
|
|
||||||
*half = *shared = false;
|
|
||||||
|
|
||||||
if (class >= SHARED_OFFSET) {
|
|
||||||
*shared = true;
|
|
||||||
return shared_class_sizes[class - SHARED_OFFSET];
|
|
||||||
} else if (class >= HALF_OFFSET) {
|
|
||||||
*half = true;
|
|
||||||
return half_class_sizes[class - HALF_OFFSET];
|
|
||||||
} else {
|
|
||||||
return class_sizes[class];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
362
src/freedreno/ir3/ir3_spill.c
Normal file
362
src/freedreno/ir3/ir3_spill.c
Normal file
|
|
@ -0,0 +1,362 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2021 Valve Corporation
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ir3_ra.h"
|
||||||
|
#include "ir3_shader.h"
|
||||||
|
#include "util/rb_tree.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This pass does one thing so far:
|
||||||
|
*
|
||||||
|
* 1. Calculates the maximum register pressure. To do this, we need to use the
|
||||||
|
* exact same technique that RA uses for combining meta_split instructions
|
||||||
|
* with their sources, so that our calculation agrees with RA.
|
||||||
|
*
|
||||||
|
* It will also optionally spill registers once that's implemented.
|
||||||
|
*/
|
||||||
|
|
||||||
|
struct ra_spill_interval {
|
||||||
|
struct ir3_reg_interval interval;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ra_spill_ctx {
|
||||||
|
struct ir3_reg_ctx reg_ctx;
|
||||||
|
|
||||||
|
struct ra_spill_interval *intervals;
|
||||||
|
|
||||||
|
struct ir3_pressure cur_pressure, max_pressure;
|
||||||
|
|
||||||
|
struct ir3_liveness *live;
|
||||||
|
|
||||||
|
const struct ir3_compiler *compiler;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg)
|
||||||
|
{
|
||||||
|
ir3_reg_interval_init(&interval->interval, reg);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ra_pressure_add(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
|
||||||
|
{
|
||||||
|
unsigned size = reg_size(interval->interval.reg);
|
||||||
|
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||||
|
pressure->shared += size;
|
||||||
|
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||||
|
pressure->half += size;
|
||||||
|
else
|
||||||
|
pressure->full += size;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ra_pressure_sub(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
|
||||||
|
{
|
||||||
|
unsigned size = reg_size(interval->interval.reg);
|
||||||
|
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||||
|
pressure->shared -= size;
|
||||||
|
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||||
|
pressure->half -= size;
|
||||||
|
else
|
||||||
|
pressure->full -= size;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ra_spill_interval *
|
||||||
|
ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
|
||||||
|
{
|
||||||
|
return rb_node_data(struct ra_spill_interval, interval, interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ra_spill_ctx *
|
||||||
|
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
|
||||||
|
{
|
||||||
|
return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
|
||||||
|
{
|
||||||
|
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
|
||||||
|
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
|
||||||
|
|
||||||
|
ra_pressure_add(&ctx->cur_pressure, interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
|
||||||
|
{
|
||||||
|
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
|
||||||
|
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
|
||||||
|
|
||||||
|
ra_pressure_sub(&ctx->cur_pressure, interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
|
||||||
|
struct ir3_reg_interval *_child)
|
||||||
|
{
|
||||||
|
interval_add(_ctx, _child);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
spill_ctx_init(struct ra_spill_ctx *ctx)
|
||||||
|
{
|
||||||
|
rb_tree_init(&ctx->reg_ctx.intervals);
|
||||||
|
ctx->reg_ctx.interval_add = interval_add;
|
||||||
|
ctx->reg_ctx.interval_delete = interval_delete;
|
||||||
|
ctx->reg_ctx.interval_readd = interval_readd;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ra_spill_ctx_insert(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
|
||||||
|
{
|
||||||
|
ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ra_spill_ctx_remove(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
|
||||||
|
{
|
||||||
|
ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
|
||||||
|
{
|
||||||
|
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||||
|
ra_spill_interval_init(interval, dst);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
|
||||||
|
{
|
||||||
|
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||||
|
if (interval->interval.inserted)
|
||||||
|
return;
|
||||||
|
|
||||||
|
ra_spill_ctx_insert(ctx, interval);
|
||||||
|
|
||||||
|
/* For precolored inputs, make sure we leave enough registers to allow for
|
||||||
|
* holes in the inputs. It can happen that the binning shader has a lower
|
||||||
|
* register pressure than the main shader, but the main shader decided to
|
||||||
|
* add holes between the inputs which means that the binning shader has a
|
||||||
|
* higher register demand.
|
||||||
|
*/
|
||||||
|
if (dst->instr->opc == OPC_META_INPUT &&
|
||||||
|
dst->num != INVALID_REG) {
|
||||||
|
physreg_t physreg = ra_reg_get_physreg(dst);
|
||||||
|
physreg_t max = physreg + reg_size(dst);
|
||||||
|
|
||||||
|
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||||
|
ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
|
||||||
|
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||||
|
ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
|
||||||
|
else
|
||||||
|
ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
|
||||||
|
{
|
||||||
|
if (!(src->flags & IR3_REG_FIRST_KILL))
|
||||||
|
return;
|
||||||
|
|
||||||
|
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
|
||||||
|
|
||||||
|
if (!interval->interval.inserted || interval->interval.parent ||
|
||||||
|
!rb_tree_is_empty(&interval->interval.children))
|
||||||
|
return;
|
||||||
|
|
||||||
|
ra_spill_ctx_remove(ctx, interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
|
||||||
|
{
|
||||||
|
if (!(src->flags & IR3_REG_FIRST_KILL))
|
||||||
|
return;
|
||||||
|
|
||||||
|
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
|
||||||
|
|
||||||
|
if (!interval->interval.inserted)
|
||||||
|
return;
|
||||||
|
|
||||||
|
ra_spill_ctx_remove(ctx, interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
|
||||||
|
{
|
||||||
|
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||||
|
|
||||||
|
if (!interval->interval.inserted)
|
||||||
|
return;
|
||||||
|
|
||||||
|
ra_spill_ctx_remove(ctx, interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
update_max_pressure(struct ra_spill_ctx *ctx)
|
||||||
|
{
|
||||||
|
d("pressure:");
|
||||||
|
d("\tfull: %u", ctx->cur_pressure.full);
|
||||||
|
d("\thalf: %u", ctx->cur_pressure.half);
|
||||||
|
d("\tshared: %u", ctx->cur_pressure.shared);
|
||||||
|
|
||||||
|
ctx->max_pressure.full =
|
||||||
|
MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
|
||||||
|
ctx->max_pressure.half =
|
||||||
|
MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
|
||||||
|
ctx->max_pressure.shared =
|
||||||
|
MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Account for the register-pressure effect of a single instruction: sources
 * that die here are released, destinations become live, and the running
 * maximum pressure is updated at the points where pressure can peak.
 */
static void
handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
   if (RA_DEBUG) {
      printf("processing: ");
      ir3_print_instr(instr);
   }

   ra_foreach_dst(dst, instr) {
      init_dst(ctx, dst);
   }

   /* Handle tied destinations. If a destination is tied to a source and that
    * source is live-through, then we need to allocate a new register for the
    * destination which is live-through itself and cannot overlap the
    * sources.
    */

   ra_foreach_dst(dst, instr) {
      if (!ra_reg_is_array_rmw(dst)) {
         struct ir3_register *tied_src =
            ra_dst_get_tied_src(ctx->compiler, dst);
         if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
            insert_dst(ctx, dst);
      }
   }

   update_max_pressure(ctx);

   /* Sources killed by this instruction can have their intervals dropped
    * before the destinations are inserted.  remove_src_early() only drops
    * intervals with no parent and no children; the rest are removed in the
    * final loop below, after the destinations are live.
    */
   ra_foreach_src(src, instr) {
      if (src->flags & IR3_REG_FIRST_KILL)
         remove_src_early(ctx, instr, src);
   }


   ra_foreach_dst(dst, instr) {
      insert_dst(ctx, dst);
   }

   /* Pressure peaks here: the destinations are live together with any
    * sources that were not released early.
    */
   update_max_pressure(ctx);

   /* Finally drop the remaining killed sources, and any destination whose
    * value is never read.
    */
   for (unsigned i = 0; i < instr->regs_count; i++) {
      if (ra_reg_is_src(instr->regs[i]) &&
          (instr->regs[i]->flags & IR3_REG_FIRST_KILL))
         remove_src(ctx, instr, instr->regs[i]);
      else if (ra_reg_is_dst(instr->regs[i]) &&
               (instr->regs[i]->flags & IR3_REG_UNUSED))
         remove_dst(ctx, instr->regs[i]);
   }
}
static void
|
||||||
|
handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||||
|
{
|
||||||
|
init_dst(ctx, instr->regs[0]);
|
||||||
|
insert_dst(ctx, instr->regs[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||||
|
{
|
||||||
|
ra_foreach_src(src, instr)
|
||||||
|
remove_src(ctx, instr, src);
|
||||||
|
if (instr->regs[0]->flags & IR3_REG_UNUSED)
|
||||||
|
remove_dst(ctx, instr->regs[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def)
|
||||||
|
{
|
||||||
|
struct ra_spill_interval *interval = &ctx->intervals[def->name];
|
||||||
|
ra_spill_interval_init(interval, def);
|
||||||
|
insert_dst(ctx, def);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Walk one basic block and feed its pressure effects into the context:
 * live-in values are inserted first, then the block-top meta instructions
 * (phis, inputs, tex prefetches), then each regular instruction.
 */
static void
handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   /* Pressure tracking restarts from the live-in set of each block. */
   memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
   rb_tree_init(&ctx->reg_ctx.intervals);

   unsigned name;
   BITSET_FOREACH_SET(name, ctx->live->live_in[block->index],
                      ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_in(ctx, reg);
   }

   /* Phis/inputs/prefetches are grouped at the top of the block (hence the
    * break on the first non-meta instruction); their destinations are all
    * live at block start, so insert them before measuring pressure.
    */
   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
          instr->opc != OPC_META_TEX_PREFETCH)
         break;
      handle_input_phi(ctx, instr);
   }

   update_max_pressure(ctx);

   /* Second walk: release the block-top meta instructions' operands and
    * process every regular instruction in order.
    */
   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
          instr->opc == OPC_META_TEX_PREFETCH)
         remove_input_phi(ctx, instr);
      else
         handle_instr(ctx, instr);
   }
}
void
|
||||||
|
ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
|
||||||
|
struct ir3_pressure *max_pressure)
|
||||||
|
{
|
||||||
|
struct ra_spill_ctx ctx = {};
|
||||||
|
ctx.live = live;
|
||||||
|
ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
|
||||||
|
ctx.compiler = v->shader->compiler;
|
||||||
|
spill_ctx_init(&ctx);
|
||||||
|
|
||||||
|
foreach_block (block, &v->ir->block_list) {
|
||||||
|
handle_block(&ctx, block);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(ctx.cur_pressure.full == 0);
|
||||||
|
assert(ctx.cur_pressure.half == 0);
|
||||||
|
assert(ctx.cur_pressure.shared == 0);
|
||||||
|
|
||||||
|
free(ctx.intervals);
|
||||||
|
|
||||||
|
*max_pressure = ctx.max_pressure;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -81,11 +81,13 @@ libfreedreno_ir3_files = files(
|
||||||
'ir3_delay.c',
|
'ir3_delay.c',
|
||||||
'ir3_dominance.c',
|
'ir3_dominance.c',
|
||||||
'ir3_disk_cache.c',
|
'ir3_disk_cache.c',
|
||||||
'ir3_group.c',
|
|
||||||
'ir3_image.c',
|
'ir3_image.c',
|
||||||
'ir3_image.h',
|
'ir3_image.h',
|
||||||
'ir3.h',
|
'ir3.h',
|
||||||
'ir3_legalize.c',
|
'ir3_legalize.c',
|
||||||
|
'ir3_liveness.c',
|
||||||
|
'ir3_lower_parallelcopy.c',
|
||||||
|
'ir3_merge_regs.c',
|
||||||
'ir3_nir.c',
|
'ir3_nir.c',
|
||||||
'ir3_nir.h',
|
'ir3_nir.h',
|
||||||
'ir3_nir_analyze_ubo_ranges.c',
|
'ir3_nir_analyze_ubo_ranges.c',
|
||||||
|
|
@ -100,10 +102,10 @@ libfreedreno_ir3_files = files(
|
||||||
'ir3_print.c',
|
'ir3_print.c',
|
||||||
'ir3_ra.c',
|
'ir3_ra.c',
|
||||||
'ir3_ra.h',
|
'ir3_ra.h',
|
||||||
'ir3_ra_regset.c',
|
|
||||||
'ir3_sched.c',
|
'ir3_sched.c',
|
||||||
'ir3_shader.c',
|
'ir3_shader.c',
|
||||||
'ir3_shader.h',
|
'ir3_shader.h',
|
||||||
|
'ir3_spill.c',
|
||||||
'ir3_validate.c',
|
'ir3_validate.c',
|
||||||
'regmask.h',
|
'regmask.h',
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -336,6 +336,6 @@ wgl@wgl-multi-context-single-window,Fail
|
||||||
wgl@wgl-multi-window-single-context,Fail
|
wgl@wgl-multi-window-single-context,Fail
|
||||||
wgl@wgl-sanity,Fail
|
wgl@wgl-sanity,Fail
|
||||||
spec@glsl-1.30@execution@clipping@fs-clip-distance-interpolated,Crash
|
spec@glsl-1.30@execution@clipping@fs-clip-distance-interpolated,Crash
|
||||||
spec@glsl-1.30@execution@fs-large-local-array-vec2,Crash
|
spec@glsl-1.30@execution@fs-large-local-array-vec2,Fail
|
||||||
spec@glsl-1.30@execution@fs-large-local-array-vec3,Crash
|
spec@glsl-1.30@execution@fs-large-local-array-vec3,Fail
|
||||||
spec@glsl-1.30@execution@fs-large-local-array-vec4,Crash
|
spec@glsl-1.30@execution@fs-large-local-array-vec4,Fail
|
||||||
|
|
|
||||||
|
|
@ -475,8 +475,6 @@ spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-float-i
|
||||||
spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec2-index-rd,Crash
|
spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec2-index-rd,Crash
|
||||||
spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec3-index-rd,Crash
|
spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec3-index-rd,Crash
|
||||||
spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec4-index-rd,Crash
|
spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec4-index-rd,Crash
|
||||||
spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-vec3-index-wr-before-tcs,Fail
|
|
||||||
spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-vec4-index-wr-before-tcs,Fail
|
|
||||||
spec@arb_tessellation_shader@execution@vertex-partial-write,Crash
|
spec@arb_tessellation_shader@execution@vertex-partial-write,Crash
|
||||||
spec@arb_tessellation_shader@execution@vs-tes-max-in-out-components,Fail
|
spec@arb_tessellation_shader@execution@vs-tes-max-in-out-components,Fail
|
||||||
spec@arb_tessellation_shader@execution@vs-tes-tessinner-tessouter-inputs-quads,Fail
|
spec@arb_tessellation_shader@execution@vs-tes-tessinner-tessouter-inputs-quads,Fail
|
||||||
|
|
@ -507,11 +505,8 @@ spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash
|
||||||
spec@glsl-1.50@execution@geometry@max-input-components,Fail
|
spec@glsl-1.50@execution@geometry@max-input-components,Fail
|
||||||
spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail
|
spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail
|
||||||
spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail
|
spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail
|
||||||
spec@glsl-1.50@execution@variable-indexing@gs-input-array-float-index-rd,Fail
|
|
||||||
spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail
|
spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail
|
||||||
spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail
|
spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail
|
||||||
spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail
|
spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail
|
||||||
spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash
|
spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash
|
||||||
spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec3-index-wr-before-gs,Fail
|
|
||||||
spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail
|
spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail
|
||||||
spec@glsl-es-3.10@execution@cs-image-atomic-if-else-2,Fail
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue