freedreno/ir3: post-RA sched pass
After RA, we can schedule to increase parallelism (reduce nop's) without worrying about increasing register pressure. This pass lets us cut down the instruction count by ~10%, and prioritize bary.f, kill, etc, which would tend to increase register pressure if we tried to do that before RA. It should become more useful once RA round-robins register choices.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>
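To illustrate the effect (a hypothetical sketch with made-up instructions and delay counts, not actual compiler output): a consumer has to sit several delay slots behind its producer, so in source order the gap gets padded with nop's, while post-RA scheduling can fill those slots with independent work instead:

    ; source order, delay slots padded with nop's:
    mul.f r0.x, r1.x, r1.y
    nop
    nop
    nop
    add.f r0.w, r0.x, r1.z
    mul.f r0.y, r2.x, r2.y
    mul.f r0.z, r3.x, r3.y

    ; post-RA scheduled, independent mul's hide the latency:
    mul.f r0.x, r1.x, r1.y
    mul.f r0.y, r2.x, r2.y
    mul.f r0.z, r3.x, r3.y
    nop
    add.f r0.w, r0.x, r1.z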
This commit is contained in:
  parent 3369406e44
  commit 0f78c32492

6 changed files with 679 additions and 5 deletions
@@ -44,6 +44,7 @@ ir3_SOURCES := \
 	ir3/ir3_nir_lower_tex_prefetch.c \
 	ir3/ir3_nir_lower_tg4_to_tex.c \
 	ir3/ir3_nir_move_varying_inputs.c \
+	ir3/ir3_postsched.c \
 	ir3/ir3_print.c \
 	ir3/ir3_ra.c \
 	ir3/ir3_sched.c \

@@ -1138,6 +1138,9 @@ void ir3_sun(struct ir3 *ir);
 void ir3_sched_add_deps(struct ir3 *ir);
 int ir3_sched(struct ir3 *ir);
 
+struct ir3_context;
+int ir3_postsched(struct ir3_context *ctx);
+
 bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
 
 /* register assignment: */

@@ -3481,6 +3481,9 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	ir3_debug_print(ir, "AFTER RA");
 
+	ir3_postsched(ctx);
+	ir3_debug_print(ir, "AFTER POSTSCHED");
+
 	if (compiler->gpu_id >= 600) {
 		if (ir3_a6xx_fixup_atomic_dests(ir, so)) {
 			ir3_debug_print(ir, "AFTER ATOMIC FIXUP");

669	src/freedreno/ir3/ir3_postsched.c (new file)

@@ -0,0 +1,669 @@
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
    printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
    printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
    ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */
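/*
 * Because this runs after register assignment, it can reorder aggressively
 * to hide latency (trimming nop's) without any risk of increasing register
 * pressure, and it can prioritize instructions like bary.f and kill that
 * would have increased pressure had they been hoisted before RA.
 */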

struct ir3_postsched_ctx {
    struct ir3_context *ctx;

    void *mem_ctx;
    struct ir3_block *block;           /* the current block */
    struct dag *dag;

    struct list_head unscheduled_list; /* unscheduled instructions */
    struct ir3_instruction *scheduled; /* last scheduled instr */
    struct ir3_instruction *pred;      /* current p0.x user, if any */

    bool error;
};

struct ir3_postsched_node {
    struct dag_node dag;               /* must be first for util_dynarray_foreach */
    struct ir3_instruction *instr;
    bool partially_evaluated_path;

    unsigned delay;
    unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
    list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
    for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); \
         _m && ({(b) = u_bit_scan(&_m); 1;});)

// TODO deduplicate
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
    return is_sfu(instr) || is_mem(instr);
}
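
/* Emit an instruction into the block's (emptied) instruction list, and
 * prune it from the DAG so its children can become ready:
 */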
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
    debug_assert(ctx->block == instr->block);

    /* maybe there is a better way to handle this than just stuffing
     * a nop.. ideally we'd know about this constraint in the
     * scheduling and depth calculation..
     */
    if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
        ir3_NOP(ctx->block);

    /* remove from unscheduled_list:
     */
    list_delinit(&instr->node);

    if (writes_pred(instr)) {
        ctx->pred = instr;
    }

    di(instr, "schedule");

    list_addtail(&instr->node, &instr->block->instr_list);
    ctx->scheduled = instr;

    struct ir3_postsched_node *n = instr->data;
    dag_prune_head(ctx->dag, &n->dag);
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
    if (!SCHED_DEBUG)
        return;

    foreach_sched_node (n, &ctx->dag->heads) {
        di(n->instr, "maxdel=%3d ", n->max_delay);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
            struct ir3_postsched_node *child =
                (struct ir3_postsched_node *)edge->child;

            di(child->instr, " -> (%d parents) ", child->dag.parent_count);
        }
    }
}
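
/* Instruction selection below is a strict priority cascade.  Within each
 * tier, ties are broken by max_delay, ie. preferring the node furthest
 * from the end of its dependency chain:
 *
 *   1) meta instructions
 *   2) inputs (the last bary.f unlocks varying storage for more warps)
 *   3) kills that are "hard ready" (schedulable without nop padding)
 *   4) sfu/tex that are hard ready
 *   5) anything "soft ready" (ready even counting (sy)/(ss) style delays)
 *   6) anything hard ready
 *   7) failing all that, the leader with the maximum max_delay
 */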
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
    struct ir3_postsched_node *chosen = NULL;

    dump_state(ctx);

    foreach_sched_node (n, &ctx->dag->heads) {
        if (!is_meta(n->instr))
            continue;

        if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "prio: chose (meta)");
        return chosen->instr;
    }

    /* Try to schedule inputs with a higher priority, if possible, as
     * the last bary.f unlocks varying storage to unblock more VS
     * warps.
     */
    foreach_sched_node (n, &ctx->dag->heads) {
        if (!is_input(n->instr))
            continue;

        if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "prio: chose (input)");
        return chosen->instr;
    }

    /* Next prioritize discards: */
    foreach_sched_node (n, &ctx->dag->heads) {
        unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

        if (d > 0)
            continue;

        if (!is_kill(n->instr))
            continue;

        if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "csp: chose (kill, hard ready)");
        return chosen->instr;
    }

    /* Next prioritize expensive instructions: */
    foreach_sched_node (n, &ctx->dag->heads) {
        unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

        if (d > 0)
            continue;

        if (!(is_sfu(n->instr) || is_tex(n->instr)))
            continue;

        if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
        return chosen->instr;
    }

    /* First try to find a ready leader w/ soft delay (ie. including extra
     * delay for things like tex fetch which can be synchronized w/ sync
     * bit (but we probably do want to schedule some other instructions
     * while we wait)
     */
    foreach_sched_node (n, &ctx->dag->heads) {
        unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

        if (d > 0)
            continue;

        if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "csp: chose (soft ready)");
        return chosen->instr;
    }

    /* Next try to find a ready leader that can be scheduled without nop's,
     * which in the case of things that need (sy)/(ss) could result in
     * stalls.. but we've already decided there is not a better option.
     */
    foreach_sched_node (n, &ctx->dag->heads) {
        unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

        if (d > 0)
            continue;

        if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "csp: chose (hard ready)");
        return chosen->instr;
    }

    /* Otherwise choose leader with maximum cost:
     *
     * TODO should we try to balance cost and delays?  I guess it is
     * a balance between now-nop's and future-nop's?
     */
    foreach_sched_node (n, &ctx->dag->heads) {
        if (!chosen || chosen->max_delay < n->max_delay)
            chosen = n;
    }

    if (chosen) {
        di(chosen->instr, "csp: chose (leader)");
        return chosen->instr;
    }

    return NULL;
}

struct ir3_postsched_deps_state {
    struct ir3_context *ctx;

    enum { F, R } direction;

    bool merged;

    /* Track the mapping between sched node (instruction) that last
     * wrote a given register (in whichever direction we are iterating
     * the block)
     *
     * Note, this table is twice as big as the # of regs, to deal with
     * half-precision regs.  The approach differs depending on whether
     * the half and full precision register files are "merged" (conflict,
     * ie. a6xx+) in which case we consider each full precision dep
     * as two half-precision dependencies, vs older separate (non-
     * conflicting) in which case the first half of the table is used
     * for full precision and 2nd half for half-precision.
     */
    struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
    assert((idx) < ARRAY_SIZE((state)->regs)); \
    &(state)->regs[(idx)]; \
    })

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before,
        struct ir3_postsched_node *after)
{
    if (!before || !after)
        return;

    assert(before != after);

    if (state->direction == F) {
        dag_add_edge(&before->dag, &after->dag, NULL);
    } else {
        dag_add_edge(&after->dag, &before->dag, NULL);
    }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *node, unsigned num, bool write)
{
    add_dep(state, dep_reg(state, num), node);
    if (write) {
        dep_reg(state, num) = node;
    }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *node, const struct ir3_register *reg,
        unsigned num, bool write)
{
    if (state->merged) {
        if (reg->flags & IR3_REG_HALF) {
            /* single conflict in half-reg space: */
            add_single_reg_dep(state, node, num, write);
        } else {
            /* two conflicts in half-reg space: */
            add_single_reg_dep(state, node, 2 * num + 0, write);
            add_single_reg_dep(state, node, 2 * num + 1, write);
        }
    } else {
        if (reg->flags & IR3_REG_HALF)
            num += ARRAY_SIZE(state->regs) / 2;
        add_single_reg_dep(state, node, num, write);
    }
}
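
/* Concretely: with a merged register file, full-precision register num
 * occupies half-reg slots 2*num and 2*num+1, so a full-precision access
 * conflicts with half-precision accesses to either slot.  With separate
 * register files, full-precision num uses slot num and half-precision
 * num uses slot num + 256, so the two never alias.
 */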
static void
calculate_deps(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *node)
{
    static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
    struct ir3_register *reg;
    int b;

    /* Add dependencies on instructions that previously (or next,
     * in the reverse direction) wrote any of our src registers:
     */
    foreach_src_n (reg, i, node->instr) {
        /* NOTE: relative access for a src can be either const or gpr: */
        if (reg->flags & IR3_REG_RELATIV) {
            /* also reads a0.x: */
            add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
        }

        if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
            continue;

        if (reg->flags & IR3_REG_RELATIV) {
            /* mark entire array as read: */
            struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
            for (unsigned i = 0; i < arr->length; i++) {
                add_reg_dep(state, node, reg, arr->reg + i, false);
            }
        } else {
            foreach_bit (b, reg->wrmask) {
                add_reg_dep(state, node, reg, reg->num + b, false);

                struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
                if (dep && (state->direction == F)) {
                    unsigned d = ir3_delayslots(dep->instr, node->instr, i);
                    node->delay = MAX2(node->delay, d);
                }
            }
        }
    }

    if (dest_regs(node->instr) == 0)
        return;

    /* And then after we update the state for what this instruction
     * wrote:
     */
    reg = node->instr->regs[0];
    if (reg->flags & IR3_REG_RELATIV) {
        /* mark the entire array as written: */
        struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
        for (unsigned i = 0; i < arr->length; i++) {
            add_reg_dep(state, node, reg, arr->reg + i, true);
        }

        /* also reads a0.x: */
        add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
    } else {
        foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, true);
        }
    }
}
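
/* calculate_deps() runs in both directions.  The forward pass (program
 * order) adds writer -> reader and writer -> writer edges, covering
 * read-after-write and write-after-write hazards.  The reverse pass adds
 * the mirrored edges for write-after-read, so a later write can never be
 * scheduled above an earlier read of the same register.
 */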
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
    struct ir3_postsched_deps_state state = {
        .ctx = ctx->ctx,
        .direction = F,
        .merged = ctx->ctx->compiler->gpu_id >= 600,
    };

    foreach_instr (instr, &ctx->unscheduled_list) {
        calculate_deps(&state, instr->data);
    }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
    struct ir3_postsched_deps_state state = {
        .ctx = ctx->ctx,
        .direction = R,
        .merged = ctx->ctx->compiler->gpu_id >= 600,
    };

    foreach_instr_rev (instr, &ctx->unscheduled_list) {
        calculate_deps(&state, instr->data);
    }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
    struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

    dag_init_node(ctx->dag, &n->dag);

    n->instr = instr;
    instr->data = n;
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
    struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
    uint32_t max_delay = 0;

    util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
        struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
        max_delay = MAX2(child->max_delay, max_delay);
    }

    n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
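
/* max_delay accumulates bottom-up: a node's value is its own delay plus
 * the largest max_delay among its children, roughly the length of the
 * remaining dependency chain.  choose_instr() uses it as the tie-breaker
 * at every priority level.
 */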
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
    ctx->mem_ctx = ralloc_context(NULL);

    ctx->dag = dag_create(ctx->mem_ctx);

    foreach_instr (instr, &ctx->unscheduled_list)
        sched_node_init(ctx, instr);

    calculate_forward_deps(ctx);
    calculate_reverse_deps(ctx);

    /*
     * Normal srcs won't be in SSA at this point, those are dealt with in
     * calculate_forward_deps() and calculate_reverse_deps().  But we still
     * have the false-dep information in SSA form, so go ahead and add
     * dependencies for that here:
     */
    foreach_instr (instr, &ctx->unscheduled_list) {
        struct ir3_postsched_node *n = instr->data;
        struct ir3_instruction *src;

        foreach_ssa_src_n (src, i, instr) {
            if (src->block != instr->block)
                continue;

            /* we can end up with unused false-deps.. just skip them: */
            if (src->flags & IR3_INSTR_UNUSED)
                continue;

            struct ir3_postsched_node *sn = src->data;

            /* don't consider dependencies in other blocks: */
            if (src->block != instr->block)
                continue;

            dag_add_edge(&sn->dag, &n->dag, NULL);
        }
    }

    // TODO do we want to do this after reverse-dependencies?
    dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
    ralloc_free(ctx->mem_ctx);
    ctx->mem_ctx = NULL;
    ctx->dag = NULL;
}
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
    ctx->block = block;
    ctx->scheduled = NULL;
    ctx->pred = NULL;

    /* move all instructions to the unscheduled list, and
     * empty the block's instruction list (to which we will
     * be inserting).
     */
    list_replace(&block->instr_list, &ctx->unscheduled_list);
    list_inithead(&block->instr_list);

    // TODO once we are using post-sched for everything we can
    // just not stick in NOP's prior to post-sched, and drop this.
    // for now keep this, since it makes post-sched optional:
    foreach_instr_safe (instr, &ctx->unscheduled_list) {
        switch (instr->opc) {
        case OPC_NOP:
        case OPC_BR:
        case OPC_JUMP:
            list_delinit(&instr->node);
            break;
        default:
            break;
        }
    }

    sched_dag_init(ctx);

    /* First schedule all meta:input instructions, followed by
     * tex-prefetch.  We want all of the instructions that load
     * values into registers before the shader starts to go
     * before any other instructions.  But in particular we
     * want inputs to come before prefetches.  This is because
     * a FS's bary_ij input may not actually be live in the
     * shader, but it should not be scheduled on top of any
     * other input (but can be overwritten by a tex prefetch)
     */
    foreach_instr_safe (instr, &ctx->unscheduled_list)
        if (instr->opc == OPC_META_INPUT)
            schedule(ctx, instr);

    foreach_instr_safe (instr, &ctx->unscheduled_list)
        if (instr->opc == OPC_META_TEX_PREFETCH)
            schedule(ctx, instr);

    while (!list_is_empty(&ctx->unscheduled_list)) {
        struct ir3_instruction *instr;

        instr = choose_instr(ctx);

        /* this shouldn't happen: */
        if (!instr) {
            ctx->error = true;
            break;
        }

        unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
        d("delay=%u", delay);

        /* and if we run out of instructions that can be scheduled,
         * then it is time for nop's:
         */
        debug_assert(delay <= 6);
        while (delay > 0) {
            ir3_NOP(block);
            delay--;
        }

        schedule(ctx, instr);
    }

    sched_dag_destroy(ctx);
}
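
/* A "self mov" is a same-type mov whose src and dst are the same register,
 * with no flags or modifiers that would make it do real work:
 */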
static bool
is_self_mov(struct ir3_instruction *instr)
{
    if (!is_same_type_mov(instr))
        return false;

    if (instr->regs[0]->num != instr->regs[1]->num)
        return false;

    if (instr->regs[0]->flags & IR3_REG_RELATIV)
        return false;

    if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
            IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
            IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
            IR3_REG_EVEN | IR3_REG_POS_INF))
        return false;

    return true;
}

/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
    foreach_block (block, &ir->block_list) {
        foreach_instr_safe (instr, &block->instr_list) {
            struct ir3_register *reg;

            foreach_src (reg, instr) {
                if (!reg->instr)
                    continue;

                if (is_self_mov(reg->instr)) {
                    list_delinit(&reg->instr->node);
                    reg->instr = reg->instr->regs[1]->instr;
                }
            }

            for (unsigned i = 0; i < instr->deps_count; i++) {
                if (is_self_mov(instr->deps[i])) {
                    list_delinit(&instr->deps[i]->node);
                    instr->deps[i] = instr->deps[i]->regs[1]->instr;
                }
            }
        }
    }
}

int
ir3_postsched(struct ir3_context *cctx)
{
    struct ir3_postsched_ctx ctx = {
        .ctx = cctx,
    };

    ir3_remove_nops(cctx->ir);
    cleanup_self_movs(cctx->ir);

    foreach_block (block, &cctx->ir->block_list) {
        sched_block(&ctx, block);
    }

    if (ctx.error)
        return -1;

    return 0;
}

@@ -385,14 +385,11 @@ size_to_class(unsigned sz, bool half, bool high)
 static bool
 writes_gpr(struct ir3_instruction *instr)
 {
 	if (is_store(instr))
 		return false;
-	if (instr->regs_count == 0)
+	if (dest_regs(instr) == 0)
 		return false;
 	/* is dest a normal temp register: */
 	struct ir3_register *reg = instr->regs[0];
-	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-		return false;
+	debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)));
 	if ((reg->num == regid(REG_A0, 0)) ||
 	    (reg->num == regid(REG_P0, 0)))
 		return false;

@@ -71,6 +71,7 @@ libfreedreno_ir3_files = files(
 	'ir3_nir_lower_tex_prefetch.c',
 	'ir3_nir_lower_tg4_to_tex.c',
 	'ir3_nir_move_varying_inputs.c',
+	'ir3_postsched.c',
 	'ir3_print.c',
 	'ir3_ra.c',
 	'ir3_sched.c',