mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 09:08:10 +02:00
ir3/lower_parallelcopy: Use SWZ
shader-db results on a650:
total instructions in shared programs: 1575484 -> 1574866 (-0.04%)
instructions in affected programs: 32579 -> 31961 (-1.90%)
helped: 75
HURT: 0
helped stats (abs) min: 1 max: 98 x̄: 8.24 x̃: 7
helped stats (rel) min: 0.41% max: 30.12% x̄: 2.47% x̃: 1.13%
95% mean confidence interval for instructions value: -10.97 -5.51
95% mean confidence interval for instructions %-change: -3.44% -1.51%
Instructions are helped.
total nops in shared programs: 355742 -> 355628 (-0.03%)
nops in affected programs: 18635 -> 18521 (-0.61%)
helped: 55
HURT: 147
helped stats (abs) min: 1 max: 14 x̄: 4.76 x̃: 6
helped stats (rel) min: 1.41% max: 100.00% x̄: 8.13% x̃: 4.76%
HURT stats (abs) min: 1 max: 2 x̄: 1.01 x̃: 1
HURT stats (rel) min: 0.56% max: 25.00% x̄: 2.09% x̃: 1.20%
95% mean confidence interval for nops value: -0.98 -0.15
95% mean confidence interval for nops %-change: -1.93% 0.55%
Inconclusive result (%-change mean confidence interval includes 0).
total non-nops in shared programs: 1219742 -> 1219238 (-0.04%)
non-nops in affected programs: 61125 -> 60621 (-0.82%)
helped: 220
HURT: 0
helped stats (abs) min: 1 max: 99 x̄: 2.29 x̃: 1
helped stats (rel) min: 0.19% max: 29.17% x̄: 0.90% x̃: 0.40%
95% mean confidence interval for non-nops value: -3.26 -1.32
95% mean confidence interval for non-nops %-change: -1.24% -0.56%
Non-nops are helped.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11565>
This commit is contained in:
parent
92bb37cb59
commit
42b3d83dd4
1 changed file with 42 additions and 24 deletions
|
|
@@ -83,7 +83,8 @@ do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsig
|
|||
}
|
||||
|
||||
static void
|
||||
do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
|
||||
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
|
||||
const struct copy_entry *entry)
|
||||
{
|
||||
assert(!entry->src.flags);
|
||||
/* TODO implement shared swaps */
|
||||
|
|
@@ -104,21 +105,21 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
|
|||
physreg_t tmp = entry->dst < 2 ? 2 : 0;
|
||||
|
||||
/* Swap src and the temporary */
|
||||
do_swap(instr, &(struct copy_entry) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->src.reg & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
|
||||
/* Do the original swap with src replaced with tmp */
|
||||
do_swap(instr, &(struct copy_entry) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = tmp + (entry->src.reg & 1) },
|
||||
.dst = entry->dst,
|
||||
.flags = entry->flags,
|
||||
});
|
||||
|
||||
/* Swap src and the temporary back */
|
||||
do_swap(instr, &(struct copy_entry) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->src.reg & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
|
|
@@ -130,7 +131,7 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
|
|||
* let the case above handle it.
|
||||
*/
|
||||
if (entry->dst >= RA_HALF_SIZE) {
|
||||
do_swap(instr, &(struct copy_entry) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->dst },
|
||||
.dst = entry->src.reg,
|
||||
.flags = entry->flags,
|
||||
|
|
@@ -142,13 +143,29 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry)
|
|||
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
do_xor(instr, src_num, src_num, dst_num, entry->flags);
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
/* a5xx+ is known to support swz, which enables us to swap two registers
|
||||
* in-place. If unsupported we emulate it using the xor trick.
|
||||
*/
|
||||
if (compiler->gpu_id < 500) {
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
do_xor(instr, src_num, src_num, dst_num, entry->flags);
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
} else {
|
||||
struct ir3_instruction *swz = ir3_instr_create(instr->block, OPC_SWZ, 2, 2);
|
||||
ir3_dst_create(swz, dst_num, entry->flags)->wrmask = 1;
|
||||
ir3_dst_create(swz, src_num, entry->flags)->wrmask = 1;
|
||||
ir3_src_create(swz, src_num, entry->flags)->wrmask = 1;
|
||||
ir3_src_create(swz, dst_num, entry->flags)->wrmask = 1;
|
||||
swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
swz->repeat = 1;
|
||||
ir3_instr_move_before(swz, instr);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
|
||||
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
|
||||
const struct copy_entry *entry)
|
||||
{
|
||||
/* TODO implement shared copies */
|
||||
assert(!(entry->flags & IR3_REG_SHARED));
|
||||
|
|
@@ -159,19 +176,19 @@ do_copy(struct ir3_instruction *instr, const struct copy_entry *entry)
|
|||
/* TODO: is there a hw instruction we can use for this case? */
|
||||
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
|
||||
|
||||
do_swap(instr, &(struct copy_entry) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->dst & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
|
||||
do_copy(instr, &(struct copy_entry) {
|
||||
do_copy(compiler, instr, &(struct copy_entry) {
|
||||
.src = entry->src,
|
||||
.dst = tmp + (entry->dst & 1),
|
||||
.flags = entry->flags,
|
||||
});
|
||||
|
||||
do_swap(instr, &(struct copy_entry) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->dst & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
|
|
@@ -262,7 +279,8 @@ split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
|
|||
}
|
||||
|
||||
static void
|
||||
_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
|
||||
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
|
||||
struct copy_ctx *ctx)
|
||||
{
|
||||
/* Set up the bookkeeping */
|
||||
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
|
||||
|
|
@@ -298,7 +316,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
|
|||
if (!entry->done && !entry_blocked(entry, ctx)) {
|
||||
entry->done = true;
|
||||
progress = true;
|
||||
do_copy(instr, entry);
|
||||
do_copy(compiler, instr, entry);
|
||||
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||
if (!entry->src.flags)
|
||||
ctx->physreg_use_count[entry->src.reg + j]--;
|
||||
|
|
@@ -383,7 +401,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
|
|||
continue;
|
||||
}
|
||||
|
||||
do_swap(instr, entry);
|
||||
do_swap(compiler, instr, entry);
|
||||
|
||||
/* Split any blocking copies whose sources are only partially
|
||||
* contained within our destination.
|
||||
|
|
@@ -419,18 +437,18 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx)
|
|||
}
|
||||
|
||||
static void
|
||||
handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
|
||||
unsigned entry_count, bool mergedregs)
|
||||
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
|
||||
struct copy_entry *entries, unsigned entry_count)
|
||||
{
|
||||
struct copy_ctx ctx;
|
||||
|
||||
if (mergedregs) {
|
||||
if (v->mergedregs) {
|
||||
/* Half regs and full regs are in the same file, so handle everything
|
||||
* at once.
|
||||
*/
|
||||
memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count);
|
||||
ctx.entry_count = entry_count;
|
||||
_handle_copies(instr, &ctx);
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
} else {
|
||||
/* There may be both half copies and full copies, so we have to split
|
||||
* them up since they don't interfere.
|
||||
|
|
@@ -440,14 +458,14 @@ handle_copies(struct ir3_instruction *instr, struct copy_entry *entries,
|
|||
if (entries[i].flags & IR3_REG_HALF)
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(instr, &ctx);
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (!(entries[i].flags & IR3_REG_HALF))
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(instr, &ctx);
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@@ -475,7 +493,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
|
|||
});
|
||||
}
|
||||
}
|
||||
handle_copies(instr, copies, copies_count, v->mergedregs);
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_COLLECT) {
|
||||
copies_count = 0;
|
||||
|
|
@@ -489,7 +507,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
|
|||
.flags = flags,
|
||||
});
|
||||
}
|
||||
handle_copies(instr, copies, copies_count, v->mergedregs);
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_SPLIT) {
|
||||
copies_count = 0;
|
||||
|
|
@@ -501,7 +519,7 @@ ir3_lower_copies(struct ir3_shader_variant *v)
|
|||
.src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
|
||||
.flags = flags,
|
||||
});
|
||||
handle_copies(instr, copies, copies_count, v->mergedregs);
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_PHI) {
|
||||
list_del(&instr->node);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue