diff --git a/src/freedreno/ir3/ir3_lower_parallelcopy.c b/src/freedreno/ir3/ir3_lower_parallelcopy.c index 1144fa056c6..10c512e7e01 100644 --- a/src/freedreno/ir3/ir3_lower_parallelcopy.c +++ b/src/freedreno/ir3/ir3_lower_parallelcopy.c @@ -83,7 +83,8 @@ do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsig } static void -do_swap(struct ir3_instruction *instr, const struct copy_entry *entry) +do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr, + const struct copy_entry *entry) { assert(!entry->src.flags); /* TODO implement shared swaps */ @@ -104,21 +105,21 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry) physreg_t tmp = entry->dst < 2 ? 2 : 0; /* Swap src and the temporary */ - do_swap(instr, &(struct copy_entry) { + do_swap(compiler, instr, &(struct copy_entry) { .src = { .reg = entry->src.reg & ~1u }, .dst = tmp, .flags = entry->flags & ~IR3_REG_HALF, }); /* Do the original swap with src replaced with tmp */ - do_swap(instr, &(struct copy_entry) { + do_swap(compiler, instr, &(struct copy_entry) { .src = { .reg = tmp + (entry->src.reg & 1) }, .dst = entry->dst, .flags = entry->flags, }); /* Swap src and the temporary back */ - do_swap(instr, &(struct copy_entry) { + do_swap(compiler, instr, &(struct copy_entry) { .src = { .reg = entry->src.reg & ~1u }, .dst = tmp, .flags = entry->flags & ~IR3_REG_HALF, @@ -130,7 +131,7 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry) * let the case above handle it. */ if (entry->dst >= RA_HALF_SIZE) { - do_swap(instr, &(struct copy_entry) { + do_swap(compiler, instr, &(struct copy_entry) { .src = { .reg = entry->dst }, .dst = entry->src.reg, .flags = entry->flags, @@ -142,13 +143,29 @@ do_swap(struct ir3_instruction *instr, const struct copy_entry *entry) unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags); unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); - do_xor(instr, dst_num, dst_num, src_num, entry->flags); - do_xor(instr, src_num, src_num, dst_num, entry->flags); - do_xor(instr, dst_num, dst_num, src_num, entry->flags); + /* a5xx+ is known to support swz, which enables us to swap two registers + * in-place. If unsupported we emulate it using the xor trick. + */ + if (compiler->gpu_id < 500) { + do_xor(instr, dst_num, dst_num, src_num, entry->flags); + do_xor(instr, src_num, src_num, dst_num, entry->flags); + do_xor(instr, dst_num, dst_num, src_num, entry->flags); + } else { + struct ir3_instruction *swz = ir3_instr_create(instr->block, OPC_SWZ, 2, 2); + ir3_dst_create(swz, dst_num, entry->flags)->wrmask = 1; + ir3_dst_create(swz, src_num, entry->flags)->wrmask = 1; + ir3_src_create(swz, src_num, entry->flags)->wrmask = 1; + ir3_src_create(swz, dst_num, entry->flags)->wrmask = 1; + swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + swz->repeat = 1; + ir3_instr_move_before(swz, instr); + } } static void -do_copy(struct ir3_instruction *instr, const struct copy_entry *entry) +do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr, + const struct copy_entry *entry) { /* TODO implement shared copies */ assert(!(entry->flags & IR3_REG_SHARED)); @@ -159,19 +176,19 @@ do_copy(struct ir3_instruction *instr, const struct copy_entry *entry) /* TODO: is there a hw instruction we can use for this case? */ physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0; - do_swap(instr, &(struct copy_entry) { + do_swap(compiler, instr, &(struct copy_entry) { .src = { .reg = entry->dst & ~1u }, .dst = tmp, .flags = entry->flags & ~IR3_REG_HALF, }); - do_copy(instr, &(struct copy_entry) { + do_copy(compiler, instr, &(struct copy_entry) { .src = entry->src, .dst = tmp + (entry->dst & 1), .flags = entry->flags, }); - do_swap(instr, &(struct copy_entry) { + do_swap(compiler, instr, &(struct copy_entry) { .src = { .reg = entry->dst & ~1u }, .dst = tmp, .flags = entry->flags & ~IR3_REG_HALF, @@ -262,7 +279,8 @@ split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry) } static void -_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx) +_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr, + struct copy_ctx *ctx) { /* Set up the bookkeeping */ memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst)); @@ -298,7 +316,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx) if (!entry->done && !entry_blocked(entry, ctx)) { entry->done = true; progress = true; - do_copy(instr, entry); + do_copy(compiler, instr, entry); for (unsigned j = 0; j < copy_entry_size(entry); j++) { if (!entry->src.flags) ctx->physreg_use_count[entry->src.reg + j]--; @@ -383,7 +401,7 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx) continue; } - do_swap(instr, entry); + do_swap(compiler, instr, entry); /* Split any blocking copies whose sources are only partially * contained within our destination. @@ -419,18 +437,18 @@ _handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx) } static void -handle_copies(struct ir3_instruction *instr, struct copy_entry *entries, - unsigned entry_count, bool mergedregs) +handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr, + struct copy_entry *entries, unsigned entry_count) { struct copy_ctx ctx; - if (mergedregs) { + if (v->mergedregs) { /* Half regs and full regs are in the same file, so handle everything * at once. */ memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count); ctx.entry_count = entry_count; - _handle_copies(instr, &ctx); + _handle_copies(v->shader->compiler, instr, &ctx); } else { /* There may be both half copies and full copies, so we have to split * them up since they don't interfere. @@ -440,14 +458,14 @@ handle_copies(struct ir3_instruction *instr, struct copy_entry *entries, if (entries[i].flags & IR3_REG_HALF) ctx.entries[ctx.entry_count++] = entries[i]; } - _handle_copies(instr, &ctx); + _handle_copies(v->shader->compiler, instr, &ctx); ctx.entry_count = 0; for (unsigned i = 0; i < entry_count; i++) { if (!(entries[i].flags & IR3_REG_HALF)) ctx.entries[ctx.entry_count++] = entries[i]; } - _handle_copies(instr, &ctx); + _handle_copies(v->shader->compiler, instr, &ctx); } } @@ -475,7 +493,7 @@ ir3_lower_copies(struct ir3_shader_variant *v) }); } } - handle_copies(instr, copies, copies_count, v->mergedregs); + handle_copies(v, instr, copies, copies_count); list_del(&instr->node); } else if (instr->opc == OPC_META_COLLECT) { copies_count = 0; @@ -489,7 +507,7 @@ ir3_lower_copies(struct ir3_shader_variant *v) .flags = flags, }); } - handle_copies(instr, copies, copies_count, v->mergedregs); + handle_copies(v, instr, copies, copies_count); list_del(&instr->node); } else if (instr->opc == OPC_META_SPLIT) { copies_count = 0; @@ -501,7 +519,7 @@ ir3_lower_copies(struct ir3_shader_variant *v) .src = get_copy_src(src, instr->split.off * reg_elem_size(dst)), .flags = flags, }); - handle_copies(instr, copies, copies_count, v->mergedregs); + handle_copies(v, instr, copies, copies_count); list_del(&instr->node); } else if (instr->opc == OPC_META_PHI) { list_del(&instr->node);