pan/bi: Lower combines to rewrites for scalars

This avoids unneeded moves for scalars. It still generates suboptimal
code for vectors, however.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4288>
Alyssa Rosenzweig 2020-03-23 16:48:58 -04:00 committed by Marge Bot
parent e0a51d5308
commit 5a3493c536

@@ -28,18 +28,32 @@
*
* v = combine x, y, z, w
*
* These combines need to be lowered by the pass in this file. Fix a given
* source at component c.
*
* First suppose the source is SSA. If it is also scalar, then we may rewrite
* the destination of the generating instruction (unique by SSA+scalar) to
* write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
* (the original component, since the value is scalar). If it is a vector,
* there are two cases: if component c is `x`, so we are accessing v.x, and
* each succeeding component y, z... up to the last component of the vector is
* accessed sequentially, then we may perform the same rewrite. If this is not
* the case, rewriting would require a swizzle or writemask (TODO), so we fall
* back on a move.
*
* Otherwise, if the source is not SSA, we also fall back on a move. We could
* probably do better.
*/
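/* A hypothetical illustration of the scalar case (names invented): given
 *
 *    a = fadd b, c              <- scalar SSA, writes a.x
 *    v = combine q, r, a, s     <- a is fixed at component 2 (.z)
 *
 * the fadd is rewritten to write v.z directly, and every use of a.x is
 * patched to swizzle out .z instead, so no move is emitted for that source.
 * Sources that do not fit the cases above fall back on
 * bi_insert_combine_mov below. */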
static void
bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
unsigned bytes = bits / 8;
bi_instruction move = {
.type = BI_MOV,
.dest = R,
.dest_type = parent->dest_type,
.writemask = ((1 << bytes) - 1) << (bytes * comp),
.src = { parent->src[comp] },
@@ -50,19 +64,139 @@ bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp)
bi_emit_before(ctx, parent, move);
}
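/* For example, assuming a 32-bit dest_type (bytes = 4) and comp = 2, the move
 * built above gets writemask = ((1 << 4) - 1) << (4 * 2) = 0xF00, i.e. it
 * writes only bytes 8-11 of R -- the .z slot of the combine. */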
/* Gets the instruction generating a given source. Combine lowering is
* accidentally O(n^2) right now because this function is O(n) instead of O(1).
* If this pass is slow, this cost can be avoided in favour of better
* bookkeeping. */
static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
{
bi_foreach_instr_global(ctx, ins) {
if (ins->dest == idx)
if ((ins->writemask & mask) == mask)
return ins;
}
return NULL;
}
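/* For example, when the combine reads a 32-bit source at component 1, the
 * caller below passes mask = 0xF << (1 * 4) = 0xF0, so we require the parent
 * to write (at least) bytes 4-7 of idx. */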
/* Rewrites uses of an index. Again, this could be O(n) to the program but is
* currently O(nc) to the program and number of combines, so the pass becomes
* effectively O(n^2). Better bookkeeping would bring this down to linear if
* that's
* an issue. */
static void
bi_rewrite_uses(bi_context *ctx,
unsigned old, unsigned oldc,
unsigned new, unsigned newc)
{
bi_foreach_instr_global(ctx, ins) {
bi_foreach_src(ins, s) {
if (ins->src[s] != old) continue;
for (unsigned i = 0; i < 16; ++i)
ins->swizzle[s][i] += (newc - oldc);
ins->src[s] = new;
}
}
}
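/* Example: rewriting uses of a value previously read at .x (oldc = 0) so they
 * read component 2 of the combined register (newc = 2) adds 2 to each swizzle
 * entry, so a use that swizzled out .x now swizzles out .z. */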
/* Shifts the writemask of an instruction by a specified byte count,
* shifting the source swizzles to compensate. Returns true if successful, and
* returns false if not (nondestructive in this case). */
static bool
bi_shift_mask_scalar(bi_instruction *ins, signed shift)
{
unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
unsigned bytes = sz / 8;
/* If things are misaligned, we bail. Check if shift % bytes is
* nonzero. Note bytes is a power-of-two. */
if (shift & (bytes - 1))
return false;
/* Ensure there are no funny types */
bi_foreach_src(ins, s) {
if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
return false;
}
/* Otherwise, shift is divisible by bytes, and all relevant src types
* are the same size as the dest type. */
ins->writemask <<= shift;
/* Shift swizzle so old i'th component is accessed by new (i + j)'th
* component where j is component shift */
signed component_shift = shift / bytes;
bi_foreach_src(ins, s) {
if (!ins->src[s]) continue;
size_t overlap = sizeof(ins->swizzle[s]) - abs(component_shift);
if (component_shift > 0)
memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
else
memmove(ins->swizzle[s], ins->swizzle[s] - component_shift, overlap);
}
return true;
}
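/* Worked example: for a 32-bit scalar (bytes = 4) shifted by 8 bytes, the
 * alignment check passes (8 & 3 == 0), the writemask goes from 0xF to 0xF00,
 * and component_shift = 2, so each source swizzle moves up by two entries:
 * whatever fed component 0 now feeds component 2. */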
/* Tries to lower a given source of a combine to an appropriate rewrite,
* returning true if successful, and false with no changes otherwise. */
static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R)
{
unsigned src = ins->src[s];
/* We currently only handle SSA */
if (!src) return false;
if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;
/* We are SSA. Lookup the generating instruction. */
unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;
bi_instruction *parent = bi_get_parent(ctx, src,
0xF << (ins->swizzle[s][0] * bytes));
if (!parent) return false;
/* We have a parent instruction; sanity check the type size */
unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
if (pbytes != bytes) return false;
/* Scalar? */
if (parent->writemask != ((1 << bytes) - 1)) return false;
if (!bi_shift_mask_scalar(parent, bytes * s)) return false;
bi_rewrite_uses(ctx, parent->dest, 0, R, s);
parent->dest = R;
return true;
}
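/* Putting the pieces together: for a 32-bit scalar SSA source at slot s = 1,
 * the parent's writemask 0xF passes the scalar check, bi_shift_mask_scalar
 * moves it to 0xF0 so the parent writes the .y slot of R, and bi_rewrite_uses
 * points former readers of the parent's .x at R.y. Anything that fails these
 * checks takes the MOV fallback in bi_lower_combine below. */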
void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
bi_foreach_instr_in_block_safe(block, ins) {
if (ins->type != BI_COMBINE) continue;
unsigned R = bi_make_temp_reg(ctx);
bi_foreach_src(ins, s) {
if (!bi_lower_combine_src(ctx, ins, s, R))
bi_insert_combine_mov(ctx, ins, s, R);
}
bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
bi_remove_instruction(ins);
}
}
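
For reference, a minimal standalone sketch of the writemask/swizzle arithmetic
this pass relies on; the shift_scalar helper and the reduced plain-array types
are invented for illustration and are not part of the patch:

#include <stdio.h>
#include <string.h>

/* Reduced model: writemask bits index destination bytes, swizzle entries
 * index source components, as in bi_shift_mask_scalar above. */
static void
shift_scalar(unsigned *writemask, unsigned char swizzle[16],
             unsigned bytes, unsigned comp_shift)
{
        *writemask <<= (bytes * comp_shift);
        memmove(swizzle + comp_shift, swizzle, 16 - comp_shift);
}

int
main(void)
{
        unsigned writemask = 0xF;          /* 32-bit scalar: bytes 0-3 */
        unsigned char swizzle[16] = { 0 }; /* reads component .x */

        shift_scalar(&writemask, swizzle, 4 /* bytes */, 2 /* to slot .z */);
        printf("writemask = 0x%X, swizzle[2] = %u\n",
               writemask, (unsigned) swizzle[2]);
        /* Prints: writemask = 0xF00, swizzle[2] = 0 (still component .x) */
        return 0;
}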