pan/bi: Add bytewise copy propagation

This adds a new bytewise copy propagation pass which chews through MKVEC
and SWZ instructions.  The word-based copy propagation pass only existed
to chew through SPLIT/COLLECT but MKVEC is COLLECT for bytes and we had
nothing to help with that.

This is actually two passes in one: Byte propagation and swizzle
propagation. Any time we see a MKVEC, we look at its sources only as
bytes and chase individual bytes back, through other MKVEC and SWZ, to
their generating instruction and make the MKVEC only consume the
original bytes.  If the MKVEC happens to construct something that's just
a swizzle of another def (this is fairly common), we record that as
well. The idea here is that a lot of MKVEC just consume other MKVEC and
we can get rid of the intermediate ones or even the whole chain if it
just ends up being a swizzle in the end.

For SWZ instructions, we first look at them like a MKVEC of the
individual bytes they consume.  If that doesn't yield a single swizzled
word, we then crawl through the words table, just accumulating swizzles.
This gives us the best (closest to the generating instructions) coherent
word.  We could also replace SWZ with MKVEC and just do byte propagation
but MKVEC is often 2 instructions whereas SWZ is often one (or folded
into a source) so this is probably the better balance.

Finally, we not only replace the MKVEC and SWZ instructions but we also
attempt to propagate swizzles into individual ALU op sources.  For v4i8
ops, this often fails since the full generality isn't always available
but for fp16, we can almost always fold the swizzle into the consuming
instruction.

Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40720>
This commit is contained in:
Faith Ekstrand 2026-03-29 04:04:21 -04:00 committed by Marge Bot
parent a4e9002660
commit db8cb73b34

View file

@ -6,6 +6,7 @@
#include "bi_builder.h"
#include "compiler.h"
#include "util/hash_table.h"
/* SSA copy propagation */
@ -20,8 +21,8 @@ bi_reads_fau(bi_instr *ins)
return false;
}
void
bi_opt_copy_prop(bi_context *ctx)
static void
bi_opt_word_prop(bi_context *ctx)
{
/* Chase SPLIT of COLLECT. Instruction selection usually avoids this
* pattern (due to the split cache), but it is inevitably generated by
@ -96,3 +97,367 @@ bi_opt_copy_prop(bi_context *ctx)
free(replacement);
}
/* Return a copy of idx whose swizzle selects a single byte.
 *
 * The swizzle of idx is decoded into four per-byte channels; the channel
 * feeding position `byte` (0..3) is then re-encoded relative to
 * BI_SWIZZLE_B0.  Every other field of idx is preserved.
 */
static bi_index
select_byte(bi_index idx, unsigned byte)
{
   unsigned channels[4] = {0};
   bi_swizzle_to_byte_channels(idx.swizzle, channels);

   bi_index selected = idx;
   selected.swizzle = BI_SWIZZLE_B0 + channels[byte];
   return selected;
}
/* Per-def record kept in the byte propagation hash table. */
struct byte_prop_entry {
/* Whole-word replacement for the def: an immediate, or another index with a
 * swizzle applied.  Left null (zero-initialized) when no such word is known.
 */
bi_index word;
/* Where each of the four bytes of the def comes from.  A slot may be null
 * when the source of that byte is not tracked.
 */
bi_index bytes[4];
};
/* Pack an SSA index into a pointer-sized hash table key.
 *
 * The low 26 bits hold value + 1 and the top 6 bits hold the offset.  The
 * assertions check that both fields fit in their slots.
 */
static void *
byte_prop_ht_key(bi_index idx)
{
   assert(bi_is_ssa(idx));
   assert(idx.value < (1 << 26) - 1);
   assert(idx.offset < (1 << 6));

   /* Zero keys are not allowed, so bias the value by one */
   const uint32_t value_bits = idx.value + 1;
   const uint32_t offset_bits = (uint32_t)idx.offset << 26;

   return (void *)(uintptr_t)(value_bits | offset_bits);
}
/* Allocate a zeroed entry for idx and register it in the table.
 *
 * The entry is allocated with the hash table as its ralloc parent.  idx must
 * not already have an entry (asserted).
 */
static struct byte_prop_entry *
add_byte_prop_entry(struct hash_table *ht, bi_index idx)
{
   void *key = byte_prop_ht_key(idx);
   assert(!_mesa_hash_table_search(ht, key));

   struct byte_prop_entry *fresh = rzalloc(ht, struct byte_prop_entry);
   _mesa_hash_table_insert(ht, key, fresh);
   return fresh;
}
/* Look up the entry recorded for idx.
 *
 * Returns NULL when idx is not an SSA index or when nothing has been recorded
 * for it.
 */
static struct byte_prop_entry *
get_byte_prop_entry(struct hash_table *ht, bi_index idx)
{
   struct hash_entry *found = NULL;

   if (bi_is_ssa(idx))
      found = _mesa_hash_table_search(ht, byte_prop_ht_key(idx));

   return found != NULL ? found->data : NULL;
}
/* Chase every byte of dst_entry back through the table and, if possible,
 * derive a whole-word replacement.
 *
 * Each non-null dst_entry->bytes[i] is followed through the entries of the
 * defs it reads until a byte with no (or an unknown) recorded source is
 * reached.  Afterwards dst_entry->word is set when:
 *  - all four bytes are constants: word becomes the assembled immediate;
 *  - all four bytes come from the same word and the byte selection is a
 *    representable swizzle: word becomes that word with the swizzle.
 * Otherwise dst_entry->word is left untouched.
 *
 * Returns true if at least one byte was moved to an earlier source.
 */
static bool
chase_bytes(struct byte_prop_entry *dst_entry, struct hash_table *ht)
{
   bool progress = false;

   for (unsigned i = 0; i < 4; i++) {
      /* Entries are zero-allocated, so a byte nobody filled in is null.  It
       * has no byte swizzle to decode and nothing to chase.
       */
      if (bi_is_null(dst_entry->bytes[i]))
         continue;

      /* Chase as far as we can, updating dst_entry->bytes[i] as we go */
      while (true) {
         assert(bi_swizzle_replicates_8(dst_entry->bytes[i].swizzle));
         unsigned byte = dst_entry->bytes[i].swizzle - BI_SWIZZLE_B0;

         struct byte_prop_entry *src_entry =
            get_byte_prop_entry(ht, dst_entry->bytes[i]);
         if (src_entry == NULL || bi_is_null(src_entry->bytes[byte]))
            break;

         progress = true;
         dst_entry->bytes[i] = src_entry->bytes[byte];
      }
   }

   unsigned swizzle_bytes[4] = {0, 0, 0, 0};
   bool is_swizzle = true, is_const = true;
   for (unsigned i = 0; i < 4; i++) {
      /* An unknown byte can be part of neither a swizzle nor a constant */
      if (bi_is_null(dst_entry->bytes[i])) {
         is_swizzle = false;
         is_const = false;
         continue;
      }

      swizzle_bytes[i] = dst_entry->bytes[i].swizzle - BI_SWIZZLE_B0;

      if (i > 0 && !bi_is_word_equiv(dst_entry->bytes[i], dst_entry->bytes[0]))
         is_swizzle = false;

      if (dst_entry->bytes[i].type != BI_INDEX_CONSTANT)
         is_const = false;
   }

   enum bi_swizzle swizzle = BI_SWIZZLE_H01;
   if (is_const) {
      /* Assemble the immediate one byte at a time, byte 0 in the low bits */
      uint32_t value = 0;
      for (unsigned i = 0; i < 4; i++) {
         uint8_t byte = bi_apply_swizzle(dst_entry->bytes[i].value,
                                         dst_entry->bytes[i].swizzle);
         value |= (uint32_t)byte << (i * 8);
      }
      dst_entry->word = bi_imm_u32(value);
   } else if (is_swizzle && bi_swizzle_from_byte_channels(swizzle_bytes,
                                                          &swizzle)) {
      dst_entry->word = dst_entry->bytes[0];
      dst_entry->word.swizzle = swizzle;
   }

   return progress;
}
static bool
chase_word(struct byte_prop_entry *dst_entry, struct hash_table *ht)
{
/* Chase the swizzle, updaing dst_entry->word whenever we find a new valid
* swizzled word. For the purposes of this function, src will only be used
* to index into the prop entry table. The actual swizzle is represented
* by src_bytes[], which may or may not be a valid bi_swizzle.
*/
bi_index src = dst_entry->word;
unsigned src_bytes[4];
bi_swizzle_to_byte_channels(src.swizzle, src_bytes);
bool progress = false;
while (true) {
struct byte_prop_entry *src_entry = get_byte_prop_entry(ht, src);
if (src_entry == NULL || bi_is_null(src_entry->word))
return progress;
if (src_entry->word.type == BI_INDEX_CONSTANT) {
assert(src_entry->word.swizzle == BI_SWIZZLE_H01);
const uint32_t src_value = src_entry->word.value;
uint32_t value = 0;
for (unsigned i = 0; i < 4; i++)
value |= ((src_value >> (src_bytes[i] * 8)) & 0xff) << (i * 8);
dst_entry->word = bi_imm_u32(value);
return true;
}
unsigned entry_bytes[4];
bi_swizzle_to_byte_channels(src_entry->word.swizzle, entry_bytes);
for (unsigned i = 0; i < 4; i++)
src_bytes[i] = entry_bytes[src_bytes[i]];
enum bi_swizzle swizzle = BI_SWIZZLE_H01;
if (bi_swizzle_from_byte_channels(src_bytes, &swizzle)) {
progress = true;
dst_entry->word = src_entry->word;
dst_entry->word.swizzle = swizzle;
}
src = src_entry->word;
}
}
/* Try to rewrite each source of I to read directly from the value recorded
 * in the byte propagation table.
 *
 * Only sources that read exactly one register are considered.  A source with
 * a byte-replicating swizzle is replaced by the recorded source of that byte;
 * any other source is replaced by the recorded word, provided the two
 * swizzles compose.  The replacement is dropped if the opcode does not
 * support the resulting swizzle on that source, if a staging source would
 * become something other than a normal index, or if a constant would be
 * introduced into an instruction that already reads FAU.
 */
static void
byte_chase_instr_srcs(bi_context *ctx, bi_instr *I, struct hash_table *ht)
{
   bi_foreach_src(I, s) {
      if (bi_count_read_registers(I, s) != 1)
         continue;

      struct byte_prop_entry *src_entry = get_byte_prop_entry(ht, I->src[s]);
      if (src_entry == NULL)
         continue;

      bi_index repl;
      if (bi_swizzle_replicates_8(I->src[s].swizzle)) {
         unsigned byte = I->src[s].swizzle - BI_SWIZZLE_B0;
         repl = src_entry->bytes[byte];

         /* The source of this byte may not be tracked; never install a null
          * index as an instruction source.
          */
         if (bi_is_null(repl))
            continue;
      } else {
         if (bi_is_null(src_entry->word))
            continue;

         enum bi_swizzle swizzle = BI_SWIZZLE_H01;
         if (!bi_try_compose_swizzles(&swizzle, I->src[s].swizzle,
                                      src_entry->word.swizzle))
            continue;

         repl = src_entry->word;
         repl.swizzle = swizzle;
      }

      if (!bi_op_supports_swizzle(I->op, s, repl.swizzle, ctx->arch))
         continue;

      if (bi_is_staging_src(I, s) && repl.type != BI_INDEX_NORMAL)
         continue;

      if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(I))
         continue;

      /* bi_replace_src keeps the swizzle and other modifiers of the original
       * source and discards the replacement's.  Install the new swizzle on
       * the existing source first, then pass the replacement with a plain
       * swizzle.
       */
      I->src[s].swizzle = repl.swizzle;
      repl.swizzle = BI_SWIZZLE_H01;
      bi_replace_src(I, s, repl);
   }
}
/* Record a MOV/SWZ instruction in the table and try to retarget it.
 *
 * The instruction is first treated as a vector of the four bytes it reads and
 * chased byte by byte.  If that does not produce a word, the word table is
 * chased from the original source instead, and only the result of that second
 * chase decides whether progress was made.  On success the source is replaced
 * by the recorded word and the opcode is picked from its swizzle.
 *
 * Register sources are never recorded or rewritten.
 *
 * Returns true if I was modified.
 */
static bool
byte_chase_add_swz_instr(bi_context *ctx, bi_instr *I, struct hash_table *ht)
{
   assert(I->op == BI_OPCODE_MOV_I32 ||
          I->op == BI_OPCODE_SWZ_V2I16 ||
          I->op == BI_OPCODE_SWZ_V4I8);

   /* Don't try to propagate registers */
   if (I->src[0].type == BI_INDEX_REGISTER)
      return false;

   struct byte_prop_entry *entry = add_byte_prop_entry(ht, I->dest[0]);
   for (unsigned b = 0; b < 4; b++)
      entry->bytes[b] = select_byte(I->src[0], b);

   bool changed = chase_bytes(entry, ht);
   if (bi_is_null(entry->word)) {
      /* Byte chasing found no coherent word; fall back to accumulating
       * swizzles through the word table.
       */
      entry->word = I->src[0];
      changed = chase_word(entry, ht);
   }

   if (!changed || bi_is_null(entry->word))
      return false;

   I->src[0] = entry->word;

   switch (entry->word.swizzle) {
   case BI_SWIZZLE_H01:
      I->op = BI_OPCODE_MOV_I32;
      break;
   case BI_SWIZZLE_H00:
   case BI_SWIZZLE_H10:
   case BI_SWIZZLE_H11:
      I->op = BI_OPCODE_SWZ_V2I16;
      break;
   default:
      I->op = BI_OPCODE_SWZ_V4I8;
      break;
   }

   return true;
}
/* Record a vector-building instruction in the table and, if any of its bytes
 * can be chased to an earlier source, replace it with a MKVEC_V4I8 of the
 * chased bytes.
 *
 * The instruction is first broken down into the four byte sources it reads
 * (zero-extending conversions contribute constant zero bytes).  Register
 * sources are not recorded in the table, so later instructions never
 * propagate through them.
 *
 * Returns true if I was replaced.
 */
static bool
byte_chase_add_vec_instr(bi_context *ctx, bi_instr *I, struct hash_table *ht)
{
   bi_index bytes[4];
   switch (I->op) {
   case BI_OPCODE_MKVEC_V2I16:
      for (unsigned i = 0; i < 2; i++) {
         assert(I->src[i].swizzle == BI_SWIZZLE_H00 ||
                I->src[i].swizzle == BI_SWIZZLE_H11);
         bytes[i * 2 + 0] = select_byte(I->src[i], 0);
         bytes[i * 2 + 1] = select_byte(I->src[i], 1);
      }
      break;

   case BI_OPCODE_MKVEC_V2I8:
      bytes[0] = I->src[0];
      bytes[1] = I->src[1];
      bytes[2] = select_byte(I->src[2], 0);
      bytes[3] = select_byte(I->src[2], 1);
      break;

   case BI_OPCODE_MKVEC_V4I8:
      for (unsigned i = 0; i < 4; i++)
         bytes[i] = I->src[i];
      break;

   case BI_OPCODE_U8_TO_U32:
      bytes[0] = I->src[0];
      bytes[1] = bi_imm_u8(0);
      bytes[2] = bi_imm_u8(0);
      bytes[3] = bi_imm_u8(0);
      break;

   case BI_OPCODE_U16_TO_U32:
      bytes[0] = select_byte(I->src[0], 0);
      bytes[1] = select_byte(I->src[0], 1);
      bytes[2] = bi_imm_u8(0);
      bytes[3] = bi_imm_u8(0);
      break;

   case BI_OPCODE_V2U8_TO_V2U16:
      bytes[0] = select_byte(I->src[0], 0);
      bytes[1] = bi_imm_u8(0);
      bytes[2] = select_byte(I->src[0], 2);
      bytes[3] = bi_imm_u8(0);
      break;

   default:
      UNREACHABLE("Unhanded vec instruction");
   }

   struct byte_prop_entry *dst_entry = add_byte_prop_entry(ht, I->dest[0]);
   for (unsigned i = 0; i < 4; i++) {
      /* Don't try to propagate registers */
      if (bytes[i].type != BI_INDEX_REGISTER)
         dst_entry->bytes[i] = bytes[i];
   }

   if (!chase_bytes(dst_entry, ht))
      return false;

   /* Bytes left out of the table above are null in dst_entry.  The rebuilt
    * instruction must still read them, so fall back to the original source
    * for those.
    */
   bi_index srcs[4];
   for (unsigned i = 0; i < 4; i++) {
      srcs[i] = bi_is_null(dst_entry->bytes[i]) ? bytes[i]
                                                : dst_entry->bytes[i];
   }

   bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
   bi_mkvec_v4i8_to(&b, I->dest[0], srcs[0], srcs[1], srcs[2], srcs[3]);
   bi_remove_instruction(I);

   return true;
}
/** Byte propagation
 *
 * Two passes rolled into one: byte propagation and swizzle propagation.
 *
 * Vector-building instructions (MKVEC and the zero-extending conversions)
 * are viewed purely as four byte sources.  Each byte is chased back, through
 * other MKVEC and SWZ, to the instruction that generated it, and the MKVEC is
 * rewritten to consume those original bytes.  When the result turns out to
 * be just a swizzle of another def (fairly common), that is recorded too.
 * Since many MKVEC merely consume other MKVEC, this removes intermediate
 * ones, or the whole chain when it collapses to a swizzle.
 *
 * SWZ (and MOV) instructions are first treated like a MKVEC of the bytes
 * they read.  If that does not yield a single swizzled word, the word table
 * is crawled instead, accumulating swizzles, which gives the coherent word
 * closest to the generating instructions.  SWZ could be replaced by MKVEC
 * with plain byte propagation, but MKVEC is often 2 instructions whereas SWZ
 * is often one (or folded into a source), so this is probably the better
 * balance.
 *
 * Every other instruction has the recorded swizzles propagated into its
 * individual sources where possible.
 */
static void
bi_opt_byte_prop(bi_context *ctx)
{
   struct hash_table *table = _mesa_hash_table_create_u32_keys(NULL);

   bi_foreach_instr_global_safe(ctx, I) {
      const bool builds_vec = I->op == BI_OPCODE_MKVEC_V2I16 ||
                              I->op == BI_OPCODE_MKVEC_V2I8 ||
                              I->op == BI_OPCODE_MKVEC_V4I8 ||
                              I->op == BI_OPCODE_U8_TO_U32 ||
                              I->op == BI_OPCODE_U16_TO_U32 ||
                              I->op == BI_OPCODE_V2U8_TO_V2U16;
      const bool moves_word = I->op == BI_OPCODE_MOV_I32 ||
                              I->op == BI_OPCODE_SWZ_V2I16 ||
                              I->op == BI_OPCODE_SWZ_V4I8;

      if (builds_vec)
         byte_chase_add_vec_instr(ctx, I, table);
      else if (moves_word)
         byte_chase_add_swz_instr(ctx, I, table);
      else
         byte_chase_instr_srcs(ctx, I, table);
   }

   /* Entries are allocated as children of the table and go away with it */
   _mesa_hash_table_destroy(table, NULL);
}
/* Copy propagation entry point: runs word propagation first, then byte
 * propagation, over the whole shader.
 */
void
bi_opt_copy_prop(bi_context *ctx)
{
bi_opt_word_prop(ctx);
bi_opt_byte_prop(ctx);
}