mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 02:48:06 +02:00
pan/bi: Add bytewise copy propagation
This adds a new bytewise copy propagation pass which chews through MKVEC and SWZ instructions. The word-based copy propagation pass only existed to chew through SPLIT/COLLECT but MKVEC is COLLECT for bytes and we had nothing to help with that. This is actually two passes in one: Byte propagation and swizzle propagation. Any time we see a MKVEC, we look at its sources only as bytes and chase individual bytes back, through other MKVEC and SWZ, to their generating instruction and make the MKVEC only consume the original bytes. If the MKVEC happens to construct something that's just a swizzle of another def (this is fairly common), we record that as well. The idea here is that a lot of MKVEC just consume other MKVEC and we can get rid of the intermediate ones or even the whole chain if it just ends up being a swizzle in the end. For SWZ instructions, we first look at them like a MKVEC of the individual bytes they consume. If that doesn't yield a single swizzled word, we then crawl through the words table, just accumulating swizzles. This gives us the best (closest to the generating instructions) coherent word. We could also replace SWZ with MKVEC and just do byte propagation but MKVEC is often 2 instructions whereas SWZ is often one (or folded into a source) so this is probably the better balance. Finally, we not only replace the MKVEC and SWZ instructions but we also attempt to propagate swizzles into individual ALU op sources. For v4i8 ops, this often fails since the full generality isn't always available but for fp16, we can almost always fold the swizzle into the consuming instruction. Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40720>
This commit is contained in:
parent
a4e9002660
commit
db8cb73b34
1 changed files with 367 additions and 2 deletions
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "bi_builder.h"
|
||||
#include "compiler.h"
|
||||
#include "util/hash_table.h"
|
||||
|
||||
/* SSA copy propagation */
|
||||
|
||||
|
|
@ -20,8 +21,8 @@ bi_reads_fau(bi_instr *ins)
|
|||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
bi_opt_copy_prop(bi_context *ctx)
|
||||
static void
|
||||
bi_opt_word_prop(bi_context *ctx)
|
||||
{
|
||||
/* Chase SPLIT of COLLECT. Instruction selection usually avoids this
|
||||
* pattern (due to the split cache), but it is inevitably generated by
|
||||
|
|
@ -96,3 +97,367 @@ bi_opt_copy_prop(bi_context *ctx)
|
|||
|
||||
free(replacement);
|
||||
}
|
||||
|
||||
static bi_index
|
||||
select_byte(bi_index idx, unsigned byte)
|
||||
{
|
||||
unsigned bytes[4] = {0, 0, 0, 0};
|
||||
bi_swizzle_to_byte_channels(idx.swizzle, bytes);
|
||||
idx.swizzle = BI_SWIZZLE_B0 + bytes[byte];
|
||||
return idx;
|
||||
}
|
||||
|
||||
/* Per-SSA-def record kept in the byte propagation hash table.
 *
 * Entries are allocated zeroed, and the pass tests unset fields with
 * bi_is_null(), i.e. it relies on a zero-initialized bi_index being null.
 */
struct byte_prop_entry {
   /* Whole-word replacement for the def (an immediate or a swizzled word),
    * or null when no such replacement has been found.
    */
   bi_index word;

   /* Replacement for each of the four bytes of the def. A null element
    * means nothing was recorded for that byte.
    */
   bi_index bytes[4];
};
|
||||
|
||||
static void *
|
||||
byte_prop_ht_key(bi_index idx)
|
||||
{
|
||||
assert(bi_is_ssa(idx));
|
||||
|
||||
assert(idx.value < (1 << 26) - 1);
|
||||
assert(idx.offset < (1 << 6));
|
||||
|
||||
/* Add 1 to value because zero keys are not allowed */
|
||||
uint32_t key = (idx.value + 1) | ((uint32_t)idx.offset << 26);
|
||||
|
||||
return (void *)(uintptr_t)key;
|
||||
}
|
||||
|
||||
static struct byte_prop_entry *
|
||||
add_byte_prop_entry(struct hash_table *ht, bi_index idx)
|
||||
{
|
||||
assert(!_mesa_hash_table_search(ht, byte_prop_ht_key(idx)));
|
||||
struct byte_prop_entry *entry = rzalloc(ht, struct byte_prop_entry);
|
||||
_mesa_hash_table_insert(ht, byte_prop_ht_key(idx), entry);
|
||||
return entry;
|
||||
}
|
||||
|
||||
static struct byte_prop_entry *
|
||||
get_byte_prop_entry(struct hash_table *ht, bi_index idx)
|
||||
{
|
||||
if (!bi_is_ssa(idx))
|
||||
return NULL;
|
||||
|
||||
struct hash_entry *ht_entry =
|
||||
_mesa_hash_table_search(ht, byte_prop_ht_key(idx));
|
||||
if (ht_entry == NULL)
|
||||
return NULL;
|
||||
|
||||
return ht_entry->data;
|
||||
}
|
||||
|
||||
static bool
|
||||
chase_bytes(struct byte_prop_entry *dst_entry, struct hash_table *ht)
|
||||
{
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
/* Chase as far as we can, updating dst_entry->src[i] as we go */
|
||||
while (true) {
|
||||
assert(bi_swizzle_replicates_8(dst_entry->bytes[i].swizzle));
|
||||
unsigned byte = dst_entry->bytes[i].swizzle - BI_SWIZZLE_B0;
|
||||
|
||||
struct byte_prop_entry *src_entry =
|
||||
get_byte_prop_entry(ht, dst_entry->bytes[i]);
|
||||
if (src_entry == NULL || bi_is_null(src_entry->bytes[byte]))
|
||||
break;
|
||||
|
||||
progress = true;
|
||||
dst_entry->bytes[i] = src_entry->bytes[byte];
|
||||
}
|
||||
}
|
||||
|
||||
unsigned swizzle_bytes[4];
|
||||
bool is_swizzle = true, is_const = true;
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
swizzle_bytes[i] = dst_entry->bytes[i].swizzle - BI_SWIZZLE_B0;
|
||||
if (i > 0 && !bi_is_word_equiv(dst_entry->bytes[i], dst_entry->bytes[0]))
|
||||
is_swizzle = false;
|
||||
|
||||
if (dst_entry->bytes[i].type != BI_INDEX_CONSTANT)
|
||||
is_const = false;
|
||||
}
|
||||
|
||||
enum bi_swizzle swizzle = BI_SWIZZLE_H01;
|
||||
if (is_const) {
|
||||
uint32_t value = 0;
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
uint8_t byte = bi_apply_swizzle(dst_entry->bytes[i].value,
|
||||
dst_entry->bytes[i].swizzle);
|
||||
value |= (uint32_t)byte << (i * 8);
|
||||
}
|
||||
dst_entry->word = bi_imm_u32(value);
|
||||
} else if (is_swizzle && bi_swizzle_from_byte_channels(swizzle_bytes,
|
||||
&swizzle)) {
|
||||
dst_entry->word = dst_entry->bytes[0];
|
||||
dst_entry->word.swizzle = swizzle;
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
chase_word(struct byte_prop_entry *dst_entry, struct hash_table *ht)
|
||||
{
|
||||
/* Chase the swizzle, updaing dst_entry->word whenever we find a new valid
|
||||
* swizzled word. For the purposes of this function, src will only be used
|
||||
* to index into the prop entry table. The actual swizzle is represented
|
||||
* by src_bytes[], which may or may not be a valid bi_swizzle.
|
||||
*/
|
||||
bi_index src = dst_entry->word;
|
||||
unsigned src_bytes[4];
|
||||
bi_swizzle_to_byte_channels(src.swizzle, src_bytes);
|
||||
|
||||
bool progress = false;
|
||||
while (true) {
|
||||
struct byte_prop_entry *src_entry = get_byte_prop_entry(ht, src);
|
||||
if (src_entry == NULL || bi_is_null(src_entry->word))
|
||||
return progress;
|
||||
|
||||
if (src_entry->word.type == BI_INDEX_CONSTANT) {
|
||||
assert(src_entry->word.swizzle == BI_SWIZZLE_H01);
|
||||
const uint32_t src_value = src_entry->word.value;
|
||||
|
||||
uint32_t value = 0;
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
value |= ((src_value >> (src_bytes[i] * 8)) & 0xff) << (i * 8);
|
||||
|
||||
dst_entry->word = bi_imm_u32(value);
|
||||
return true;
|
||||
}
|
||||
|
||||
unsigned entry_bytes[4];
|
||||
bi_swizzle_to_byte_channels(src_entry->word.swizzle, entry_bytes);
|
||||
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
src_bytes[i] = entry_bytes[src_bytes[i]];
|
||||
|
||||
enum bi_swizzle swizzle = BI_SWIZZLE_H01;
|
||||
if (bi_swizzle_from_byte_channels(src_bytes, &swizzle)) {
|
||||
progress = true;
|
||||
dst_entry->word = src_entry->word;
|
||||
dst_entry->word.swizzle = swizzle;
|
||||
}
|
||||
|
||||
src = src_entry->word;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
byte_chase_instr_srcs(bi_context *ctx, bi_instr *I, struct hash_table *ht)
|
||||
{
|
||||
bi_foreach_src(I, s) {
|
||||
if (bi_count_read_registers(I, s) != 1)
|
||||
continue;
|
||||
|
||||
struct byte_prop_entry *src_entry = get_byte_prop_entry(ht, I->src[s]);
|
||||
if (src_entry == NULL)
|
||||
continue;
|
||||
|
||||
bi_index repl;
|
||||
if (bi_swizzle_replicates_8(I->src[s].swizzle)) {
|
||||
unsigned byte = I->src[s].swizzle - BI_SWIZZLE_B0;
|
||||
repl = src_entry->bytes[byte];
|
||||
} else {
|
||||
if (bi_is_null(src_entry->word))
|
||||
continue;
|
||||
|
||||
enum bi_swizzle swizzle = BI_SWIZZLE_H01;
|
||||
if (!bi_try_compose_swizzles(&swizzle, I->src[s].swizzle,
|
||||
src_entry->word.swizzle))
|
||||
continue;
|
||||
|
||||
repl = src_entry->word;
|
||||
repl.swizzle = swizzle;
|
||||
}
|
||||
|
||||
if (!bi_op_supports_swizzle(I->op, s, repl.swizzle, ctx->arch))
|
||||
continue;
|
||||
|
||||
if (bi_is_staging_src(I, s) && repl.type != BI_INDEX_NORMAL)
|
||||
continue;
|
||||
|
||||
if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(I))
|
||||
continue;
|
||||
|
||||
/* bi_replace_src uses the swizzle and other modifiers from the original
|
||||
* and stops the replacement.
|
||||
*/
|
||||
I->src[s].swizzle = repl.swizzle;
|
||||
repl.swizzle = BI_SWIZZLE_H01;
|
||||
bi_replace_src(I, s, repl);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
byte_chase_add_swz_instr(bi_context *ctx, bi_instr *I, struct hash_table *ht)
|
||||
{
|
||||
assert(I->op == BI_OPCODE_MOV_I32 ||
|
||||
I->op == BI_OPCODE_SWZ_V2I16 ||
|
||||
I->op == BI_OPCODE_SWZ_V4I8);
|
||||
|
||||
/* Don't try to propagate registers */
|
||||
if (I->src[0].type == BI_INDEX_REGISTER)
|
||||
return false;
|
||||
|
||||
struct byte_prop_entry *dst_entry = add_byte_prop_entry(ht, I->dest[0]);
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
dst_entry->bytes[i] = select_byte(I->src[0], i);
|
||||
|
||||
bool progress = chase_bytes(dst_entry, ht);
|
||||
if (bi_is_null(dst_entry->word)) {
|
||||
/* If chasing bytes wasn't able to produce a word, just chase words and
|
||||
* see if we can do better.
|
||||
*/
|
||||
dst_entry->word = I->src[0];
|
||||
progress = chase_word(dst_entry, ht);
|
||||
}
|
||||
|
||||
if (!progress || bi_is_null(dst_entry->word))
|
||||
return false;
|
||||
|
||||
I->src[0] = dst_entry->word;
|
||||
if (dst_entry->word.swizzle == BI_SWIZZLE_H01)
|
||||
I->op = BI_OPCODE_MOV_I32;
|
||||
else if (dst_entry->word.swizzle == BI_SWIZZLE_H00 ||
|
||||
dst_entry->word.swizzle == BI_SWIZZLE_H10 ||
|
||||
dst_entry->word.swizzle == BI_SWIZZLE_H11)
|
||||
I->op = BI_OPCODE_SWZ_V2I16;
|
||||
else
|
||||
I->op = BI_OPCODE_SWZ_V4I8;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
byte_chase_add_vec_instr(bi_context *ctx, bi_instr *I, struct hash_table *ht)
|
||||
{
|
||||
bi_index bytes[4];
|
||||
switch (I->op) {
|
||||
case BI_OPCODE_MKVEC_V2I16:
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
assert(I->src[i].swizzle == BI_SWIZZLE_H00 ||
|
||||
I->src[i].swizzle == BI_SWIZZLE_H11);
|
||||
bytes[i * 2 + 0] = select_byte(I->src[i], 0);
|
||||
bytes[i * 2 + 1] = select_byte(I->src[i], 1);
|
||||
}
|
||||
break;
|
||||
|
||||
case BI_OPCODE_MKVEC_V2I8:
|
||||
bytes[0] = I->src[0];
|
||||
bytes[1] = I->src[1];
|
||||
bytes[2] = select_byte(I->src[2], 0);
|
||||
bytes[3] = select_byte(I->src[2], 1);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_MKVEC_V4I8:
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
bytes[i] = I->src[i];
|
||||
break;
|
||||
|
||||
case BI_OPCODE_U8_TO_U32:
|
||||
bytes[0] = I->src[0];
|
||||
bytes[1] = bi_imm_u8(0);
|
||||
bytes[2] = bi_imm_u8(0);
|
||||
bytes[3] = bi_imm_u8(0);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_U16_TO_U32:
|
||||
bytes[0] = select_byte(I->src[0], 0);
|
||||
bytes[1] = select_byte(I->src[0], 1);
|
||||
bytes[2] = bi_imm_u8(0);
|
||||
bytes[3] = bi_imm_u8(0);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_V2U8_TO_V2U16:
|
||||
bytes[0] = select_byte(I->src[0], 0);
|
||||
bytes[1] = bi_imm_u8(0);
|
||||
bytes[2] = select_byte(I->src[0], 2);
|
||||
bytes[3] = bi_imm_u8(0);
|
||||
break;
|
||||
|
||||
default:
|
||||
UNREACHABLE("Unhanded vec instruction");
|
||||
}
|
||||
|
||||
struct byte_prop_entry *dst_entry = add_byte_prop_entry(ht, I->dest[0]);
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
/* Don't try to propagate registers */
|
||||
if (bytes[i].type != BI_INDEX_REGISTER)
|
||||
dst_entry->bytes[i] = bytes[i];
|
||||
}
|
||||
|
||||
if (!chase_bytes(dst_entry, ht))
|
||||
return false;
|
||||
|
||||
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
|
||||
bi_mkvec_v4i8_to(&b, I->dest[0],
|
||||
dst_entry->bytes[0], dst_entry->bytes[1],
|
||||
dst_entry->bytes[2], dst_entry->bytes[3]);
|
||||
bi_remove_instruction(I);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Byte propagation
|
||||
*
|
||||
* This is actually two passes in one: Byte propagation and swizzle
|
||||
* propagation. Any time we see a MKVEC, we look at its sources only as bytes
|
||||
* and chase individual bytes back, through other MKVEC and SWZ, to their
|
||||
* generating instruction and make the MKVEC only consume the original bytes.
|
||||
* If the MKVEC happens to construct something that's just a swizzle of
|
||||
* another def (this is fairly common), we record that as well. The idea here
|
||||
* is that a lot of MKVEC just consume other MKVEC and we can get rid of the
|
||||
* intermediate ones or even the whole chain if it just ends up being a
|
||||
* swizzle in the end.
|
||||
*
|
||||
* For SWZ instructions, we first look at them like a MKVEC of the individual
|
||||
* bytes they consume. If that doesn't yield a single swizzled word, we then
|
||||
* crawl through the words table, just accumulating swizzles. This gives us
|
||||
* the best (closest to the generating instructions) coherent word. We could
|
||||
* also replace SWZ with MKVEC and just do byte propagation but MKVEC is often
|
||||
* 2 instructions whereas SWZ is often one (or folded into a source) so this
|
||||
* is probably the better balance.
|
||||
*
|
||||
* Finally, we not only replace the MKVEC and SWZ instructions but we also
|
||||
* attempt to propagate swizzles into individual ALU op sources.
|
||||
*/
|
||||
static void
|
||||
bi_opt_byte_prop(bi_context *ctx)
|
||||
{
|
||||
struct hash_table *ht = _mesa_hash_table_create_u32_keys(NULL);
|
||||
|
||||
bi_foreach_instr_global_safe(ctx, I) {
|
||||
switch (I->op) {
|
||||
case BI_OPCODE_MKVEC_V2I16:
|
||||
case BI_OPCODE_MKVEC_V2I8:
|
||||
case BI_OPCODE_MKVEC_V4I8:
|
||||
case BI_OPCODE_U8_TO_U32:
|
||||
case BI_OPCODE_U16_TO_U32:
|
||||
case BI_OPCODE_V2U8_TO_V2U16:
|
||||
byte_chase_add_vec_instr(ctx, I, ht);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_MOV_I32:
|
||||
case BI_OPCODE_SWZ_V2I16:
|
||||
case BI_OPCODE_SWZ_V4I8:
|
||||
byte_chase_add_swz_instr(ctx, I, ht);
|
||||
break;
|
||||
|
||||
default:
|
||||
byte_chase_instr_srcs(ctx, I, ht);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
_mesa_hash_table_destroy(ht, NULL);
|
||||
}
|
||||
|
||||
/* Entry point for SSA copy propagation on ctx: runs the word-based pass
 * (bi_opt_word_prop) first and the byte-based pass (bi_opt_byte_prop)
 * second.
 */
void
bi_opt_copy_prop(bi_context *ctx)
{
   bi_opt_word_prop(ctx);
   bi_opt_byte_prop(ctx);
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue