diff --git a/src/panfrost/compiler/bifrost/bi_opt_copy_prop.c b/src/panfrost/compiler/bifrost/bi_opt_copy_prop.c
index 10093c71a48..455461911f9 100644
--- a/src/panfrost/compiler/bifrost/bi_opt_copy_prop.c
+++ b/src/panfrost/compiler/bifrost/bi_opt_copy_prop.c
@@ -6,6 +6,7 @@
 
 #include "bi_builder.h"
 #include "compiler.h"
+#include "util/hash_table.h"
 
 /* SSA copy propagation */
 
@@ -20,8 +21,8 @@ bi_reads_fau(bi_instr *ins)
    return false;
 }
 
-void
-bi_opt_copy_prop(bi_context *ctx)
+static void
+bi_opt_word_prop(bi_context *ctx)
 {
    /* Chase SPLIT of COLLECT. Instruction selection usually avoids this
     * pattern (due to the split cache), but it is inevitably generated by
@@ -96,3 +97,427 @@ bi_opt_copy_prop(bi_context *ctx)
 
    free(replacement);
 }
+
+/* Rewrite idx's swizzle to BI_SWIZZLE_B0 + n, where n is the byte channel
+ * that idx's original swizzle selects for lane `byte` (0..3).
+ */
+static bi_index
+select_byte(bi_index idx, unsigned byte)
+{
+   unsigned bytes[4] = {0, 0, 0, 0};
+   bi_swizzle_to_byte_channels(idx.swizzle, bytes);
+   idx.swizzle = BI_SWIZZLE_B0 + bytes[byte];
+   return idx;
+}
+
+/* What we know about one SSA def. Entries are zero-allocated, so a null
+ * `word` or `bytes[i]` means "nothing known" for that word/byte.
+ */
+struct byte_prop_entry {
+   /* Constant or swizzled index equal to the whole def; null if unknown */
+   bi_index word;
+   /* Where each byte of the def comes from; null if unknown */
+   bi_index bytes[4];
+};
+
+/* Pack an SSA index's value and offset into a non-zero pointer-sized hash
+ * table key.
+ */
+static void *
+byte_prop_ht_key(bi_index idx)
+{
+   assert(bi_is_ssa(idx));
+
+   assert(idx.value < (1 << 26) - 1);
+   assert(idx.offset < (1 << 6));
+
+   /* Add 1 to value because zero keys are not allowed */
+   uint32_t key = (idx.value + 1) | ((uint32_t)idx.offset << 26);
+
+   return (void *)(uintptr_t)key;
+}
+
+/* Allocate a zeroed entry for idx, rzalloc'd with the hash table as parent.
+ * idx must not already have an entry.
+ */
+static struct byte_prop_entry *
+add_byte_prop_entry(struct hash_table *ht, bi_index idx)
+{
+   assert(!_mesa_hash_table_search(ht, byte_prop_ht_key(idx)));
+   struct byte_prop_entry *entry = rzalloc(ht, struct byte_prop_entry);
+   _mesa_hash_table_insert(ht, byte_prop_ht_key(idx), entry);
+   return entry;
+}
+
+/* Look up the entry for idx. Returns NULL for non-SSA indices and for SSA
+ * defs that have no entry.
+ */
+static struct byte_prop_entry *
+get_byte_prop_entry(struct hash_table *ht, bi_index idx)
+{
+   if (!bi_is_ssa(idx))
+      return NULL;
+
+   struct hash_entry *ht_entry =
+      _mesa_hash_table_search(ht, byte_prop_ht_key(idx));
+   if (ht_entry == NULL)
+      return NULL;
+
+   return ht_entry->data;
+}
+
+/* Chase each known byte of dst_entry back through the table as far as
+ * possible, then, if all four bytes are known, try to derive dst_entry->word
+ * (a constant, or one swizzled word). Returns true if any byte was replaced.
+ */
+static bool
+chase_bytes(struct byte_prop_entry *dst_entry, struct hash_table *ht)
+{
+   bool progress = false;
+   bool has_null = false;
+   for (unsigned i = 0; i < 4; i++) {
+      /* Bytes that were never recorded (e.g. register sources) stay null;
+       * they carry no byte swizzle, so there is nothing to chase.
+       */
+      if (bi_is_null(dst_entry->bytes[i])) {
+         has_null = true;
+         continue;
+      }
+
+      /* Chase as far as we can, updating dst_entry->bytes[i] as we go */
+      while (true) {
+         assert(bi_swizzle_replicates_8(dst_entry->bytes[i].swizzle));
+         unsigned byte = dst_entry->bytes[i].swizzle - BI_SWIZZLE_B0;
+
+         struct byte_prop_entry *src_entry =
+            get_byte_prop_entry(ht, dst_entry->bytes[i]);
+         if (src_entry == NULL || bi_is_null(src_entry->bytes[byte]))
+            break;
+
+         progress = true;
+         dst_entry->bytes[i] = src_entry->bytes[byte];
+      }
+   }
+
+   /* Without all four bytes we cannot describe the def as a single word */
+   if (has_null)
+      return progress;
+
+   unsigned swizzle_bytes[4];
+   bool is_swizzle = true, is_const = true;
+   for (unsigned i = 0; i < 4; i++) {
+      swizzle_bytes[i] = dst_entry->bytes[i].swizzle - BI_SWIZZLE_B0;
+      if (i > 0 && !bi_is_word_equiv(dst_entry->bytes[i], dst_entry->bytes[0]))
+         is_swizzle = false;
+
+      if (dst_entry->bytes[i].type != BI_INDEX_CONSTANT)
+         is_const = false;
+   }
+
+   enum bi_swizzle swizzle = BI_SWIZZLE_H01;
+   if (is_const) {
+      uint32_t value = 0;
+      for (unsigned i = 0; i < 4; i++) {
+         uint8_t byte = bi_apply_swizzle(dst_entry->bytes[i].value,
+                                         dst_entry->bytes[i].swizzle);
+         value |= (uint32_t)byte << (i * 8);
+      }
+      dst_entry->word = bi_imm_u32(value);
+   } else if (is_swizzle && bi_swizzle_from_byte_channels(swizzle_bytes,
+                                                          &swizzle)) {
+      dst_entry->word = dst_entry->bytes[0];
+      dst_entry->word.swizzle = swizzle;
+   }
+
+   return progress;
+}
+
+/* Starting from dst_entry->word, follow word-level entries and accumulate
+ * their swizzles. Returns true if dst_entry->word was updated.
+ */
+static bool
+chase_word(struct byte_prop_entry *dst_entry, struct hash_table *ht)
+{
+   /* Chase the swizzle, updating dst_entry->word whenever we find a new valid
+    * swizzled word. For the purposes of this function, src will only be used
+    * to index into the prop entry table. The actual swizzle is represented
+    * by src_bytes[], which may or may not be a valid bi_swizzle.
+    */
+   bi_index src = dst_entry->word;
+   unsigned src_bytes[4];
+   bi_swizzle_to_byte_channels(src.swizzle, src_bytes);
+
+   bool progress = false;
+   while (true) {
+      struct byte_prop_entry *src_entry = get_byte_prop_entry(ht, src);
+      if (src_entry == NULL || bi_is_null(src_entry->word))
+         return progress;
+
+      if (src_entry->word.type == BI_INDEX_CONSTANT) {
+         assert(src_entry->word.swizzle == BI_SWIZZLE_H01);
+         const uint32_t src_value = src_entry->word.value;
+
+         uint32_t value = 0;
+         for (unsigned i = 0; i < 4; i++)
+            value |= ((src_value >> (src_bytes[i] * 8)) & 0xff) << (i * 8);
+
+         dst_entry->word = bi_imm_u32(value);
+         return true;
+      }
+
+      unsigned entry_bytes[4];
+      bi_swizzle_to_byte_channels(src_entry->word.swizzle, entry_bytes);
+
+      for (unsigned i = 0; i < 4; i++)
+         src_bytes[i] = entry_bytes[src_bytes[i]];
+
+      enum bi_swizzle swizzle = BI_SWIZZLE_H01;
+      if (bi_swizzle_from_byte_channels(src_bytes, &swizzle)) {
+         progress = true;
+         dst_entry->word = src_entry->word;
+         dst_entry->word.swizzle = swizzle;
+      }
+
+      src = src_entry->word;
+   }
+}
+
+/* For each source of I where bi_count_read_registers() is 1, try to make it
+ * read the byte or swizzled word recorded for the def it currently reads.
+ */
+static void
+byte_chase_instr_srcs(bi_context *ctx, bi_instr *I, struct hash_table *ht)
+{
+   bi_foreach_src(I, s) {
+      if (bi_count_read_registers(I, s) != 1)
+         continue;
+
+      struct byte_prop_entry *src_entry = get_byte_prop_entry(ht, I->src[s]);
+      if (src_entry == NULL)
+         continue;
+
+      bi_index repl;
+      if (bi_swizzle_replicates_8(I->src[s].swizzle)) {
+         unsigned byte = I->src[s].swizzle - BI_SWIZZLE_B0;
+         repl = src_entry->bytes[byte];
+
+         /* The byte may be unknown (e.g. it came from a register) */
+         if (bi_is_null(repl))
+            continue;
+      } else {
+         if (bi_is_null(src_entry->word))
+            continue;
+
+         enum bi_swizzle swizzle = BI_SWIZZLE_H01;
+         if (!bi_try_compose_swizzles(&swizzle, I->src[s].swizzle,
+                                      src_entry->word.swizzle))
+            continue;
+
+         repl = src_entry->word;
+         repl.swizzle = swizzle;
+      }
+
+      if (!bi_op_supports_swizzle(I->op, s, repl.swizzle, ctx->arch))
+         continue;
+
+      if (bi_is_staging_src(I, s) && repl.type != BI_INDEX_NORMAL)
+         continue;
+
+      if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(I))
+         continue;
+
+      /* bi_replace_src keeps the swizzle and other modifiers of the original
+       * source rather than the replacement's, so set the new swizzle first.
+       */
+      I->src[s].swizzle = repl.swizzle;
+      repl.swizzle = BI_SWIZZLE_H01;
+      bi_replace_src(I, s, repl);
+   }
+}
+
+/* Record a MOV/SWZ destination in the table and, if chasing found a better
+ * source word, rewrite I in place to read it. Returns true if I changed.
+ */
+static bool
+byte_chase_add_swz_instr(bi_context *ctx, bi_instr *I, struct hash_table *ht)
+{
+   assert(I->op == BI_OPCODE_MOV_I32 ||
+          I->op == BI_OPCODE_SWZ_V2I16 ||
+          I->op == BI_OPCODE_SWZ_V4I8);
+
+   /* Don't try to propagate registers */
+   if (I->src[0].type == BI_INDEX_REGISTER)
+      return false;
+
+   struct byte_prop_entry *dst_entry = add_byte_prop_entry(ht, I->dest[0]);
+   for (unsigned i = 0; i < 4; i++)
+      dst_entry->bytes[i] = select_byte(I->src[0], i);
+
+   bool progress = chase_bytes(dst_entry, ht);
+   if (bi_is_null(dst_entry->word)) {
+      /* If chasing bytes wasn't able to produce a word, just chase words and
+       * see if we can do better.
+       */
+      dst_entry->word = I->src[0];
+      progress = chase_word(dst_entry, ht);
+   }
+
+   if (!progress || bi_is_null(dst_entry->word))
+      return false;
+
+   I->src[0] = dst_entry->word;
+   if (dst_entry->word.swizzle == BI_SWIZZLE_H01)
+      I->op = BI_OPCODE_MOV_I32;
+   else if (dst_entry->word.swizzle == BI_SWIZZLE_H00 ||
+            dst_entry->word.swizzle == BI_SWIZZLE_H10 ||
+            dst_entry->word.swizzle == BI_SWIZZLE_H11)
+      I->op = BI_OPCODE_SWZ_V2I16;
+   else
+      I->op = BI_OPCODE_SWZ_V4I8;
+
+   return true;
+}
+
+/* Record the per-byte sources of a MKVEC or zero-extending conversion and,
+ * if any byte could be chased further back, replace I with a MKVEC.v4i8 of
+ * the chased bytes. Returns true if I was replaced.
+ */
+static bool
+byte_chase_add_vec_instr(bi_context *ctx, bi_instr *I, struct hash_table *ht)
+{
+   bi_index bytes[4];
+   switch (I->op) {
+   case BI_OPCODE_MKVEC_V2I16:
+      for (unsigned i = 0; i < 2; i++) {
+         assert(I->src[i].swizzle == BI_SWIZZLE_H00 ||
+                I->src[i].swizzle == BI_SWIZZLE_H11);
+         bytes[i * 2 + 0] = select_byte(I->src[i], 0);
+         bytes[i * 2 + 1] = select_byte(I->src[i], 1);
+      }
+      break;
+
+   case BI_OPCODE_MKVEC_V2I8:
+      bytes[0] = I->src[0];
+      bytes[1] = I->src[1];
+      bytes[2] = select_byte(I->src[2], 0);
+      bytes[3] = select_byte(I->src[2], 1);
+      break;
+
+   case BI_OPCODE_MKVEC_V4I8:
+      for (unsigned i = 0; i < 4; i++)
+         bytes[i] = I->src[i];
+      break;
+
+   case BI_OPCODE_U8_TO_U32:
+      bytes[0] = I->src[0];
+      bytes[1] = bi_imm_u8(0);
+      bytes[2] = bi_imm_u8(0);
+      bytes[3] = bi_imm_u8(0);
+      break;
+
+   case BI_OPCODE_U16_TO_U32:
+      bytes[0] = select_byte(I->src[0], 0);
+      bytes[1] = select_byte(I->src[0], 1);
+      bytes[2] = bi_imm_u8(0);
+      bytes[3] = bi_imm_u8(0);
+      break;
+
+   case BI_OPCODE_V2U8_TO_V2U16:
+      bytes[0] = select_byte(I->src[0], 0);
+      bytes[1] = bi_imm_u8(0);
+      bytes[2] = select_byte(I->src[0], 2);
+      bytes[3] = bi_imm_u8(0);
+      break;
+
+   default:
+      UNREACHABLE("Unhanded vec instruction");
+   }
+
+   struct byte_prop_entry *dst_entry = add_byte_prop_entry(ht, I->dest[0]);
+   for (unsigned i = 0; i < 4; i++) {
+      /* Don't try to propagate registers */
+      if (bytes[i].type != BI_INDEX_REGISTER)
+         dst_entry->bytes[i] = bytes[i];
+   }
+
+   if (!chase_bytes(dst_entry, ht))
+      return false;
+
+   /* Bytes skipped above are still null in the entry. Rebuilding I from the
+    * entry would drop those sources, so leave the instruction alone.
+    */
+   for (unsigned i = 0; i < 4; i++) {
+      if (bi_is_null(dst_entry->bytes[i]))
+         return false;
+   }
+
+   bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
+   bi_mkvec_v4i8_to(&b, I->dest[0],
+                    dst_entry->bytes[0], dst_entry->bytes[1],
+                    dst_entry->bytes[2], dst_entry->bytes[3]);
+   bi_remove_instruction(I);
+
+   return true;
+}
+
+/** Byte propagation
+ *
+ * This is actually two passes in one: Byte propagation and swizzle
+ * propagation. Any time we see a MKVEC, we look at its sources only as bytes
+ * and chase individual bytes back, through other MKVEC and SWZ, to their
+ * generating instruction and make the MKVEC only consume the original bytes.
+ * If the MKVEC happens to construct something that's just a swizzle of
+ * another def (this is fairly common), we record that as well. The idea here
+ * is that a lot of MKVEC just consume other MKVEC and we can get rid of the
+ * intermediate ones or even the whole chain if it just ends up being a
+ * swizzle in the end.
+ *
+ * For SWZ instructions, we first look at them like a MKVEC of the individual
+ * bytes they consume. If that doesn't yield a single swizzled word, we then
+ * crawl through the words table, just accumulating swizzles. This gives us
+ * the best (closest to the generating instructions) coherent word. We could
+ * also replace SWZ with MKVEC and just do byte propagation but MKVEC is often
+ * 2 instructions whereas SWZ is often one (or folded into a source) so this
+ * is probably the better balance.
+ *
+ * Finally, we not only replace the MKVEC and SWZ instructions but we also
+ * attempt to propagate swizzles into individual ALU op sources.
+ */
+static void
+bi_opt_byte_prop(bi_context *ctx)
+{
+   struct hash_table *ht = _mesa_hash_table_create_u32_keys(NULL);
+
+   bi_foreach_instr_global_safe(ctx, I) {
+      switch (I->op) {
+      case BI_OPCODE_MKVEC_V2I16:
+      case BI_OPCODE_MKVEC_V2I8:
+      case BI_OPCODE_MKVEC_V4I8:
+      case BI_OPCODE_U8_TO_U32:
+      case BI_OPCODE_U16_TO_U32:
+      case BI_OPCODE_V2U8_TO_V2U16:
+         byte_chase_add_vec_instr(ctx, I, ht);
+         break;
+
+      case BI_OPCODE_MOV_I32:
+      case BI_OPCODE_SWZ_V2I16:
+      case BI_OPCODE_SWZ_V4I8:
+         byte_chase_add_swz_instr(ctx, I, ht);
+         break;
+
+      default:
+         byte_chase_instr_srcs(ctx, I, ht);
+         break;
+      }
+   }
+
+   _mesa_hash_table_destroy(ht, NULL);
+}
+
+/* Entry point: word-level copy propagation followed by byte propagation */
+void
+bi_opt_copy_prop(bi_context *ctx)
+{
+   bi_opt_word_prop(ctx);
+   bi_opt_byte_prop(ctx);
+}