diff --git a/src/panfrost/compiler/bifrost/bi_lower_mkvec_swz.c b/src/panfrost/compiler/bifrost/bi_lower_mkvec_swz.c
new file mode 100644
index 00000000000..5bdc4fc58fb
--- /dev/null
+++ b/src/panfrost/compiler/bifrost/bi_lower_mkvec_swz.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (C) 2026 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "bi_builder.h"
+#include "bi_swizzles.h"
+#include "compiler.h"
+#include "valhall.h"
+
+/*
+ * Try to express a pair of byte sources as a single 16-bit source.
+ *
+ * Two constants are folded into one 16-bit immediate. Otherwise, both bytes
+ * must be word-equivalent and form an aligned pair (b0/b1 or b2/b3), in which
+ * case lo is returned with its swizzle replaced by .h00 or .h11. Returns a
+ * null index if neither applies.
+ */
+static bi_index
+bi_i8_pair_as_i16(bi_index lo, bi_index hi)
+{
+   /* If they're both constants, we can fold them together */
+   if (lo.type == BI_INDEX_CONSTANT && hi.type == BI_INDEX_CONSTANT) {
+      uint32_t lo_val = bi_apply_swizzle(lo.value, lo.swizzle) & 0xff;
+      uint32_t hi_val = bi_apply_swizzle(hi.value, hi.swizzle) & 0xff;
+      return bi_imm_u16(lo_val | (hi_val << 8));
+   }
+
+   if (!bi_is_word_equiv(lo, hi))
+      return bi_null();
+
+   unsigned lo_byte = lo.swizzle - BI_SWIZZLE_B0;
+   unsigned hi_byte = hi.swizzle - BI_SWIZZLE_B0;
+   assert(lo_byte < 4 && hi_byte < 4);
+
+   if ((lo_byte & 1) == 0 && hi_byte == lo_byte + 1) {
+      bi_index i16 = lo;
+      i16.swizzle = lo_byte == 0 ? BI_SWIZZLE_H00 : BI_SWIZZLE_H11;
+      return i16;
+   }
+
+   return bi_null();
+}
+
+/*
+ * Rewrite the constant sources in src[] so that they need fewer unique
+ * immediates. Each constant is reduced to the byte it selects; zero bytes
+ * are left as zero and distinct non-zero bytes are packed two per 32-bit
+ * immediate. nr_src must be at most 4.
+ */
+static void
+compact_i8_constants(bi_index *src, unsigned nr_src)
+{
+   uint8_t values[4] = { 0, 0, 0, 0 };
+   unsigned nr_values = 0;
+
+   assert(nr_src <= 4);
+
+   for (unsigned i = 0; i < nr_src; i++) {
+      if (src[i].type != BI_INDEX_CONSTANT)
+         continue;
+
+      /* Fold the swizzle (if any) and mask. This has to happen before the
+       * zero check: a non-zero word can still select a zero byte, and that
+       * byte must not keep its old swizzle or take up a constant slot.
+       */
+      src[i].value = bi_apply_swizzle(src[i].value, src[i].swizzle) & 0xff;
+      src[i].swizzle = BI_SWIZZLE_B0;
+
+      if (src[i].value == 0)
+         continue;
+
+      unsigned v = 0;
+      for (; v < nr_values; v++) {
+         if (values[v] == src[i].value)
+            break;
+      }
+      if (v == nr_values)
+         values[nr_values++] = src[i].value;
+   }
+
+   /* If the only constants we found were zero, we're done */
+   if (nr_values == 0)
+      return;
+
+   for (unsigned i = 0; i < nr_src; i++) {
+      if (src[i].type != BI_INDEX_CONSTANT || src[i].value == 0)
+         continue;
+
+      unsigned v = 0;
+      for (; v < nr_values; v++) {
+         if (values[v] == src[i].value)
+            break;
+      }
+      assert(v < nr_values);
+
+      /* Fold two constants into one so that bi_schedule will see half the
+       * number of unique constants.  We only have .b0 and .b2 swizzles on
+       * MKVEC.i8v4 on Bifrost so we can't place them in b1 or b3.
+       */
+      uint32_t v32_lo = values[v & ~1];
+      uint32_t v32_hi = values[v | 1];
+      src[i] = bi_imm_u32(v32_lo | (v32_hi << 16));
+      src[i].swizzle = (v & 1) ? BI_SWIZZLE_B2 : BI_SWIZZLE_B0;
+   }
+}
+
+/*
+ * Emit a 16-bit lane swizzle of src into dst. The identity swizzle (.h01)
+ * becomes a plain MOV.i32.
+ */
+static bi_instr *
+build_swz_v2i16_to(bi_builder *b, bi_index dst, bi_index src)
+{
+   if (src.swizzle == BI_SWIZZLE_H01)
+      return bi_mov_i32_to(b, dst, src);
+
+   /* On Valhall, we don't have SWZ.v2i16 but IADD has a swizzle */
+   if (b->shader->arch >= 9)
+      return bi_iadd_v2u16_to(b, dst, src, bi_zero(), false);
+   else
+      return bi_swz_v2i16_to(b, dst, src);
+}
+
+/*
+ * Build dst from four byte sources, where src[i] supplies byte i and each
+ * source carries a single-byte swizzle (.b0 to .b3). The candidate forms
+ * are tried in the order they appear below and the first match is emitted;
+ * the byte-wise MKVEC at the end is the fallback.
+ */
+static bi_instr *
+build_mkvec_v4i8_to(bi_builder *b, bi_index dst, const bi_index src[4])
+{
+   unsigned bytes[4];
+   bool all_constant = true;
+   for (unsigned i = 0; i < 4; i++) {
+      STATIC_ASSERT(BI_SWIZZLE_B3 - BI_SWIZZLE_B0 == 3);
+      assert(src[i].swizzle >= BI_SWIZZLE_B0);
+      assert(src[i].swizzle <= BI_SWIZZLE_B3);
+      bytes[i] = src[i].swizzle - BI_SWIZZLE_B0;
+      if (src[i].type != BI_INDEX_CONSTANT)
+         all_constant = false;
+   }
+
+   if (all_constant) {
+      uint32_t v32 = 0;
+      for (unsigned i = 0; i < 4; i++) {
+         uint32_t v8 = bi_apply_swizzle(src[i].value, src[i].swizzle) & 0xff;
+         v32 |= v8 << (i * 8);
+      }
+      return bi_mov_i32_to(b, dst, bi_imm_u32(v32));
+   }
+
+   /* Check for U8_TO_U32 */
+   if (bi_is_zero(src[1]) && bi_is_zero(src[2]) && bi_is_zero(src[3]))
+      return bi_u8_to_u32_to(b, dst, src[0]);
+
+   /* Check for V2U8_TO_V2U16 */
+   enum bi_swizzle swizzle = BI_SWIZZLE_B0123;
+   if (bi_is_word_equiv(src[0], src[2]) &&
+       bi_is_zero(src[1]) && bi_is_zero(src[3])) {
+      unsigned v2u8_bytes[4] = { bytes[0], bytes[0], bytes[2], bytes[2] };
+      bool valid_swizzle =
+         bi_swizzle_from_byte_channels(v2u8_bytes, &swizzle);
+      assert(valid_swizzle);
+      (void)valid_swizzle; /* Only read by the assert above */
+
+      bi_index v2u8_src = src[0];
+      v2u8_src.swizzle = swizzle;
+
+      return bi_v2u8_to_v2u16_to(b, dst, v2u8_src);
+   }
+
+   /* Check if we can do a swizzled MOV of some form */
+   if (bi_is_word_equiv(src[0], src[1]) &&
+       bi_is_word_equiv(src[0], src[2]) &&
+       bi_is_word_equiv(src[0], src[3]) &&
+       bi_swizzle_from_byte_channels(bytes, &swizzle)) {
+      bi_index swz_src = src[0];
+      swz_src.swizzle = swizzle;
+
+      /* Check for MOV.i32 and SWZ.v2i16 */
+      if (swizzle == BI_SWIZZLE_H00 ||
+          swizzle == BI_SWIZZLE_H01 ||
+          swizzle == BI_SWIZZLE_H10 ||
+          swizzle == BI_SWIZZLE_H11)
+         return build_swz_v2i16_to(b, dst, swz_src);
+
+      if (b->shader->arch >= 9) {
+         /* On v9 and v10, LSHIFT_OR.v4i8 has a limited swizzle */
+         if (bi_op_supports_swizzle(BI_OPCODE_LSHIFT_OR_V4I8, 0, swizzle,
+                                    b->shader->arch)) {
+            return bi_lshift_or_v4i8_to(b, dst, swz_src,
+                                        bi_imm_u8(0), bi_imm_u8(0));
+         }
+      } else {
+         /* Check for SWZ.v4i8 */
+         if (bi_op_supports_swizzle(BI_OPCODE_SWZ_V4I8, 0, swizzle,
+                                    b->shader->arch)) {
+            return bi_swz_v4i8_to(b, dst, swz_src);
+         }
+      }
+   }
+
+   bi_index v2_lo = bi_i8_pair_as_i16(src[0], src[1]);
+   bi_index v2_hi = bi_i8_pair_as_i16(src[2], src[3]);
+   if (!bi_is_null(v2_lo) && !bi_is_null(v2_hi)) {
+      /* Check for U16_TO_U32 */
+      if (bi_is_zero(v2_hi))
+         return bi_u16_to_u32_to(b, dst, v2_lo);
+
+      /* Check for MKVEC.v2i16 */
+      return bi_mkvec_v2i16_to(b, dst, v2_lo, v2_hi);
+   }
+
+   /* On Valhall+, we can do any v4i8 in two instructions */
+   if (b->shader->arch >= 9) {
+      if (bi_is_zero(src[2]) && bi_is_zero(src[3]))
+         return bi_mkvec_v2i8_to(b, dst, src[0], src[1], bi_zero());
+
+      if (bi_is_word_equiv(src[2], src[3]) && bytes[2] == 0 && bytes[3] == 1)
+         return bi_mkvec_v2i8_to(b, dst, src[0], src[1], src[2]);
+
+      bi_index acc = bi_mkvec_v2i8(b, src[2], src[3], bi_zero());
+      return bi_mkvec_v2i8_to(b, dst, src[0], src[1], acc);
+   } else {
+      bi_index v4_src[4];
+      for (unsigned i = 0; i < 4; i++) {
+         v4_src[i] = src[i];
+
+         /* We can only swizzle to even bytes */
+         if (src[i].type != BI_INDEX_CONSTANT && (bytes[i] & 1))
+            v4_src[i] = bi_byte(bi_swz_v4i8(b, v4_src[i]), 0);
+      }
+
+      compact_i8_constants(v4_src, 4);
+
+      return bi_mkvec_v4i8_to(b, dst, v4_src[0], v4_src[1],
+                              v4_src[2], v4_src[3]);
+   }
+}
+
+/* Lower MKVEC.v4i8 through the generic byte-vector builder. */
+static bi_instr *
+lower_mkvec_v4i8(bi_builder *b, bi_instr *I)
+{
+   return build_mkvec_v4i8_to(b, I->dest[0], I->src);
+}
+
+/*
+ * Lower SWZ.v4i8 by splitting its byte swizzle into four single-byte sources
+ * of the same word and handing them to the generic byte-vector builder.
+ */
+static bi_instr *
+lower_swz_v4i8(bi_builder *b, bi_instr *I)
+{
+   unsigned bytes[4] = {0, 0, 0, 0};
+   bi_swizzle_to_byte_channels(I->src[0].swizzle, bytes);
+
+   bi_index src[4];
+   for (unsigned i = 0; i < 4; i++) {
+      src[i] = I->src[0];
+      src[i].swizzle = BI_SWIZZLE_B0 + bytes[i];
+   }
+
+   return build_mkvec_v4i8_to(b, I->dest[0], src);
+}
+
+/*
+ * Simplify MKVEC.v2i16 when both halves come from the same word (a 16-bit
+ * swizzle) or when the high half is zero (a zero-extension). Returns NULL to
+ * keep the instruction as it is.
+ */
+static bi_instr *
+lower_mkvec_v2i16(bi_builder *b, bi_instr *I)
+{
+   for (unsigned i = 0; i < 2; i++) {
+      assert(I->src[i].swizzle == BI_SWIZZLE_H0 ||
+             I->src[i].swizzle == BI_SWIZZLE_H1);
+   }
+
+   if (bi_is_word_equiv(I->src[0], I->src[1])) {
+      bi_index src = I->src[0];
+      src.swizzle = bi_swizzle_from_half(I->src[0].swizzle == BI_SWIZZLE_H1,
+                                         I->src[1].swizzle == BI_SWIZZLE_H1);
+      return build_swz_v2i16_to(b, I->dest[0], src);
+   }
+
+   if (bi_is_zero(I->src[1]))
+      return bi_u16_to_u32_to(b, I->dest[0], I->src[0]);
+
+   return NULL;
+}
+
+/* Lower SWZ.v2i16 through build_swz_v2i16_to(). */
+static bi_instr *
+lower_swz_v2i16(bi_builder *b, bi_instr *I)
+{
+   return build_swz_v2i16_to(b, I->dest[0], I->src[0]);
+}
+
+/*
+ * Dispatch on the opcode of I. Returns the replacement instruction, or NULL
+ * if I should be left alone.
+ */
+static bi_instr *
+lower(bi_builder *b, bi_instr *I)
+{
+   switch (I->op) {
+   case BI_OPCODE_MKVEC_V2I16:
+      return lower_mkvec_v2i16(b, I);
+
+   case BI_OPCODE_MKVEC_V4I8:
+      return lower_mkvec_v4i8(b, I);
+
+   case BI_OPCODE_SWZ_V2I16:
+      return lower_swz_v2i16(b, I);
+
+   case BI_OPCODE_SWZ_V4I8:
+      return lower_swz_v4i8(b, I);
+
+   default:
+      return NULL;
+   }
+}
+
+/*
+ * Run lower() over every instruction in the shader. The builder is placed
+ * before the instruction being visited, and the instruction is removed
+ * whenever lower() returns a replacement.
+ */
+void
+bi_lower_mkvec_swz(bi_context *ctx)
+{
+   bi_foreach_instr_global_safe(ctx, I) {
+      bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
+
+      if (lower(&b, I))
+         bi_remove_instruction(I);
+   }
+}
diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 012cb4ceaad..18042236db7 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -5352,6 +5352,7 @@ bi_compile_variant_nir(nir_shader *nir, } bi_lower_opt_instructions(ctx); + bi_lower_mkvec_swz(ctx); if (ctx->arch >= 9) { va_lower_isel(ctx); diff --git a/src/panfrost/compiler/bifrost/compiler.h
b/src/panfrost/compiler/bifrost/compiler.h index d6dca04fc9b..d77ffd77820 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1523,6 +1523,7 @@ void va_optimize(bi_context *ctx); void va_lower_split_64bit(bi_context *ctx); void bi_lower_opt_instructions(bi_context *ctx); +void bi_lower_mkvec_swz(bi_context *ctx); void bi_iterator_schedule(bi_context *ctx); void bi_pressure_schedule(bi_context *ctx); diff --git a/src/panfrost/compiler/bifrost/meson.build b/src/panfrost/compiler/bifrost/meson.build index a8fa63328df..f46d87f1865 100644 --- a/src/panfrost/compiler/bifrost/meson.build +++ b/src/panfrost/compiler/bifrost/meson.build @@ -11,6 +11,7 @@ libpanfrost_bifrost_files = files( 'bi_layout.c', 'bi_liveness.c', 'bi_lower_divergent_indirects.c', + 'bi_lower_mkvec_swz.c', 'bi_lower_spill.c', 'bi_lower_swizzle.c', 'bi_print.c', diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-lower-isel.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-lower-isel.cpp index de1a5fd6adf..df4c4cd8dca 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-lower-isel.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-lower-isel.cpp @@ -32,24 +32,6 @@ class LowerIsel : public testing::Test { bi_index reg, x, y, z; }; -TEST_F(LowerIsel, 8BitSwizzles) -{ - for (unsigned i = 0; i < 4; ++i) { - CASE(bi_swz_v4i8_to(b, reg, bi_byte(reg, i)), - bi_iadd_v4u8_to(b, reg, bi_byte(reg, i), bi_zero(), false)); - } -} - -TEST_F(LowerIsel, 16BitSwizzles) -{ - for (unsigned i = 0; i < 2; ++i) { - for (unsigned j = 0; j < 2; ++j) { - CASE(bi_swz_v2i16_to(b, reg, bi_swz_16(reg, i, j)), - bi_iadd_v2u16_to(b, reg, bi_swz_16(reg, i, j), bi_zero(), false)); - } - } -} - TEST_F(LowerIsel, JumpsLoweredToBranches) { bi_block block = {}; diff --git a/src/panfrost/compiler/bifrost/valhall/va_lower_isel.c b/src/panfrost/compiler/bifrost/valhall/va_lower_isel.c index bf8f5a79cbf..45ccd63255e 100644 --- 
a/src/panfrost/compiler/bifrost/valhall/va_lower_isel.c +++ b/src/panfrost/compiler/bifrost/valhall/va_lower_isel.c @@ -9,32 +9,10 @@ #include "valhall.h" #include "compiler.h" -static bi_instr * -lower_swz_v4i8(bi_builder *b, bi_instr *I) -{ - /* IADD.v4u8 is gone on v11 */ - if (b->shader->arch >= 11) { - bi_index srcs[4] = {I->src[0], I->src[0], I->src[0], I->src[0]}; - unsigned channels[4]; - bi_swizzle_to_byte_channels(I->src[0].swizzle, channels); - return bi_make_vec_to(b, I->dest[0], srcs, channels, 4, 8); - } - - return bi_iadd_v4u8_to(b, I->dest[0], I->src[0], bi_zero(), false); -} - static bi_instr * lower(bi_builder *b, bi_instr *I) { switch (I->op) { - - /* Integer addition has swizzles and addition with 0 is canonical swizzle */ - case BI_OPCODE_SWZ_V2I16: - return bi_iadd_v2u16_to(b, I->dest[0], I->src[0], bi_zero(), false); - - case BI_OPCODE_SWZ_V4I8: - return lower_swz_v4i8(b, I); - case BI_OPCODE_ICMP_I32: return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type);