mesa/src/panfrost/compiler/bi_opt_mod_props.c
Mary Guillemard fac8c9def0
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run
pan/bi: Reintroduce bi_fuse_small_int_to_f32 on v11+
On v11+, all small integers instruction variants are gone, however we
can now use widen on src0 just fine.

That mean we can get ride of mid conversion by relying on swizzle
instead while respecting signess of the inner instruction.

This helps a little bit on clpeak with panvk+clvk, shader-db is also
happy:

Totals:
Instrs: 109541 -> 109354 (-0.17%)
CodeSize: 1110528 -> 1108864 (-0.15%)
Estimated normalized CVT cycles: 667.609375 -> 664.5625 (-0.46%)

Totals from 17 (2.12% of 803) affected shaders:
Instrs: 13637 -> 13450 (-1.37%)
CodeSize: 112256 -> 110592 (-1.48%)
Estimated normalized CVT cycles: 100.203125 -> 97.15625 (-3.04%)

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37125>
2025-09-03 17:32:02 +00:00

518 lines
15 KiB
C

/*
* Copyright (C) 2021 Collabora, Ltd.
* Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "bi_builder.h"
#include "compiler.h"
/*
* Due to a Bifrost encoding restriction, some instructions cannot have an abs
* modifier on both sources. Check if adding a fabs modifier to a given source
* of a binary instruction would cause this restriction to be hit.
*/
static bool
bi_would_impact_abs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
{
return (arch <= 8) && I->src[1 - s].abs &&
bi_is_word_equiv(I->src[1 - s], repl);
}
static bool
bi_takes_fabs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
{
switch (I->op) {
case BI_OPCODE_FCMP_V2F16:
case BI_OPCODE_FMAX_V2F16:
case BI_OPCODE_FMIN_V2F16:
return !bi_would_impact_abs(arch, I, repl, s);
case BI_OPCODE_FADD_V2F16:
/*
* For FADD.v2f16, the FMA pipe has the abs encoding hazard,
* while the FADD pipe cannot encode a clamp. Either case in
* isolation can be worked around in the scheduler, but both
* together is impossible to encode. Avoid the hazard.
*/
return !(I->clamp && bi_would_impact_abs(arch, I, repl, s));
case BI_OPCODE_V2F32_TO_V2F16:
/* TODO: Needs both match or lower */
return false;
case BI_OPCODE_FLOG_TABLE_F32:
/* TODO: Need to check mode */
return false;
default:
return bi_get_opcode_props(I)->abs & BITFIELD_BIT(s);
}
}
static bool
bi_takes_fneg(unsigned arch, bi_instr *I, unsigned s)
{
switch (I->op) {
case BI_OPCODE_CUBE_SSEL:
case BI_OPCODE_CUBE_TSEL:
case BI_OPCODE_CUBEFACE:
/* TODO: Bifrost encoding restriction: need to match or lower */
return arch >= 9;
case BI_OPCODE_FREXPE_F32:
case BI_OPCODE_FREXPE_V2F16:
case BI_OPCODE_FLOG_TABLE_F32:
/* TODO: Need to check mode */
return false;
default:
return bi_get_opcode_props(I)->neg & BITFIELD_BIT(s);
}
}
static bool
bi_is_fabsneg(enum bi_opcode op, enum bi_size size)
{
return (size == BI_SIZE_32 && op == BI_OPCODE_FABSNEG_F32) ||
(size == BI_SIZE_16 && op == BI_OPCODE_FABSNEG_V2F16);
}
static enum bi_swizzle
bi_compose_swizzle_16(enum bi_swizzle a, enum bi_swizzle b)
{
assert(a <= BI_SWIZZLE_H11);
assert(b <= BI_SWIZZLE_H11);
bool al = (a & BI_SWIZZLE_H10);
bool ar = (a & BI_SWIZZLE_H01);
bool bl = (b & BI_SWIZZLE_H10);
bool br = (b & BI_SWIZZLE_H01);
return ((al ? br : bl) ? BI_SWIZZLE_H10 : 0) |
((ar ? br : bl) ? BI_SWIZZLE_H01 : 0);
}
/* Like bi_replace_index, but composes instead of overwrites */
static inline bi_index
bi_compose_float_index(bi_index old, bi_index repl)
{
/* abs(-x) = abs(+x) so ignore repl.neg if old.abs is set, otherwise
* -(-x) = x but -(+x) = +(-x) so need to exclusive-or the negates */
repl.neg = old.neg ^ (repl.neg && !old.abs);
/* +/- abs(+/- abs(x)) = +/- abs(x), etc so just or the two */
repl.abs |= old.abs;
/* Use the old swizzle to select from the replacement swizzle */
repl.swizzle = bi_compose_swizzle_16(old.swizzle, repl.swizzle);
return repl;
}
/* DISCARD.b32(FCMP.f(x, y)) --> DISCARD.f(x, y) */
static inline bool
bi_fuse_discard_fcmp(bi_context *ctx, bi_instr *I, bi_instr *mod)
{
if (!mod)
return false;
if (I->op != BI_OPCODE_DISCARD_B32)
return false;
if (mod->op != BI_OPCODE_FCMP_F32 && mod->op != BI_OPCODE_FCMP_V2F16)
return false;
if (mod->cmpf >= BI_CMPF_GTLT)
return false;
/* result_type doesn't matter */
/* .abs and .neg modifiers allowed on Valhall DISCARD but not Bifrost */
bool absneg = mod->src[0].neg || mod->src[0].abs;
absneg |= mod->src[1].neg || mod->src[1].abs;
if (ctx->arch <= 8 && absneg)
return false;
enum bi_swizzle r = I->src[0].swizzle;
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
I = bi_discard_f32(&b, mod->src[0], mod->src[1], mod->cmpf);
if (mod->op == BI_OPCODE_FCMP_V2F16) {
I->src[0].swizzle = bi_compose_swizzle_16(r, I->src[0].swizzle);
I->src[1].swizzle = bi_compose_swizzle_16(r, I->src[1].swizzle);
}
return true;
}
/*
* S32_TO_F32(S8_TO_S32(x)) -> S8_TO_F32 and friends. Round modes don't matter
* because all 8-bit and 16-bit integers may be represented exactly as fp32.
*/
struct {
enum bi_opcode inner;
enum bi_opcode outer;
enum bi_opcode small_replacement;
enum bi_opcode wide_replacement;
} bi_small_int_patterns[] = {
{BI_OPCODE_S8_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S8_TO_F32, BI_OPCODE_S32_TO_F32},
{BI_OPCODE_U8_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U8_TO_F32, BI_OPCODE_U32_TO_F32},
{BI_OPCODE_U8_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U8_TO_F32, BI_OPCODE_U32_TO_F32},
{BI_OPCODE_S16_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S16_TO_F32, BI_OPCODE_S32_TO_F32},
{BI_OPCODE_U16_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U16_TO_F32, BI_OPCODE_U32_TO_F32},
{BI_OPCODE_U16_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U16_TO_F32, BI_OPCODE_U32_TO_F32},
};
static inline void
bi_fuse_small_int_to_f32(bi_context *ctx, bi_instr *I, bi_instr *mod)
{
for (unsigned i = 0; i < ARRAY_SIZE(bi_small_int_patterns); ++i) {
if (I->op != bi_small_int_patterns[i].outer)
continue;
if (mod->op != bi_small_int_patterns[i].inner)
continue;
assert(I->src[0].swizzle == BI_SWIZZLE_H01);
I->src[0] = mod->src[0];
I->round = BI_ROUND_NONE;
/* On v11, All small int instructions are gone but we now can widen instead */
if (ctx->arch >= 11)
bi_set_opcode(I, bi_small_int_patterns[i].wide_replacement);
else
bi_set_opcode(I, bi_small_int_patterns[i].small_replacement);
}
}
void
bi_opt_mod_prop_forward(bi_context *ctx)
{
bi_instr **lut = calloc(sizeof(bi_instr *), ctx->ssa_alloc);
bi_foreach_instr_global_safe(ctx, I) {
/* Try fusing FCMP into DISCARD.b32, building a new DISCARD.f32
* instruction. As this is the only optimization DISCARD is
* involved in, this shortcircuits other processing.
*/
if (I->op == BI_OPCODE_DISCARD_B32) {
if (bi_is_ssa(I->src[0]) &&
bi_fuse_discard_fcmp(ctx, I, lut[I->src[0].value])) {
bi_remove_instruction(I);
}
continue;
}
bi_foreach_dest(I, d) {
lut[I->dest[d].value] = I;
}
bi_foreach_ssa_src(I, s) {
bi_instr *mod = lut[I->src[s].value];
if (!mod)
continue;
unsigned size = bi_get_opcode_props(I)->size;
bi_fuse_small_int_to_f32(ctx, I, mod);
if (bi_is_fabsneg(mod->op, size)) {
if (mod->src[0].abs && !bi_takes_fabs(ctx->arch, I, mod->src[0], s))
continue;
if (mod->src[0].neg && !bi_takes_fneg(ctx->arch, I, s))
continue;
I->src[s] = bi_compose_float_index(I->src[s], mod->src[0]);
}
}
}
free(lut);
}
/* RSCALE has restrictions on how the clamp may be used, only used for
* specialized transcendental sequences that set the clamp explicitly anyway */
static bool
bi_takes_clamp(bi_instr *I)
{
switch (I->op) {
case BI_OPCODE_FMA_RSCALE_F32:
case BI_OPCODE_FMA_RSCALE_V2F16:
case BI_OPCODE_FADD_RSCALE_F32:
return false;
case BI_OPCODE_FADD_V2F16:
/* Encoding restriction */
return !(I->src[0].abs && I->src[1].abs &&
bi_is_word_equiv(I->src[0], I->src[1]));
default:
return bi_get_opcode_props(I)->clamp;
}
}
static bool
bi_is_fclamp(enum bi_opcode op, enum bi_size size)
{
return (size == BI_SIZE_32 && op == BI_OPCODE_FCLAMP_F32) ||
(size == BI_SIZE_16 && op == BI_OPCODE_FCLAMP_V2F16);
}
static bool
bi_optimizer_clamp(bi_instr *I, bi_instr *use)
{
if (!bi_is_fclamp(use->op, bi_get_opcode_props(I)->size))
return false;
if (!bi_takes_clamp(I))
return false;
/* Clamps are bitfields (clamp_m1_1/clamp_0_inf) so composition is OR */
I->clamp |= use->clamp;
I->dest[0] = use->dest[0];
return true;
}
static enum bi_opcode
bi_sized_mux_op(unsigned size)
{
switch (size) {
case 8:
return BI_OPCODE_MUX_V4I8;
case 16:
return BI_OPCODE_MUX_V2I16;
case 32:
return BI_OPCODE_MUX_I32;
default:
UNREACHABLE("invalid size");
}
}
static bool
bi_is_fixed_mux(bi_instr *I, unsigned size, bi_index v1)
{
return I->op == bi_sized_mux_op(size) &&
bi_is_value_equiv(I->src[0], bi_zero()) &&
bi_is_value_equiv(I->src[1], v1);
}
static bool
bi_takes_int_result_type(enum bi_opcode op)
{
switch (op) {
case BI_OPCODE_ICMP_I32:
case BI_OPCODE_ICMP_S32:
case BI_OPCODE_ICMP_U32:
case BI_OPCODE_ICMP_V2I16:
case BI_OPCODE_ICMP_V2S16:
case BI_OPCODE_ICMP_V2U16:
case BI_OPCODE_ICMP_V4I8:
case BI_OPCODE_ICMP_V4S8:
case BI_OPCODE_ICMP_V4U8:
case BI_OPCODE_FCMP_F32:
case BI_OPCODE_FCMP_V2F16:
return true;
default:
return false;
}
}
static bool
bi_takes_float_result_type(enum bi_opcode op)
{
return (op == BI_OPCODE_FCMP_F32) || (op == BI_OPCODE_FCMP_V2F16);
}
/* CMP+MUX -> CMP with result type */
static bool
bi_optimizer_result_type(bi_instr *I, bi_instr *mux)
{
if (bi_get_opcode_props(I)->size != bi_get_opcode_props(mux)->size)
return false;
if (bi_is_fixed_mux(mux, 32, bi_imm_f32(1.0)) ||
bi_is_fixed_mux(mux, 16, bi_imm_f16(1.0))) {
if (!bi_takes_float_result_type(I->op))
return false;
I->result_type = BI_RESULT_TYPE_F1;
} else if (bi_is_fixed_mux(mux, 32, bi_imm_u32(1)) ||
bi_is_fixed_mux(mux, 16, bi_imm_u16(1)) ||
bi_is_fixed_mux(mux, 8, bi_imm_u8(1))) {
if (!bi_takes_int_result_type(I->op))
return false;
I->result_type = BI_RESULT_TYPE_I1;
} else {
return false;
}
I->dest[0] = mux->dest[0];
return true;
}
static bool
bi_is_var_tex(bi_instr *var, bi_instr *tex)
{
return (var->op == BI_OPCODE_LD_VAR_IMM) &&
(tex->op == BI_OPCODE_TEXS_2D_F16 ||
tex->op == BI_OPCODE_TEXS_2D_F32) &&
(var->register_format == BI_REGISTER_FORMAT_F32) &&
((var->sample == BI_SAMPLE_CENTER &&
var->update == BI_UPDATE_STORE) ||
(var->sample == BI_SAMPLE_NONE &&
var->update == BI_UPDATE_RETRIEVE)) &&
(tex->texture_index == tex->sampler_index) &&
(tex->texture_index < 4) && (var->index < 8);
}
static bool
bi_optimizer_var_tex(bi_context *ctx, bi_instr *var, bi_instr *tex)
{
if (!bi_is_var_tex(var, tex))
return false;
/* Construct the corresponding VAR_TEX intruction */
bi_builder b = bi_init_builder(ctx, bi_after_instr(var));
bi_instr *I = bi_var_tex_f32_to(&b, tex->dest[0], tex->lod_mode, var->sample,
var->update, tex->texture_index, var->index);
I->skip = tex->skip;
if (tex->op == BI_OPCODE_TEXS_2D_F16)
bi_set_opcode(I, BI_OPCODE_VAR_TEX_F16);
/* Dead code elimination will clean up for us */
return true;
}
void
bi_opt_mod_prop_backward(bi_context *ctx)
{
unsigned count = ctx->ssa_alloc;
bi_instr **uses = calloc(count, sizeof(*uses));
BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple));
bi_foreach_block_rev(ctx, block) {
/* Watch out for PHI instructions in loops!
* PHI sources are logically read at the end of the predecessor,
* so process our source in successor phis first
*/
bi_foreach_successor(block, succ) {
unsigned s = bi_predecessor_index(succ, block);
bi_foreach_instr_in_block(succ, phi) {
/* the PHIs are all at the start of the block, so stop
* when we see a non-PHI
*/
if (phi->op != BI_OPCODE_PHI)
break;
if (phi->src[s].type == BI_INDEX_NORMAL)
bi_record_use(uses, multiple, phi, s);
}
}
/* now go through the instructions backwards */
bi_foreach_instr_in_block_rev(block, I) {
/* skip PHIs, they are handled specially */
if (I->op == BI_OPCODE_PHI)
continue;
/* record uses */
bi_foreach_ssa_src(I, s) {
bi_record_use(uses, multiple, I, s);
}
/* check for definitions */
if (I->nr_dests != 1)
continue;
bi_instr *use = uses[I->dest[0].value];
if (!use || BITSET_TEST(multiple, I->dest[0].value))
continue;
/* Destination has a single use, try to propagate */
bool propagated =
bi_optimizer_clamp(I, use) || bi_optimizer_result_type(I, use);
if (!propagated && I->op == BI_OPCODE_LD_VAR_IMM &&
use->op == BI_OPCODE_SPLIT_I32) {
/* Need to see through the split in a
* ld_var_imm/split/var_tex sequence
*/
bi_instr *tex = uses[use->dest[0].value];
if (!tex || BITSET_TEST(multiple, use->dest[0].value))
continue;
use = tex;
propagated = bi_optimizer_var_tex(ctx, I, use);
}
if (propagated) {
bi_remove_instruction(use);
continue;
}
}
}
free(uses);
free(multiple);
}
/*
* Lower pseudo instructions that exist to simplify the optimizer. Returns the
* replacement instruction, or NULL if no replacement is needed.
*/
static bool
bi_lower_opt_instruction_helper(bi_builder *b, bi_instr *I)
{
bi_instr *repl;
switch (I->op) {
case BI_OPCODE_FABSNEG_F32:
case BI_OPCODE_FCLAMP_F32:
repl = bi_fadd_f32_to(b, I->dest[0], I->src[0], bi_negzero());
repl->clamp = I->clamp;
return true;
case BI_OPCODE_FABSNEG_V2F16:
case BI_OPCODE_FCLAMP_V2F16:
repl = bi_fadd_v2f16_to(b, I->dest[0], I->src[0], bi_negzero());
repl->clamp = I->clamp;
return true;
case BI_OPCODE_DISCARD_B32:
bi_discard_f32(b, I->src[0], bi_zero(), BI_CMPF_NE);
return true;
default:
return false;
}
}
void
bi_lower_opt_instructions(bi_context *ctx)
{
bi_foreach_instr_global_safe(ctx, I) {
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
if (bi_lower_opt_instruction_helper(&b, I))
bi_remove_instruction(I);
}
}