mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-26 19:20:08 +01:00
pan/bi: Fuse FCMP/ICMP on Valhall
We have a lot of patterns like this on Valhall: FCMP_OR.f32.lt.m1 r28, ^r28, r27, 0x0 FCMP_OR.f32.lt.m1 r29, r27, r25, 0x0 LSHIFT_AND.i32 r28, ^r28, 0x0.b00, ^r29 These can be simplified into: FCMP_OR.f32.lt.m1 r29, r27, r25, 0x0 FCMP_AND.f32.lt.m1 r28, ^r28, r27, ^r29 This pass merges those specific cases while setting the appropriate logical variant of the CMP instruction. We do not try to merge sources that do not originate from a matching CMP instruction with a matching result type, as the logical operation is performed before the result type transformation. This is still enough to optimise a lot of common cases, so it is a win. Results on fossils/sascha-willems: Totals: Instrs: 42157 -> 42059 (-0.23%) CodeSize: 582784 -> 581760 (-0.18%) Estimated normalized SFU cycles: 159.9375 -> 153.75 (-3.87%) Totals from 13 (2.07% of 627) affected shaders: Instrs: 3490 -> 3392 (-2.81%) CodeSize: 29696 -> 28672 (-3.45%) Estimated normalized SFU cycles: 15.8125 -> 9.625 (-39.13%) Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com> Reviewed-by: Eric R. Smith <eric.smith@collabora.com> Reviewed-by: Olivia Lee <olivia.lee@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36327>
This commit is contained in:
parent
2e4a8e98bd
commit
800a861431
4 changed files with 225 additions and 13 deletions
|
|
@ -398,18 +398,6 @@ bi_optimizer_var_tex(bi_context *ctx, bi_instr *var, bi_instr *tex)
|
|||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
bi_record_use(bi_instr **uses, BITSET_WORD *multiple, bi_instr *I, unsigned s)
|
||||
{
|
||||
unsigned v = I->src[s].value;
|
||||
|
||||
assert(I->src[s].type == BI_INDEX_NORMAL);
|
||||
if (uses[v] && uses[v] != I)
|
||||
BITSET_SET(multiple, v);
|
||||
else
|
||||
uses[v] = I;
|
||||
}
|
||||
|
||||
void
|
||||
bi_opt_mod_prop_backward(bi_context *ctx)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -6193,8 +6193,8 @@ bi_compile_variant_nir(nir_shader *nir,
|
|||
bi_lower_opt_instructions(ctx);
|
||||
|
||||
if (ctx->arch >= 9) {
|
||||
va_optimize(ctx);
|
||||
va_lower_isel(ctx);
|
||||
va_optimize(ctx);
|
||||
|
||||
bi_foreach_instr_global_safe(ctx, I) {
|
||||
/* Phis become single moves so shouldn't be affected */
|
||||
|
|
|
|||
|
|
@ -402,6 +402,12 @@ bi_is_ssa(bi_index idx)
|
|||
return idx.type == BI_INDEX_NORMAL;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
bi_is_zero(const bi_index idx)
|
||||
{
|
||||
return idx.type == BI_INDEX_CONSTANT && idx.value == 0;
|
||||
}
|
||||
|
||||
/* Compares equivalence as references. Does not compare offsets, swizzles, or
|
||||
* modifiers. In other words, this forms bi_index equivalence classes by
|
||||
* partitioning memory. E.g. -abs(foo[1].yx) == foo.xy but foo != bar */
|
||||
|
|
@ -1616,6 +1622,18 @@ bi_dontcare(bi_builder *b)
|
|||
#define bi_worklist_peek_tail(w) u_worklist_peek_tail(w, bi_block, index)
|
||||
#define bi_worklist_pop_tail(w) u_worklist_pop_tail(w, bi_block, index)
|
||||
|
||||
static inline void
|
||||
bi_record_use(bi_instr **uses, BITSET_WORD *multiple, bi_instr *I, unsigned s)
|
||||
{
|
||||
unsigned v = I->src[s].value;
|
||||
|
||||
assert(I->src[s].type == BI_INDEX_NORMAL);
|
||||
if (uses[v] && uses[v] != I)
|
||||
BITSET_SET(multiple, v);
|
||||
else
|
||||
uses[v] = I;
|
||||
}
|
||||
|
||||
/* NIR passes */
|
||||
|
||||
bool bi_lower_divergent_indirects(nir_shader *shader, unsigned lanes);
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@
|
|||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "bi_builder.h"
|
||||
#include "va_compiler.h"
|
||||
|
||||
/* Valhall specific instruction selection optimizations */
|
||||
|
|
@ -123,10 +124,215 @@ va_fuse_add_imm(bi_instr *I)
|
|||
bi_drop_srcs(I, 1);
|
||||
}
|
||||
|
||||
/* Comparison family of a CMP opcode, as classified by
 * va_cmp_opcode_to_cmp_type() */
enum va_cmp_type {
   VA_CMP_TYPE_INVALID = 0, /* Opcode is not a recognised comparison */
   VA_CMP_TYPE_F = 1,       /* FCMP_* opcodes */
   VA_CMP_TYPE_S = 2,       /* ICMP_* opcodes with an S (signed) suffix */
   VA_CMP_TYPE_U = 3,       /* ICMP_* opcodes with a U (unsigned) suffix */
};
|
||||
|
||||
static enum bi_opcode
|
||||
va_remap_logical_to_logical_cmp(enum bi_opcode op, enum va_cmp_type type)
|
||||
{
|
||||
if (type == VA_CMP_TYPE_F) {
|
||||
switch (op) {
|
||||
case BI_OPCODE_LSHIFT_OR_I32:
|
||||
return BI_OPCODE_FCMP_OR_F32;
|
||||
case BI_OPCODE_LSHIFT_OR_V2I16:
|
||||
return BI_OPCODE_FCMP_OR_V2F16;
|
||||
case BI_OPCODE_LSHIFT_AND_I32:
|
||||
return BI_OPCODE_FCMP_AND_F32;
|
||||
case BI_OPCODE_LSHIFT_AND_V2I16:
|
||||
return BI_OPCODE_FCMP_AND_V2F16;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
} else if (type == VA_CMP_TYPE_S) {
|
||||
switch (op) {
|
||||
case BI_OPCODE_LSHIFT_OR_I32:
|
||||
return BI_OPCODE_ICMP_OR_S32;
|
||||
case BI_OPCODE_LSHIFT_OR_V2I16:
|
||||
return BI_OPCODE_ICMP_OR_V2S16;
|
||||
case BI_OPCODE_LSHIFT_OR_V4I8:
|
||||
return BI_OPCODE_ICMP_OR_V4S8;
|
||||
case BI_OPCODE_LSHIFT_AND_I32:
|
||||
return BI_OPCODE_ICMP_AND_S32;
|
||||
case BI_OPCODE_LSHIFT_AND_V2I16:
|
||||
return BI_OPCODE_ICMP_AND_V2S16;
|
||||
case BI_OPCODE_LSHIFT_AND_V4I8:
|
||||
return BI_OPCODE_ICMP_AND_V4S8;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
} else if (type == VA_CMP_TYPE_U) {
|
||||
switch (op) {
|
||||
case BI_OPCODE_LSHIFT_OR_I32:
|
||||
return BI_OPCODE_ICMP_OR_U32;
|
||||
case BI_OPCODE_LSHIFT_OR_V2I16:
|
||||
return BI_OPCODE_ICMP_OR_V2U16;
|
||||
case BI_OPCODE_LSHIFT_OR_V4I8:
|
||||
return BI_OPCODE_ICMP_OR_V4U8;
|
||||
case BI_OPCODE_LSHIFT_AND_I32:
|
||||
return BI_OPCODE_ICMP_AND_U32;
|
||||
case BI_OPCODE_LSHIFT_AND_V2I16:
|
||||
return BI_OPCODE_ICMP_AND_V2U16;
|
||||
case BI_OPCODE_LSHIFT_AND_V4I8:
|
||||
return BI_OPCODE_ICMP_AND_V4U8;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
assert(0 && "invalid va_cmp_type");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool
|
||||
va_cmp_can_fuse(enum bi_opcode op)
|
||||
{
|
||||
/* We only allow fusing with OR variants */
|
||||
switch (op) {
|
||||
case BI_OPCODE_FCMP_OR_F32:
|
||||
case BI_OPCODE_FCMP_OR_V2F16:
|
||||
case BI_OPCODE_ICMP_OR_S32:
|
||||
case BI_OPCODE_ICMP_OR_V2S16:
|
||||
case BI_OPCODE_ICMP_OR_V4S8:
|
||||
case BI_OPCODE_ICMP_OR_U32:
|
||||
case BI_OPCODE_ICMP_OR_V2U16:
|
||||
case BI_OPCODE_ICMP_OR_V4U8:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static enum va_cmp_type
|
||||
va_cmp_opcode_to_cmp_type(enum bi_opcode op)
|
||||
{
|
||||
switch (op) {
|
||||
case BI_OPCODE_FCMP_AND_F32:
|
||||
case BI_OPCODE_FCMP_AND_V2F16:
|
||||
case BI_OPCODE_FCMP_OR_F32:
|
||||
case BI_OPCODE_FCMP_OR_V2F16:
|
||||
return VA_CMP_TYPE_F;
|
||||
case BI_OPCODE_ICMP_AND_S32:
|
||||
case BI_OPCODE_ICMP_AND_V2S16:
|
||||
case BI_OPCODE_ICMP_OR_S32:
|
||||
case BI_OPCODE_ICMP_OR_V2S16:
|
||||
case BI_OPCODE_ICMP_OR_V4S8:
|
||||
return VA_CMP_TYPE_S;
|
||||
case BI_OPCODE_ICMP_AND_U32:
|
||||
case BI_OPCODE_ICMP_AND_V2U16:
|
||||
case BI_OPCODE_ICMP_OR_U32:
|
||||
case BI_OPCODE_ICMP_OR_V2U16:
|
||||
case BI_OPCODE_ICMP_OR_V4U8:
|
||||
return VA_CMP_TYPE_U;
|
||||
default:
|
||||
return VA_CMP_TYPE_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
/* LSHIFT_X_F32(FCMP_OR_F32(a, b, 0), FCMP_Y_F32(c, d, e), 0) -> FCMP_X_F32(a,
|
||||
* b, FCMP_Y_F32(c, d, e))) */
|
||||
static void
|
||||
va_fuse_cmp(bi_context *ctx, bi_instr **lut, const BITSET_WORD *multiple,
|
||||
bi_instr *I)
|
||||
{
|
||||
/* Expect SSA values on other sources */
|
||||
if (I->nr_srcs != 3 || !bi_is_ssa(I->src[0]) || !bi_is_ssa(I->src[1]))
|
||||
return;
|
||||
|
||||
bi_instr *src0_ins = lut[I->src[0].value];
|
||||
bi_instr *src1_ins = lut[I->src[1].value];
|
||||
|
||||
enum va_cmp_type cmp_type = va_cmp_opcode_to_cmp_type(src0_ins->op);
|
||||
|
||||
/* Expect both side to use the same form type */
|
||||
if (cmp_type == VA_CMP_TYPE_INVALID ||
|
||||
cmp_type != va_cmp_opcode_to_cmp_type(src1_ins->op))
|
||||
return;
|
||||
|
||||
/* Expect both side to use the same result type */
|
||||
if (src0_ins->result_type != src1_ins->result_type)
|
||||
return;
|
||||
|
||||
/* Ensure we really have a LSHIFT that we can remap (so without shift) */
|
||||
if (!va_remap_logical_to_logical_cmp(I->op, cmp_type) ||
|
||||
!bi_is_zero(I->src[2]))
|
||||
return;
|
||||
|
||||
bi_instr *old_ins;
|
||||
bi_index src2;
|
||||
|
||||
/* Try to fuse general case of LSHIFT_X_F32(FCMP_OR_F32(a, b, 0),
|
||||
* FCMP_Y_F32(c, d, e), 0), otherwise try to fuse LSHIFT_OR_F32(FCMP_Y_F32(c,
|
||||
* d, e), FCMP_OR_F32(a, b, 0), 0) */
|
||||
if (va_cmp_can_fuse(src0_ins->op) &&
|
||||
!BITSET_TEST(multiple, src0_ins->dest[0].value) &&
|
||||
bi_is_zero(src0_ins->src[2])) {
|
||||
old_ins = src0_ins;
|
||||
src2 = src1_ins->dest[0];
|
||||
} else if ((I->op == BI_OPCODE_LSHIFT_OR_I32 ||
|
||||
I->op == BI_OPCODE_LSHIFT_OR_V2I16) &&
|
||||
va_cmp_can_fuse(src1_ins->op) &&
|
||||
!BITSET_TEST(multiple, src1_ins->dest[0].value) &&
|
||||
bi_is_zero(src1_ins->src[2])) {
|
||||
old_ins = src1_ins;
|
||||
src2 = src0_ins->dest[0];
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Replace old LSHIFT logic op with the CMP with correct logical op and
|
||||
* accumulate other src */
|
||||
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
|
||||
bi_instr *new_ins =
|
||||
bi_fcmp_or_f32_to(&b, I->dest[0], old_ins->src[0], old_ins->src[1], src2,
|
||||
old_ins->cmpf, old_ins->result_type);
|
||||
bi_set_opcode(new_ins, va_remap_logical_to_logical_cmp(I->op, cmp_type));
|
||||
|
||||
/* Remove the old instructions */
|
||||
lut[old_ins->dest[0].value] = NULL;
|
||||
lut[new_ins->dest[0].value] = new_ins;
|
||||
bi_remove_instruction(old_ins);
|
||||
bi_remove_instruction(I);
|
||||
}
|
||||
|
||||
static void
|
||||
va_optimize_forward(bi_context *ctx)
|
||||
{
|
||||
unsigned count = ctx->ssa_alloc;
|
||||
bi_instr **lut = calloc(count, sizeof(*lut));
|
||||
bi_instr **uses = calloc(count, sizeof(*uses));
|
||||
BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple));
|
||||
|
||||
/* Record usage across blocks */
|
||||
bi_foreach_block(ctx, block) {
|
||||
bi_foreach_instr_in_block(block, I) {
|
||||
bi_foreach_dest(I, d) {
|
||||
lut[I->dest[d].value] = I;
|
||||
}
|
||||
|
||||
bi_foreach_ssa_src(I, s) {
|
||||
bi_record_use(uses, multiple, I, s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bi_foreach_instr_global_safe(ctx, I) {
|
||||
va_fuse_cmp(ctx, lut, multiple, I);
|
||||
}
|
||||
|
||||
free(uses);
|
||||
free(multiple);
|
||||
}
|
||||
|
||||
void
|
||||
va_optimize(bi_context *ctx)
|
||||
{
|
||||
bi_foreach_instr_global(ctx, I) {
|
||||
va_fuse_add_imm(I);
|
||||
}
|
||||
|
||||
va_optimize_forward(ctx);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue