ir3: optimize bitwise ops that can directly write predicates

On a6xx+, bitwise operations can directly write to predicate registers.
The result will be 1 iff the result of the non-predicate operation would
be non-zero.

When generating instructions that need a predicate source, ir3 will
insert a cmps.s.ne 0 instruction to guarantee a predicate can be
produced. This is kept in place by this patch and we add a pass that
tries to optimize useless comparisons away.

Concretely:
- Look through chains of multiple cmps.s.ne instructions and remove all
  but the first.
- If the source of the cmps.s.ne can write directly to predicates,
  remove the cmps.s.ne.

In both cases, no instructions are actually removed but clones are made
and we rely on DCE to remove anything that became unused. Note that it's
fine to always make a clone since even in the case that the original
instruction is also used for non-predicate sources (so it won't be
DCE'd), we replaced a cmps.ne.s with another instruction so this pass
should never increase instruction count.

Note that this pass replaces the double-comparison folding that was
performed by ir3_cp before.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27411>
This commit is contained in:
Job Noorman 2024-02-01 14:51:22 +01:00 committed by Marge Bot
parent 9905b6f2f4
commit c6a6902e4d
7 changed files with 172 additions and 44 deletions

View file

@ -1970,6 +1970,8 @@ soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
}
}
bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);
/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

View file

@ -157,6 +157,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->local_mem_size = dev_info->cs_shared_mem_size;
compiler->num_predicates = 1;
compiler->bitops_can_write_predicates = false;
if (compiler->gen >= 6) {
compiler->samgq_workaround = true;
@ -215,6 +216,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
compiler->num_predicates = 4;
compiler->bitops_can_write_predicates = true;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;

View file

@ -217,6 +217,9 @@ struct ir3_compiler {
/* Number of available predicate registers (p0.c) */
uint32_t num_predicates;
/* True if bitops (and.b, or.b, xor.b, not.b) can write to p0.c */
bool bitops_can_write_predicates;
/* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
uint32_t max_variable_workgroup_size;

View file

@ -4986,6 +4986,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
progress |= IR3_PASS(ir, ir3_cp, so);
progress |= IR3_PASS(ir, ir3_cse);
progress |= IR3_PASS(ir, ir3_dce, so);
progress |= IR3_PASS(ir, ir3_opt_predicates, so);
} while (progress);
/* at this point, for binning pass, throw away unneeded outputs:

View file

@ -89,24 +89,6 @@ is_eligible_mov(struct ir3_instruction *instr,
return false;
}
/* we can end up with extra cmps.s from frontend, which uses a
*
* cmps.s p0.x, cond, 0
*
* as a way to mov into the predicate register. But frequently 'cond'
* is itself a cmps.s/cmps.f/cmps.u. So detect this special case.
*/
static bool
is_foldable_double_cmp(struct ir3_instruction *cmp)
{
struct ir3_instruction *cond = ssa(cmp->srcs[0]);
return (cmp->dsts[0]->flags & IR3_REG_PREDICATE) && cond &&
(cmp->srcs[1]->flags & IR3_REG_IMMED) &&
(cmp->srcs[1]->iim_val == 0) &&
(cmp->cat2.condition == IR3_COND_NE) &&
(!cond->address || cond->address->def->instr->block == cmp->block);
}
/* propagate register flags from src to dst.. negates need special
* handling to cancel each other out.
*/
@ -611,32 +593,6 @@ instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
ctx->progress = true;
}
/* Re-write the instruction writing predicate register to get rid
* of the double cmps.
*/
if ((instr->opc == OPC_CMPS_S) && is_foldable_double_cmp(instr)) {
struct ir3_instruction *cond = ssa(instr->srcs[0]);
switch (cond->opc) {
case OPC_CMPS_S:
case OPC_CMPS_F:
case OPC_CMPS_U:
instr->opc = cond->opc;
instr->flags = cond->flags;
instr->cat2 = cond->cat2;
if (cond->address)
ir3_instr_set_address(instr, cond->address->def->instr);
instr->srcs[0] = ir3_reg_clone(ctx->shader, cond->srcs[0]);
instr->srcs[1] = ir3_reg_clone(ctx->shader, cond->srcs[1]);
instr->barrier_class |= cond->barrier_class;
instr->barrier_conflict |= cond->barrier_conflict;
unuse(cond);
ctx->progress = true;
break;
default:
break;
}
}
/* Handle converting a sam.s2en (taking samp/tex idx params via register)
* into a normal sam (encoding immediate samp/tex idx) if they are
* immediate. This saves some instructions and regs in the common case

View file

@ -0,0 +1,163 @@
/*
* Copyright © 2024 Igalia S.L.
* SPDX-License-Identifier: MIT
*/
#include "ir3.h"
#include "ir3_shader.h"
/* This pass tries to optimize away cmps.s.ne instructions created by
* ir3_get_predicate in order to write predicates. It does two things:
* - Look through chains of multiple cmps.s.ne instructions and remove all but
* the first.
* - If the source of the cmps.s.ne can write directly to predicates (true for
* bitops on a6xx+), remove the cmps.s.ne.
*
* In both cases, no instructions are actually removed but clones are made and
* we rely on DCE to remove anything that became unused. Note that it's fine to
* always make a clone since even in the case that the original instruction is
* also used for non-predicate sources (so it won't be DCE'd), we replaced a
* cmps.ne.s with another instruction so this pass should never increase
* instruction count.
*/
struct opt_predicates_ctx {
struct ir3 *ir;
/* Map from instructions to their clones with a predicate destination. Used
* to prevent instructions being cloned multiple times.
*/
struct hash_table *predicate_clones;
};
static struct ir3_instruction *
clone_with_predicate_dst(struct opt_predicates_ctx *ctx,
struct ir3_instruction *instr)
{
struct hash_entry *entry =
_mesa_hash_table_search(ctx->predicate_clones, instr);
if (entry)
return entry->data;
assert(instr->dsts_count == 1);
assert(!(instr->dsts[0]->flags & IR3_REG_SHARED));
struct ir3_instruction *clone = ir3_instr_clone(instr);
ir3_instr_move_after(clone, instr);
clone->dsts[0]->flags |= IR3_REG_PREDICATE;
clone->dsts[0]->flags &= ~IR3_REG_HALF;
_mesa_hash_table_insert(ctx->predicate_clones, instr, clone);
return clone;
}
static bool
can_write_predicate(struct opt_predicates_ctx *ctx,
struct ir3_instruction *instr)
{
switch (instr->opc) {
case OPC_CMPS_S:
case OPC_CMPS_U:
case OPC_CMPS_F:
return true;
case OPC_AND_B:
case OPC_OR_B:
case OPC_NOT_B:
case OPC_XOR_B:
case OPC_GETBIT_B:
return ctx->ir->compiler->bitops_can_write_predicates;
default:
return false;
}
}
/* Detects the pattern used by ir3_get_predicate to write a predicate register:
* cmps.s.ne pssa_x, ssa_y, 0
*/
static bool
is_gpr_to_predicate_mov(struct ir3_instruction *instr)
{
return (instr->opc == OPC_CMPS_S) &&
(instr->cat2.condition == IR3_COND_NE) &&
(instr->srcs[0]->flags & IR3_REG_SSA) &&
(instr->srcs[1]->flags & IR3_REG_IMMED) &&
(instr->srcs[1]->iim_val == 0);
}
/* Look through a chain of cmps.s.ne 0 instructions to find the initial source.
* Return it if it can write to predicates. Otherwise, return the first
* cmps.s.ne in the chain.
*/
static struct ir3_register *
resolve_predicate_def(struct opt_predicates_ctx *ctx, struct ir3_register *src)
{
struct ir3_register *def = src->def;
while (is_gpr_to_predicate_mov(def->instr)) {
struct ir3_register *next_def = def->instr->srcs[0]->def;
if (!can_write_predicate(ctx, next_def->instr))
return def;
def = next_def;
}
return def;
}
/* Find all predicate sources and try to replace their defs with instructions
* that can directly write to predicates.
*/
static bool
opt_instr(struct opt_predicates_ctx *ctx, struct ir3_instruction *instr)
{
bool progress = false;
foreach_src (src, instr) {
if (!(src->flags & IR3_REG_PREDICATE))
continue;
struct ir3_register *def = resolve_predicate_def(ctx, src);
if (src->def == def)
continue;
assert(can_write_predicate(ctx, def->instr) &&
!(def->flags & IR3_REG_PREDICATE));
struct ir3_instruction *predicate =
clone_with_predicate_dst(ctx, def->instr);
assert(predicate->dsts_count == 1);
src->def = predicate->dsts[0];
progress = true;
}
return progress;
}
static bool
opt_blocks(struct opt_predicates_ctx *ctx)
{
bool progress = false;
foreach_block (block, &ctx->ir->block_list) {
foreach_instr (instr, &block->instr_list) {
progress |= opt_instr(ctx, instr);
}
}
return progress;
}
bool
ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v)
{
struct opt_predicates_ctx *ctx = rzalloc(NULL, struct opt_predicates_ctx);
ctx->ir = ir;
ctx->predicate_clones = _mesa_pointer_hash_table_create(ctx);
bool progress = opt_blocks(ctx);
ralloc_free(ctx);
return progress;
}

View file

@ -104,6 +104,7 @@ libfreedreno_ir3_files = files(
'ir3_nir_move_varying_inputs.c',
'ir3_nir_lower_layer_id.c',
'ir3_nir_opt_preamble.c',
'ir3_opt_predicates.c',
'ir3_postsched.c',
'ir3_print.c',
'ir3_ra.c',