diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 9fdb313fad3..dcdb96ed35b 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1357,6 +1357,15 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
 # Should be used in the shader preamble.
 intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE])
 
+intrinsic("brcst_active_ir3", dest_comp=1, src_comp=[1, 1], bit_sizes=src0,
+          indices=[CLUSTER_SIZE])
+intrinsic("reduce_clusters_ir3", dest_comp=1, src_comp=[1], bit_sizes=src0,
+          indices=[REDUCTION_OP])
+intrinsic("inclusive_scan_clusters_ir3", dest_comp=1, src_comp=[1],
+          bit_sizes=src0, indices=[REDUCTION_OP])
+intrinsic("exclusive_scan_clusters_ir3", dest_comp=1, src_comp=[1, 1],
+          bit_sizes=src0, indices=[REDUCTION_OP])
+
 # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
 # within a blend shader to read/write the raw value from the tile buffer,
 # without applying any format conversion in the process. If the shader needs
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
index f842e7fdb1b..2ba4df7a127 100644
--- a/src/freedreno/ir3/disasm-a3xx.c
+++ b/src/freedreno/ir3/disasm-a3xx.c
@@ -193,6 +193,7 @@ static const struct opc_info {
    OPC(1, OPC_READ_FIRST_MACRO, read_first.macro),
    OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
    OPC(1, OPC_SCAN_MACRO, scan.macro),
+   OPC(1, OPC_SCAN_CLUSTERS_MACRO, scan_clusters.macro),
    OPC(1, OPC_SHPS_MACRO, shps.macro),
    OPC(1, OPC_PUSH_CONSTS_LOAD_MACRO, push_consts_load.macro),
 
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index 777cfeb7113..0279abf1e63 100644
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -130,6 +130,7 @@ typedef enum {
 
    /* Macros that expand to a loop */
    OPC_SCAN_MACRO = _OPC(1, 58),
+   OPC_SCAN_CLUSTERS_MACRO = _OPC(1, 60),
 
    /* Macros that expand to an stsc at the start of the preamble.
     * It loads into const file and should not be optimized in any way.
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 79ae245cd26..d326de7c080 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -619,11 +619,12 @@ struct ir3_array {
 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
 
 enum ir3_branch_type {
-   IR3_BRANCH_COND,   /* condition */
-   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
-   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
-   IR3_BRANCH_GETONE, /* subgroupElect() */
-   IR3_BRANCH_SHPS,   /* preamble start */
+   IR3_BRANCH_COND,    /* condition */
+   IR3_BRANCH_ANY,     /* subgroupAny(condition) */
+   IR3_BRANCH_ALL,     /* subgroupAll(condition) */
+   IR3_BRANCH_GETONE,  /* subgroupElect() */
+   IR3_BRANCH_GETLAST, /* getlast.w8 */
+   IR3_BRANCH_SHPS,    /* preamble start */
 };
 
 struct ir3_block {
@@ -2328,6 +2329,7 @@ INSTR1NODST(PREDT)
 INSTR0(PREDF)
 INSTR0(PREDE)
 INSTR0(GETONE)
+INSTR0(GETLAST)
 INSTR0(SHPS)
 INSTR0(SHPE)
 
@@ -2481,6 +2483,26 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
    return sam;
 }
 
+/* brcst.active rx, ry behaves like a conditional move: rx either keeps its
+ * value or is set to ry. In order to model this in SSA form, we add an extra
+ * argument (the initial value of rx) and tie it to the destination.
+ */
+static inline struct ir3_instruction *
+ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
+                 struct ir3_instruction *src,
+                 struct ir3_instruction *dst_default)
+{
+   struct ir3_instruction *brcst =
+      ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
+   brcst->cat5.cluster_size = cluster_size;
+   brcst->cat5.type = TYPE_U32;
+   struct ir3_register *brcst_dst = __ssa_dst(brcst);
+   __ssa_src(brcst, src, 0);
+   struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
+   ir3_reg_tie(brcst_dst, default_src);
+   return brcst;
+}
+
 /* cat6 instructions: */
 INSTR0(GETFIBERID)
 INSTR2(LDLV)
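Not part of the patch: a minimal scalar sketch of the conditional-move behaviour the
comment above describes, spelling out the role of the tied source. This is an
illustrative assumption about the per-fiber outcome, not the hardware definition of
brcst.active.

   /* Illustrative only: per-fiber view of "rx either keeps its value or is
    * set to ry". The tied default stands in for rx's old value, which plain
    * SSA could not otherwise express.
    */
   static inline uint32_t
   brcst_active_model(bool got_broadcast, uint32_t broadcast_val,
                      uint32_t dst_default)
   {
      return got_broadcast ? broadcast_val : dst_default;
   }

In the NIR lowering added later in this patch, the default passed to brcst_active_ir3
is always the operation's identity, so a fiber that receives no broadcast value
contributes nothing extra when the result is folded into its running scan value.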
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index bcceaa678d4..8216e95c9f1 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1984,6 +1984,111 @@ emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    return create_multidst_mov(ctx->block, dst);
 }
 
+static struct ir3_instruction *
+emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
+                               nir_intrinsic_instr *intr)
+{
+   nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
+   reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
+   unsigned dst_size = intr->def.bit_size;
+
+   bool need_exclusive =
+      intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
+   bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;
+
+   /* Note: the shared reg is initialized to the identity, so we need it to
+    * always be 32-bit even when the source isn't because half shared regs are
+    * not supported.
+    */
+   struct ir3_instruction *identity =
+      create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
+   identity->dsts[0]->flags |= IR3_REG_SHARED;
+
+   /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
+    * - Shared reg reduction result, must be initialized to the identity
+    * - Inclusive scan result
+    * - (iff exclusive) Exclusive scan result. Conditionally added because
+    *   calculating the exclusive value is optional (i.e., not a side-effect of
+    *   calculating the inclusive value) and won't be DCE'd anymore at this
+    *   point.
+    * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
+    *   most ops but this isn't possible for the 32b mul_u macro since its
+    *   destination is clobbered. So conditionally allocate an extra
+    *   register in that case.
+    *
+    * Note that the getlast loop this macro expands to iterates over all
+    * clusters. However, for each iteration, not only the fibers in the current
+    * cluster are active but all later ones as well. Since they still need their
+    * sources when their cluster is handled, all destinations interfere with
+    * the sources.
+    */
+   unsigned ndst = 2 + need_exclusive + need_scratch;
+   unsigned nsrc = 2 + need_exclusive;
+   struct ir3_instruction *scan =
+      ir3_instr_create(ctx->block, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
+   scan->cat1.reduce_op = reduce_op;
+
+   unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
+   if (ir3_bitsize(ctx, dst_size) == 16)
+      dst_flags |= IR3_REG_HALF;
+
+   struct ir3_register *reduce = __ssa_dst(scan);
+   reduce->flags |= IR3_REG_SHARED;
+   struct ir3_register *inclusive = __ssa_dst(scan);
+   inclusive->flags |= dst_flags;
+
+   struct ir3_register *exclusive = NULL;
+   if (need_exclusive) {
+      exclusive = __ssa_dst(scan);
+      exclusive->flags |= dst_flags;
+   }
+
+   if (need_scratch) {
+      struct ir3_register *scratch = __ssa_dst(scan);
+      scratch->flags |= dst_flags;
+   }
+
+   struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
+   ir3_reg_tie(reduce, reduce_init);
+
+   struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
+   __ssa_src(scan, inclusive_src, 0);
+
+   if (need_exclusive) {
+      struct ir3_instruction *exclusive_src =
+         ir3_get_src(ctx, &intr->src[1])[0];
+      __ssa_src(scan, exclusive_src, 0);
+   }
+
+   struct ir3_register *dst;
+   switch (intr->intrinsic) {
+   case nir_intrinsic_reduce_clusters_ir3:
+      dst = reduce;
+      break;
+   case nir_intrinsic_inclusive_scan_clusters_ir3:
+      dst = inclusive;
+      break;
+   case nir_intrinsic_exclusive_scan_clusters_ir3: {
+      assert(exclusive != NULL);
+      dst = exclusive;
+      break;
+   }
+   default:
+      unreachable("unknown reduce intrinsic");
+   }
+
+   return create_multidst_mov(ctx->block, dst);
+}
+
+static struct ir3_instruction *
+emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+   struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
+   struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
+   return ir3_BRCST_ACTIVE(ctx->block, nir_intrinsic_cluster_size(intr),
+                           brcst_val, default_src);
+}
+
 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 
@@ -2637,6 +2742,16 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       dst[0] = emit_intrinsic_reduce(ctx, intr);
       break;
 
+   case nir_intrinsic_reduce_clusters_ir3:
+   case nir_intrinsic_inclusive_scan_clusters_ir3:
+   case nir_intrinsic_exclusive_scan_clusters_ir3:
+      dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
+      break;
+
+   case nir_intrinsic_brcst_active_ir3:
+      dst[0] = emit_intrinsic_brcst_active(ctx, intr);
+      break;
+
    case nir_intrinsic_preamble_end_ir3: {
       struct ir3_instruction *instr = ir3_SHPE(ctx->block);
       instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
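Not part of the patch: a self-contained sketch of the destination layout set up above.
The helper names are invented for illustration; the invariant they encode (the scratch
slot, explicit only for 32-bit mul_u, is always the last destination) is the one the
lowering and validator later in this patch rely on.

   #include <assert.h>
   #include <stdbool.h>

   /* dsts: reduce (shared), inclusive, [exclusive], [scratch] */
   static unsigned
   scan_clusters_ndst(bool need_exclusive, bool need_scratch)
   {
      return 2 + need_exclusive + need_scratch;
   }

   /* The scratch is the explicit extra dst for 32-bit mul_u, otherwise
    * exclusive (if present), otherwise inclusive: always the last dst. */
   static unsigned
   scan_clusters_scratch_idx(bool need_exclusive, bool need_scratch)
   {
      return scan_clusters_ndst(need_exclusive, need_scratch) - 1;
   }

   int
   main(void)
   {
      /* 32-bit exclusive mul_u scan: reduce, inclusive, exclusive, scratch */
      assert(scan_clusters_ndst(true, true) == 4 &&
             scan_clusters_scratch_idx(true, true) == 3);
      /* exclusive iadd scan: the exclusive dst doubles as the scratch */
      assert(scan_clusters_scratch_idx(true, false) == 2);
      /* plain reduce or inclusive scan: inclusive doubles as the scratch */
      assert(scan_clusters_scratch_idx(false, false) == 1);
      return 0;
   }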
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 795d3c7d7bb..990a6efacbc 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -695,6 +695,7 @@ block_sched(struct ir3 *ir)
       struct ir3_instruction *br1, *br2;
 
       if (block->brtype == IR3_BRANCH_GETONE ||
+          block->brtype == IR3_BRANCH_GETLAST ||
          block->brtype == IR3_BRANCH_SHPS) {
          /* getone/shps can't be inverted, and it wouldn't even make sense
           * to follow it with an inverted branch, so follow it by an
@@ -703,6 +704,8 @@ block_sched(struct ir3 *ir)
          assert(!block->condition);
          if (block->brtype == IR3_BRANCH_GETONE)
             br1 = ir3_GETONE(block);
+         else if (block->brtype == IR3_BRANCH_GETLAST)
+            br1 = ir3_GETLAST(block);
          else
             br1 = ir3_SHPS(block);
          br1->cat0.target = block->successors[1];
@@ -740,6 +743,7 @@ block_sched(struct ir3 *ir)
          br2->cat0.brtype = BRANCH_ANY;
          break;
       case IR3_BRANCH_GETONE:
+      case IR3_BRANCH_GETLAST:
       case IR3_BRANCH_SHPS:
          unreachable("can't get here");
       }
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index da416497d12..91b99b7df65 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c
@@ -22,6 +22,7 @@
  */
 
 #include "ir3.h"
+#include "ir3_nir.h"
 #include "util/ralloc.h"
 
 /* Lower several macro-instructions needed for shader subgroup support that
@@ -241,6 +242,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
    case OPC_READ_COND_MACRO:
    case OPC_SWZ_SHARED_MACRO:
    case OPC_SCAN_MACRO:
+   case OPC_SCAN_CLUSTERS_MACRO:
       break;
    case OPC_READ_FIRST_MACRO:
      /* Moves to shared registers read the first active fiber, so we can just
@@ -313,6 +315,79 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
       mov_reg(exit, exclusive, reduce);
       do_reduce(exit, instr->cat1.reduce_op, inclusive, src, exclusive);
       mov_reg(exit, reduce, inclusive);
+   } else if (instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
+      /* The pseudo-code for the scan macro is:
+       *
+       * while (true) {
+       * body:
+       *    scratch = reduce;
+       *
+       *    inclusive = inclusive_src OP scratch;
+       *
+       *    static if (is exclusive scan)
+       *       exclusive = exclusive_src OP scratch
+       *
+       *    if (getlast()) {
+       * store:
+       *       reduce = inclusive;
+       *       if (elect())
+       *          break;
+       *    } else {
+       *       break;
+       *    }
+       * }
+       * after_block:
+       */
+      struct ir3_block *body = ir3_block_create(ir);
+      list_add(&body->node, &before_block->node);
+
+      struct ir3_block *store = ir3_block_create(ir);
+      list_add(&store->node, &body->node);
+
+      link_blocks(before_block, body, 0);
+
+      link_blocks(body, store, 0);
+      link_blocks(body, after_block, 1);
+      body->brtype = IR3_BRANCH_GETLAST;
+
+      link_blocks(store, after_block, 0);
+      link_blocks(store, body, 1);
+      store->brtype = IR3_BRANCH_GETONE;
+
+      struct ir3_register *reduce = instr->dsts[0];
+      struct ir3_register *inclusive = instr->dsts[1];
+      struct ir3_register *inclusive_src = instr->srcs[1];
+
+      /* We need to perform the following operations:
+       * - inclusive = inclusive_src OP reduce
+       * - exclusive = exclusive_src OP reduce (iff exclusive scan)
+       * Since reduce is initially in a shared register, we need to copy it to a
+       * scratch register before performing the operations.
+       *
+       * The scratch register used is:
+       * - an explicitly allocated one if op is 32b mul_u.
+       *   - necessary because we cannot do 'foo = foo mul_u bar' since mul_u
+       *     clobbers its destination.
+       * - exclusive if this is an exclusive scan (and not 32b mul_u).
+       *   - since we calculate inclusive first.
+       * - inclusive otherwise.
+       *
+       * In all cases, this is the last destination.
+       */
+      struct ir3_register *scratch = instr->dsts[instr->dsts_count - 1];
+
+      mov_reg(body, scratch, reduce);
+      do_reduce(body, instr->cat1.reduce_op, inclusive, inclusive_src, scratch);
+
+      /* exclusive scan */
+      if (instr->srcs_count == 3) {
+         struct ir3_register *exclusive_src = instr->srcs[2];
+         struct ir3_register *exclusive = instr->dsts[2];
+         do_reduce(body, instr->cat1.reduce_op, exclusive, exclusive_src,
+                   scratch);
+      }
+
+      mov_reg(store, reduce, inclusive);
    } else {
       struct ir3_block *then_block = create_if(ir, before_block, after_block);
 
@@ -447,3 +522,65 @@ ir3_lower_subgroups(struct ir3 *ir)
 
    return progress;
 }
+
+static bool
+filter_scan_reduce(const nir_instr *instr, const void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_reduce:
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static nir_def *
+lower_scan_reduce(struct nir_builder *b, nir_instr *instr, void *data)
+{
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   unsigned bit_size = intrin->def.bit_size;
+
+   nir_op op = nir_intrinsic_reduction_op(intrin);
+   nir_const_value ident_val = nir_alu_binop_identity(op, bit_size);
+   nir_def *ident = nir_build_imm(b, 1, bit_size, &ident_val);
+   nir_def *inclusive = intrin->src[0].ssa;
+   nir_def *exclusive = ident;
+
+   for (unsigned cluster_size = 2; cluster_size <= 8; cluster_size *= 2) {
+      nir_def *brcst = nir_brcst_active_ir3(b, ident, inclusive,
+                                            .cluster_size = cluster_size);
+      inclusive = nir_build_alu2(b, op, inclusive, brcst);
+
+      if (intrin->intrinsic == nir_intrinsic_exclusive_scan)
+         exclusive = nir_build_alu2(b, op, exclusive, brcst);
+   }
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_reduce:
+      return nir_reduce_clusters_ir3(b, inclusive, .reduction_op = op);
+   case nir_intrinsic_inclusive_scan:
+      return nir_inclusive_scan_clusters_ir3(b, inclusive, .reduction_op = op);
+   case nir_intrinsic_exclusive_scan:
+      return nir_exclusive_scan_clusters_ir3(b, inclusive, exclusive,
+                                             .reduction_op = op);
+   default:
+      unreachable("filtered intrinsic");
+   }
+}
+
+bool
+ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v)
+{
+   if (!v->compiler->has_getfiberid)
+      return false;
+
+   return nir_shader_lower_instructions(nir, filter_scan_reduce,
+                                        lower_scan_reduce, NULL);
+}
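Not part of the patch: a standalone C model of what the two halves compute together,
reading the pseudo-code and comments above. ir3_nir_opt_subgroups leaves each fiber
holding an inclusive (and, for exclusive scans, exclusive) scan over the active fibers
of its cluster of 8, and the SCAN_CLUSTERS macro's getlast loop then visits the
clusters in order while the shared register carries the running total. iadd stands in
for the reduction op, and ordinary loops stand in for fiber-parallel execution; this
only illustrates the resulting values, not the hardware behaviour of brcst.active or
getlast.

   #include <stdbool.h>
   #include <stdint.h>

   #define CLUSTER_SIZE 8

   /* Model of an iadd inclusive scan plus subgroup reduction over one
    * subgroup. Step 1 stands in for the brcst_active ladder built by
    * lower_scan_reduce(); step 2 follows the getlast loop: each fiber
    * combines its within-cluster value with the running total, and the last
    * active fiber of the retiring cluster provides the new total.
    */
   static uint32_t
   model_inclusive_scan(const uint32_t *src, const bool *active,
                        uint32_t *inclusive, unsigned nfibers)
   {
      uint32_t reduce = 0; /* identity for iadd, cf. get_reduce_identity() */

      for (unsigned base = 0; base < nfibers; base += CLUSTER_SIZE) {
         /* Step 1: within-cluster inclusive scan (the "inclusive_src"). */
         uint32_t incl_src[CLUSTER_SIZE];
         uint32_t acc = 0;
         for (unsigned i = 0; i < CLUSTER_SIZE && base + i < nfibers; i++) {
            if (active[base + i])
               acc += src[base + i];
            incl_src[i] = acc;
         }

         /* Step 2: one trip of the cluster loop. */
         unsigned last_active = ~0u;
         for (unsigned i = 0; i < CLUSTER_SIZE && base + i < nfibers; i++) {
            if (!active[base + i])
               continue;
            inclusive[base + i] = incl_src[i] + reduce;
            last_active = base + i;
         }
         if (last_active != ~0u)
            reduce = inclusive[last_active];
      }

      return reduce; /* what reduce_clusters_ir3 would return */
   }

An exclusive scan would add the analogous computation on exclusive_src, seeded with
the identity instead of the fiber's own contribution, which is why the macro carries
the extra source/destination pair.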
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index e3596c54b9b..793d3084d07 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -740,6 +740,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
       progress |= OPT(s, nir_opt_constant_folding);
    }
 
+   OPT(s, ir3_nir_opt_subgroups, so);
+
    /* Do the preamble before analysing UBO ranges, because it's usually
     * higher-value and because it can result in eliminating some indirect UBO
     * accesses where otherwise we'd have to push the whole range. However we
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index a4adde07225..d311096c2d3 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -86,6 +86,8 @@
 nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_def *offset,
                                          int32_t shift);
 
+bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);
+
 static inline nir_intrinsic_instr *
 ir3_bindless_resource(nir_src src)
 {
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index 3e3c54ff72b..d3a86bc64a5 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c
@@ -137,7 +137,8 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
                               disasm_a3xx_instr_name(instr->opc));
    }
 
-   if (instr->opc == OPC_SCAN_MACRO) {
+   if (instr->opc == OPC_SCAN_MACRO ||
+       instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
       switch (instr->cat1.reduce_op) {
       case REDUCE_OP_ADD_U:
          mesa_log_stream_printf(stream, ".add.u");
@@ -548,6 +549,9 @@ print_block(struct ir3_block *block, int lvl)
       case IR3_BRANCH_GETONE:
         mesa_log_stream_printf(stream, "getone ");
         break;
+      case IR3_BRANCH_GETLAST:
+         mesa_log_stream_printf(stream, "getlast ");
+         break;
       case IR3_BRANCH_SHPS:
         mesa_log_stream_printf(stream, "shps ");
         break;
diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index 3bd89566a4c..30575c3012e 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c
@@ -249,6 +249,27 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
       validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
                               reg_class_flags(instr->srcs[0]));
       validate_assert(ctx, reg_class_flags(instr->dsts[2]) == IR3_REG_SHARED);
+   } else if (instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
+      validate_assert(ctx, instr->dsts_count >= 2 && instr->dsts_count < 5);
+      validate_assert(ctx, instr->srcs_count >= 2 && instr->srcs_count < 4);
+      validate_assert(ctx,
+                      reg_class_flags(instr->dsts[0]) == IR3_REG_SHARED);
+      validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
+                              reg_class_flags(instr->srcs[1]));
+
+      /* exclusive scan */
+      if (instr->srcs_count == 3) {
+         validate_assert(ctx, instr->dsts_count >= 3);
+         validate_assert(ctx, reg_class_flags(instr->srcs[2]) ==
+                                 reg_class_flags(instr->srcs[1]));
+         validate_assert(ctx, reg_class_flags(instr->dsts[2]) ==
+                                 reg_class_flags(instr->srcs[1]));
+      }
+
+      /* scratch register */
+      validate_assert(ctx,
+                      reg_class_flags(instr->dsts[instr->dsts_count - 1]) ==
+                         reg_class_flags(instr->srcs[1]));
    } else {
       foreach_dst (dst, instr)
          validate_reg_size(ctx, dst, instr->cat1.dst_type);