/*
 * Patch summary (ir3): introduces OPC_SCAN_MACRO and the reduce_op_t enum,
 * emits the macro for nir_intrinsic_reduce / inclusive_scan / exclusive_scan,
 * and lowers it in ir3_lower_subgroups.c to a getone loop. The disassembler
 * opcode table, IR printing and validation are updated for the new opcode.
 *
 * NOTE(review): line breaks and indentation below were reconstructed from a
 * whitespace-collapsed copy; column alignment inside lines was not recovered.
 * Confirm the context lines against the tree before applying.
 */
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
index c0feeb13730..e9f29b6a1d6 100644
--- a/src/freedreno/ir3/disasm-a3xx.c
+++ b/src/freedreno/ir3/disasm-a3xx.c
@@ -192,6 +192,7 @@ static const struct opc_info {
    OPC(1, OPC_READ_COND_MACRO, read_cond.macro),
    OPC(1, OPC_READ_FIRST_MACRO, read_first.macro),
    OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
+   OPC(1, OPC_SCAN_MACRO, scan.macro),
 
    /* category 2: */
    OPC(2, OPC_ADD_F, add.f),
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index 069324845ae..f1f41de3710 100644
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -127,6 +127,9 @@ typedef enum {
    OPC_READ_FIRST_MACRO = _OPC(1, 55),
    OPC_SWZ_SHARED_MACRO = _OPC(1, 56),
 
+   /* Macros that expand to a loop */
+   OPC_SCAN_MACRO = _OPC(1, 57),
+
    /* category 2: */
    OPC_ADD_F = _OPC(2, 0),
    OPC_MIN_F = _OPC(2, 1),
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index c71e8da329d..25de40d95d8 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -239,6 +239,22 @@ struct ir3_register {
       arr[arr##_count++] = __VA_ARGS__; \
    } while (0)
 
+typedef enum {
+   REDUCE_OP_ADD_U,
+   REDUCE_OP_ADD_F,
+   REDUCE_OP_MUL_U,
+   REDUCE_OP_MUL_F,
+   REDUCE_OP_MIN_U,
+   REDUCE_OP_MIN_S,
+   REDUCE_OP_MIN_F,
+   REDUCE_OP_MAX_U,
+   REDUCE_OP_MAX_S,
+   REDUCE_OP_MAX_F,
+   REDUCE_OP_AND_B,
+   REDUCE_OP_OR_B,
+   REDUCE_OP_XOR_B,
+} reduce_op_t;
+
 struct ir3_instruction {
    struct ir3_block *block;
    opc_t opc;
@@ -324,6 +340,7 @@ struct ir3_instruction {
       struct {
          type_t src_type, dst_type;
          round_t round;
+         reduce_op_t reduce_op;
       } cat1;
       struct {
          enum {
@@ -896,6 +913,7 @@ is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
    case OPC_READ_COND_MACRO:
    case OPC_READ_FIRST_MACRO:
    case OPC_SWZ_SHARED_MACRO:
+   case OPC_SCAN_MACRO:
       return true;
    default:
       return false;
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 09954806b71..0e730636731 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1823,6 +1823,148 @@ get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    return ctx->frag_coord;
 }
 
+/* This is a bit of a hack until ir3_context is converted to store SSA values
+ * as ir3_registers instead of ir3_instructions. Pick out a given destination
+ * of an instruction with multiple destinations using a mov that will get folded
+ * away by ir3_cp.
+ */
+static struct ir3_instruction *
+create_multidst_mov(struct ir3_block *block, struct ir3_register *dst)
+{
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   unsigned dst_flags = dst->flags & IR3_REG_HALF;
+   unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+
+   __ssa_dst(mov)->flags |= dst_flags;
+   struct ir3_register *src =
+      ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
+   src->wrmask = dst->wrmask;
+   src->def = dst;
+   debug_assert(!(dst->flags & IR3_REG_RELATIV));
+   mov->cat1.src_type = mov->cat1.dst_type =
+      (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   return mov;
+}
+
+static reduce_op_t
+get_reduce_op(nir_op opc)
+{
+   switch (opc) {
+   case nir_op_iadd: return REDUCE_OP_ADD_U;
+   case nir_op_fadd: return REDUCE_OP_ADD_F;
+   case nir_op_imul: return REDUCE_OP_MUL_U;
+   case nir_op_fmul: return REDUCE_OP_MUL_F;
+   case nir_op_umin: return REDUCE_OP_MIN_U;
+   case nir_op_imin: return REDUCE_OP_MIN_S;
+   case nir_op_fmin: return REDUCE_OP_MIN_F;
+   case nir_op_umax: return REDUCE_OP_MAX_U;
+   case nir_op_imax: return REDUCE_OP_MAX_S;
+   case nir_op_fmax: return REDUCE_OP_MAX_F;
+   case nir_op_iand: return REDUCE_OP_AND_B;
+   case nir_op_ior: return REDUCE_OP_OR_B;
+   case nir_op_ixor: return REDUCE_OP_XOR_B;
+   default:
+      unreachable("unknown NIR reduce op");
+   }
+}
+
+static uint32_t
+get_reduce_identity(nir_op opc, unsigned size)
+{
+   switch (opc) {
+   case nir_op_iadd:
+      return 0;
+   case nir_op_fadd:
+      return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
+   case nir_op_imul:
+      return 1;
+   case nir_op_fmul:
+      return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
+   case nir_op_umax:
+      return 0;
+   case nir_op_imax:
+      return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
+   case nir_op_fmax:
+      return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
+   case nir_op_umin:
+      return size == 32 ? UINT32_MAX : UINT16_MAX;
+   case nir_op_imin:
+      return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
+   case nir_op_fmin:
+      return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
+   case nir_op_iand:
+      return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
+   case nir_op_ior:
+      return 0;
+   case nir_op_ixor:
+      return 0;
+   default:
+      unreachable("unknown NIR reduce op");
+   }
+}
+
+static struct ir3_instruction *
+emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+   struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
+   nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
+   reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
+   unsigned dst_size = nir_dest_bit_size(intr->dest);
+   unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
+
+   /* Note: the shared reg is initialized to the identity, so we need it to
+    * always be 32-bit even when the source isn't because half shared regs are
+    * not supported.
+    */
+   struct ir3_instruction *identity =
+      create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
+   identity = ir3_READ_FIRST_MACRO(ctx->block, identity, 0);
+   identity->dsts[0]->flags |= IR3_REG_SHARED;
+
+   /* OPC_SCAN_MACRO has the following destinations:
+    * - Exclusive scan result (interferes with source)
+    * - Inclusive scan result
+    * - Shared reg reduction result, must be initialized to the identity
+    *
+    * The loop computes all three results at the same time; we just have to
+    * choose which destination to return.
+    */
+   struct ir3_instruction *scan =
+      ir3_instr_create(ctx->block, OPC_SCAN_MACRO, 3, 2);
+   scan->cat1.reduce_op = reduce_op;
+
+   struct ir3_register *exclusive = __ssa_dst(scan);
+   exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
+   struct ir3_register *inclusive = __ssa_dst(scan);
+   inclusive->flags |= flags;
+   struct ir3_register *reduce = __ssa_dst(scan);
+   reduce->flags |= IR3_REG_SHARED;
+
+   /* The 32-bit multiply macro reads its sources after writing a partial result
+    * to the destination, therefore inclusive also interferes with the source.
+    */
+   if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
+      inclusive->flags |= IR3_REG_EARLY_CLOBBER;
+
+   /* Normal source */
+   __ssa_src(scan, src, 0);
+
+   /* shared reg tied source */
+   struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
+   ir3_reg_tie(reduce, reduce_init);
+
+   struct ir3_register *dst;
+   switch (intr->intrinsic) {
+   case nir_intrinsic_reduce: dst = reduce; break;
+   case nir_intrinsic_inclusive_scan: dst = inclusive; break;
+   case nir_intrinsic_exclusive_scan: dst = exclusive; break;
+   default:
+      unreachable("unknown reduce intrinsic");
+   }
+
+   return create_multidst_mov(ctx->block, dst);
+}
+
 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 
@@ -2425,6 +2567,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       break;
    }
 
+   case nir_intrinsic_reduce:
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan:
+      dst[0] = emit_intrinsic_reduce(ctx, intr);
+      break;
+
    default:
       ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
                         nir_intrinsic_infos[intr->intrinsic].name);
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index 041be19f208..afc88a1e9ad 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c
@@ -71,14 +71,106 @@ mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
    mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
 }
 
+static void
+mov_reg(struct ir3_block *block, struct ir3_register *dst,
+        struct ir3_register *src)
+{
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+
+   struct ir3_register *mov_dst =
+      ir3_dst_create(mov, dst->num, dst->flags & (IR3_REG_HALF | IR3_REG_SHARED));
+   struct ir3_register *mov_src =
+      ir3_src_create(mov, src->num, src->flags & (IR3_REG_HALF | IR3_REG_SHARED));
+   mov_dst->wrmask = dst->wrmask;
+   mov_src->wrmask = src->wrmask;
+   mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
+
+   mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   mov->cat1.src_type = (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+}
+
+static void
+binop(struct ir3_block *block, opc_t opc, struct ir3_register *dst,
+      struct ir3_register *src0, struct ir3_register *src1)
+{
+   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2);
+
+   unsigned flags = dst->flags & IR3_REG_HALF;
+   struct ir3_register *instr_dst = ir3_dst_create(instr, dst->num, flags);
+   struct ir3_register *instr_src0 = ir3_src_create(instr, src0->num, flags);
+   struct ir3_register *instr_src1 = ir3_src_create(instr, src1->num, flags);
+
+   instr_dst->wrmask = dst->wrmask;
+   instr_src0->wrmask = src0->wrmask;
+   instr_src1->wrmask = src1->wrmask;
+   instr->repeat = util_last_bit(instr_dst->wrmask) - 1;
+}
+
+static void
+triop(struct ir3_block *block, opc_t opc, struct ir3_register *dst,
+      struct ir3_register *src0, struct ir3_register *src1,
+      struct ir3_register *src2)
+{
+   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 3);
+
+   unsigned flags = dst->flags & IR3_REG_HALF;
+   struct ir3_register *instr_dst = ir3_dst_create(instr, dst->num, flags);
+   struct ir3_register *instr_src0 = ir3_src_create(instr, src0->num, flags);
+   struct ir3_register *instr_src1 = ir3_src_create(instr, src1->num, flags);
+   struct ir3_register *instr_src2 = ir3_src_create(instr, src2->num, flags);
+
+   instr_dst->wrmask = dst->wrmask;
+   instr_src0->wrmask = src0->wrmask;
+   instr_src1->wrmask = src1->wrmask;
+   instr_src2->wrmask = src2->wrmask;
+   instr->repeat = util_last_bit(instr_dst->wrmask) - 1;
+}
+
+static void
+do_reduce(struct ir3_block *block, reduce_op_t opc,
+          struct ir3_register *dst, struct ir3_register *src0,
+          struct ir3_register *src1)
+{
+   switch (opc) {
+#define CASE(name) \
+   case REDUCE_OP_##name: \
+      binop(block, OPC_##name, dst, src0, src1); \
+      break;
+
+   CASE(ADD_U)
+   CASE(ADD_F)
+   CASE(MUL_F)
+   CASE(MIN_U)
+   CASE(MIN_S)
+   CASE(MIN_F)
+   CASE(MAX_U)
+   CASE(MAX_S)
+   CASE(MAX_F)
+   CASE(AND_B)
+   CASE(OR_B)
+   CASE(XOR_B)
+
+#undef CASE
+
+   case REDUCE_OP_MUL_U:
+      if (dst->flags & IR3_REG_HALF) {
+         binop(block, OPC_MUL_S24, dst, src0, src1);
+      } else {
+         /* 32-bit multiplication macro - see ir3_nir_imul */
+         binop(block, OPC_MULL_U, dst, src0, src1);
+         triop(block, OPC_MADSH_M16, dst, src0, src1, dst);
+         triop(block, OPC_MADSH_M16, dst, src1, src0, dst);
+      }
+      break;
+   }
+}
+
 static struct ir3_block *
 split_block(struct ir3 *ir, struct ir3_block *before_block,
-            struct ir3_instruction *instr, struct ir3_block **then)
+            struct ir3_instruction *instr)
 {
-   struct ir3_block *then_block = ir3_block_create(ir);
    struct ir3_block *after_block = ir3_block_create(ir);
-   list_add(&then_block->node, &before_block->node);
-   list_add(&after_block->node, &then_block->node);
+   list_add(&after_block->node, &before_block->node);
 
    for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
       after_block->successors[i] = before_block->successors[i];
@@ -96,19 +188,8 @@ split_block(struct ir3 *ir, struct ir3_block *before_block,
       }
    }
 
-   before_block->successors[0] = then_block;
-   before_block->successors[1] = after_block;
-   before_block->physical_successors[0] = then_block;
-   before_block->physical_successors[1] = after_block;
-   ir3_block_add_predecessor(then_block, before_block);
-   ir3_block_add_predecessor(after_block, before_block);
-   ir3_block_add_physical_predecessor(then_block, before_block);
-   ir3_block_add_physical_predecessor(after_block, before_block);
-
-   then_block->successors[0] = after_block;
-   then_block->physical_successors[0] = after_block;
-   ir3_block_add_predecessor(after_block, then_block);
-   ir3_block_add_physical_predecessor(after_block, then_block);
+   before_block->successors[0] = before_block->successors[1] = NULL;
+   before_block->physical_successors[0] = before_block->physical_successors[1] = NULL;
 
    foreach_instr_from_safe (rem_instr, &instr->node,
                             &before_block->instr_list) {
@@ -120,10 +201,39 @@ split_block(struct ir3 *ir, struct ir3_block *before_block,
    after_block->brtype = before_block->brtype;
    after_block->condition = before_block->condition;
 
-   *then = then_block;
    return after_block;
 }
 
+static void
+link_blocks_physical(struct ir3_block *pred, struct ir3_block *succ,
+                     unsigned index)
+{
+   pred->physical_successors[index] = succ;
+   ir3_block_add_physical_predecessor(succ, pred);
+}
+
+static void
+link_blocks(struct ir3_block *pred, struct ir3_block *succ, unsigned index)
+{
+   pred->successors[index] = succ;
+   ir3_block_add_predecessor(succ, pred);
+   link_blocks_physical(pred, succ, index);
+}
+
+static struct ir3_block *
+create_if(struct ir3 *ir, struct ir3_block *before_block,
+          struct ir3_block *after_block)
+{
+   struct ir3_block *then_block = ir3_block_create(ir);
+   list_add(&then_block->node, &before_block->node);
+
+   link_blocks(before_block, then_block, 0);
+   link_blocks(before_block, after_block, 1);
+   link_blocks(then_block, after_block, 0);
+
+   return then_block;
+}
+
 static bool
 lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *instr)
 {
@@ -135,106 +245,156 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
    case OPC_READ_COND_MACRO:
    case OPC_READ_FIRST_MACRO:
    case OPC_SWZ_SHARED_MACRO:
+   case OPC_SCAN_MACRO:
       break;
    default:
       return false;
    }
 
    struct ir3_block *before_block = *block;
-   struct ir3_block *then_block;
-   struct ir3_block *after_block =
-      split_block(ir, before_block, instr, &then_block);
+   struct ir3_block *after_block = split_block(ir, before_block, instr);
 
-   /* For ballot, the destination must be initialized to 0 before we do
-    * the movmsk because the condition may be 0 and then the movmsk will
-    * be skipped. Because it's a shared register we have to wrap the
-    * initialization in a getone block.
-    */
-   if (instr->opc == OPC_BALLOT_MACRO) {
-      before_block->brtype = IR3_BRANCH_GETONE;
-      before_block->condition = NULL;
-      mov_immed(instr->dsts[0], then_block, 0);
-      before_block = after_block;
-      after_block = split_block(ir, before_block, instr, &then_block);
-   }
+   if (instr->opc == OPC_SCAN_MACRO) {
+      /* The pseudo-code for the scan macro is:
+       * while (true) {
+       * header:
+       *    if (elect()) {
+       * exit:
+       *       exclusive = reduce;
+       *       inclusive = src OP exclusive;
+       *       reduce = inclusive;
+       *       break;
+       *    }
+       * footer:
+       * }
+       *
+       * This is based on the blob's sequence, and carefully crafted to avoid
+       * using the shared register "reduce" except in move instructions, since
+       * using it in the actual OP isn't possible for half-registers.
+       */
+      struct ir3_block *header = ir3_block_create(ir);
+      list_add(&header->node, &before_block->node);
 
-   switch (instr->opc) {
-   case OPC_BALLOT_MACRO:
-   case OPC_READ_COND_MACRO:
-   case OPC_ANY_MACRO:
-   case OPC_ALL_MACRO:
-      before_block->condition = instr->srcs[0]->def->instr;
-      break;
-   default:
-      before_block->condition = NULL;
-      break;
-   }
+      struct ir3_block *exit = ir3_block_create(ir);
+      list_add(&exit->node, &header->node);
 
-   switch (instr->opc) {
-   case OPC_BALLOT_MACRO:
-   case OPC_READ_COND_MACRO:
-      before_block->brtype = IR3_BRANCH_COND;
-      break;
-   case OPC_ANY_MACRO:
-      before_block->brtype = IR3_BRANCH_ANY;
-      break;
-   case OPC_ALL_MACRO:
-      before_block->brtype = IR3_BRANCH_ALL;
-      break;
-   case OPC_ELECT_MACRO:
-   case OPC_READ_FIRST_MACRO:
-   case OPC_SWZ_SHARED_MACRO:
-      before_block->brtype = IR3_BRANCH_GETONE;
-      break;
-   default:
-      unreachable("bad opcode");
-   }
+      struct ir3_block *footer = ir3_block_create(ir);
+      list_add(&footer->node, &exit->node);
 
-   switch (instr->opc) {
-   case OPC_ALL_MACRO:
-   case OPC_ANY_MACRO:
-   case OPC_ELECT_MACRO:
-      mov_immed(instr->dsts[0], then_block, 1);
-      mov_immed(instr->dsts[0], before_block, 0);
-      break;
+      link_blocks(before_block, header, 0);
 
-   case OPC_BALLOT_MACRO: {
-      unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
-      struct ir3_instruction *movmsk =
-         ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
-      ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
-      movmsk->repeat = comp_count - 1;
-      break;
-   }
+      link_blocks(header, exit, 0);
+      link_blocks(header, footer, 1);
+      header->brtype = IR3_BRANCH_GETONE;
 
-   case OPC_READ_COND_MACRO:
-   case OPC_READ_FIRST_MACRO: {
-      struct ir3_instruction *mov =
-         ir3_instr_create(then_block, OPC_MOV, 1, 1);
-      unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
-      ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
-      struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
-      *new_src = *instr->srcs[src];
-      mov->cat1.dst_type = TYPE_U32;
-      mov->cat1.src_type =
-         (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-      break;
-   }
+      link_blocks(exit, after_block, 0);
+      link_blocks_physical(exit, footer, 1);
 
-   case OPC_SWZ_SHARED_MACRO: {
-      struct ir3_instruction *swz =
-         ir3_instr_create(then_block, OPC_SWZ, 2, 2);
-      ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
-      ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
-      ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
-      ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
-      swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
-      swz->repeat = 1;
-      break;
-   }
+      link_blocks(footer, header, 0);
 
-   default:
-      unreachable("bad opcode");
+      struct ir3_register *exclusive = instr->dsts[0];
+      struct ir3_register *inclusive = instr->dsts[1];
+      struct ir3_register *reduce = instr->dsts[2];
+      struct ir3_register *src = instr->srcs[0];
+
+      mov_reg(exit, exclusive, reduce);
+      do_reduce(exit, instr->cat1.reduce_op, inclusive, src, exclusive);
+      mov_reg(exit, reduce, inclusive);
+   } else {
+      struct ir3_block *then_block = create_if(ir, before_block, after_block);
+
+      /* For ballot, the destination must be initialized to 0 before we do
+       * the movmsk because the condition may be 0 and then the movmsk will
+       * be skipped. Because it's a shared register we have to wrap the
+       * initialization in a getone block.
+       */
+      if (instr->opc == OPC_BALLOT_MACRO) {
+         before_block->brtype = IR3_BRANCH_GETONE;
+         before_block->condition = NULL;
+         mov_immed(instr->dsts[0], then_block, 0);
+         before_block = after_block;
+         after_block = split_block(ir, before_block, instr);
+         then_block = create_if(ir, before_block, after_block);
+      }
+
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_READ_COND_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ALL_MACRO:
+         before_block->condition = instr->srcs[0]->def->instr;
+         break;
+      default:
+         before_block->condition = NULL;
+         break;
+      }
+
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_READ_COND_MACRO:
+         before_block->brtype = IR3_BRANCH_COND;
+         break;
+      case OPC_ANY_MACRO:
+         before_block->brtype = IR3_BRANCH_ANY;
+         break;
+      case OPC_ALL_MACRO:
+         before_block->brtype = IR3_BRANCH_ALL;
+         break;
+      case OPC_ELECT_MACRO:
+      case OPC_READ_FIRST_MACRO:
+      case OPC_SWZ_SHARED_MACRO:
+         before_block->brtype = IR3_BRANCH_GETONE;
+         break;
+      default:
+         unreachable("bad opcode");
+      }
+
+      switch (instr->opc) {
+      case OPC_ALL_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ELECT_MACRO:
+         mov_immed(instr->dsts[0], then_block, 1);
+         mov_immed(instr->dsts[0], before_block, 0);
+         break;
+
+      case OPC_BALLOT_MACRO: {
+         unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
+         struct ir3_instruction *movmsk =
+            ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
+         ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
+         movmsk->repeat = comp_count - 1;
+         break;
+      }
+
+      case OPC_READ_COND_MACRO:
+      case OPC_READ_FIRST_MACRO: {
+         struct ir3_instruction *mov =
+            ir3_instr_create(then_block, OPC_MOV, 1, 1);
+         unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
+         ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
+         struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
+         *new_src = *instr->srcs[src];
+         mov->cat1.dst_type = TYPE_U32;
+         mov->cat1.src_type =
+            (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+         break;
+      }
+
+      case OPC_SWZ_SHARED_MACRO: {
+         struct ir3_instruction *swz =
+            ir3_instr_create(then_block, OPC_SWZ, 2, 2);
+         ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
+         ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
+         ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
+         ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
+         swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
+         swz->repeat = 1;
+         break;
+      }
+
+      default:
+         unreachable("bad opcode");
+      }
    }
 
    *block = after_block;
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index 48f7cdb0983..431ae3c2f8b 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c
@@ -137,7 +137,51 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
                                 disasm_a3xx_instr_name(instr->opc));
       }
 
-      if (instr->opc != OPC_MOVMSK) {
+      if (instr->opc == OPC_SCAN_MACRO) {
+         switch (instr->cat1.reduce_op) {
+         case REDUCE_OP_ADD_U:
+            mesa_log_stream_printf(stream, ".add.u");
+            break;
+         case REDUCE_OP_ADD_F:
+            mesa_log_stream_printf(stream, ".add.f");
+            break;
+         case REDUCE_OP_MUL_U:
+            mesa_log_stream_printf(stream, ".mul.u");
+            break;
+         case REDUCE_OP_MUL_F:
+            mesa_log_stream_printf(stream, ".mul.f");
+            break;
+         case REDUCE_OP_MIN_U:
+            mesa_log_stream_printf(stream, ".min.u");
+            break;
+         case REDUCE_OP_MIN_S:
+            mesa_log_stream_printf(stream, ".min.s");
+            break;
+         case REDUCE_OP_MIN_F:
+            mesa_log_stream_printf(stream, ".min.f");
+            break;
+         case REDUCE_OP_MAX_U:
+            mesa_log_stream_printf(stream, ".max.u");
+            break;
+         case REDUCE_OP_MAX_S:
+            mesa_log_stream_printf(stream, ".max.s");
+            break;
+         case REDUCE_OP_MAX_F:
+            mesa_log_stream_printf(stream, ".max.f");
+            break;
+         case REDUCE_OP_AND_B:
+            mesa_log_stream_printf(stream, ".and.b");
+            break;
+         case REDUCE_OP_OR_B:
+            mesa_log_stream_printf(stream, ".or.b");
+            break;
+         case REDUCE_OP_XOR_B:
+            mesa_log_stream_printf(stream, ".xor.b");
+            break;
+         }
+      }
+
+      if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO) {
          mesa_log_stream_printf(stream, ".%s%s",
                                 type_name(instr->cat1.src_type),
                                 type_name(instr->cat1.dst_type));
diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index f10116a8032..b842daee5db 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c
@@ -238,6 +238,14 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
       } else if (instr->opc == OPC_ELECT_MACRO) {
          validate_assert(ctx, instr->dsts_count == 1);
          validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
+      } else if (instr->opc == OPC_SCAN_MACRO) {
+         validate_assert(ctx, instr->dsts_count == 3);
+         validate_assert(ctx, instr->srcs_count == 2);
+         validate_assert(ctx, reg_class_flags(instr->dsts[0]) ==
+                              reg_class_flags(instr->srcs[0]));
+         validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
+                              reg_class_flags(instr->srcs[0]));
+         validate_assert(ctx, reg_class_flags(instr->dsts[2]) == IR3_REG_SHARED);
       } else {
          foreach_dst (dst, instr)
             validate_reg_size(ctx, dst, instr->cat1.dst_type);