diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index c2a97557947..32ff0ca91dc 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -39,6 +39,83 @@
 #include "ir3.h"
 #include "ir3_context.h"
 
+static struct ir3_instruction_rpt
+rpt_instr(struct ir3_instruction *instr, unsigned nrpt)
+{
+   struct ir3_instruction_rpt dst = {{0}};
+
+   for (unsigned i = 0; i < nrpt; ++i)
+      dst.rpts[i] = instr;
+
+   return dst;
+}
+
+static void
+cp_instrs(struct ir3_instruction *dst[], struct ir3_instruction *instrs[],
+          unsigned n)
+{
+   for (unsigned i = 0; i < n; ++i)
+      dst[i] = instrs[i];
+}
+
+static struct ir3_instruction_rpt
+create_immed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val)
+{
+   return rpt_instr(create_immed(block, val), nrpt);
+}
+
+static struct ir3_instruction_rpt
+create_immed_shared_rpt(struct ir3_block *block, unsigned nrpt, uint32_t val,
+                        bool shared)
+{
+   return rpt_instr(create_immed_shared(block, val, shared), nrpt);
+}
+
+static struct ir3_instruction_rpt
+create_immed_typed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val,
+                       type_t type)
+{
+   return rpt_instr(create_immed_typed(block, val, type), nrpt);
+}
+
+static inline struct ir3_instruction_rpt
+create_immed_typed_shared_rpt(struct ir3_block *block, unsigned nrpt,
+                              uint32_t val, type_t type, bool shared)
+{
+   return rpt_instr(create_immed_typed_shared(block, val, type, shared), nrpt);
+}
+
+static void
+set_instr_flags(struct ir3_instruction *instrs[], unsigned n,
+                ir3_instruction_flags flags)
+{
+   for (unsigned i = 0; i < n; ++i)
+      instrs[i]->flags |= flags;
+}
+
+static void
+set_cat1_round(struct ir3_instruction *instrs[], unsigned n, round_t round)
+{
+   for (unsigned i = 0; i < n; ++i)
+      instrs[i]->cat1.round = round;
+}
+
+static void
+set_cat2_condition(struct ir3_instruction *instrs[], unsigned n,
+                   unsigned condition)
+{
+   for (unsigned i = 0; i < n; ++i)
+      instrs[i]->cat2.condition = condition;
+}
+
+static void
+set_dst_flags(struct ir3_instruction *instrs[], unsigned n,
+              ir3_register_flags flags)
+{
+   for (unsigned i = 0; i < n; ++i)
+      instrs[i]->dsts[0]->flags |= flags;
+}
+
 void
 ir3_handle_nonuniform(struct ir3_instruction *instr,
                       nir_intrinsic_instr *intrin)
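
Background sketch (assumed, not part of the patch): struct ir3_instruction_rpt
is not defined in this diff. Inferred from its usage here and in emit_alu
below, it bundles the per-component scalar instructions of a vectorized ALU
op, roughly:

    struct ir3_instruction_rpt {
       /* one scalar instruction per repeat, i.e. per vector component;
        * the real definition lives in ir3.h and may differ */
       struct ir3_instruction *rpts[4];
    };

rpt_instr() broadcasts one instruction (e.g. an immediate) to every repeat
slot, while cp_instrs() copies a repeat group back into the flat array that
ir3_get_def()/ir3_put_def() operate on.
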
@@ -135,9 +212,9 @@ create_driver_param_indirect(struct ir3_context *ctx, enum ir3_driver_param dp,
  * alu/sfu instructions:
  */
 
-static struct ir3_instruction *
-create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
-           unsigned src_bitsize, nir_op op)
+static struct ir3_instruction_rpt
+create_cov(struct ir3_context *ctx, unsigned nrpt,
+           struct ir3_instruction_rpt src, unsigned src_bitsize, nir_op op)
 {
    type_t src_type, dst_type;
 
@@ -277,9 +354,11 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
     * is used to achieve the result.
     */
    if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
-      struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
-      struct ir3_instruction *cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
-      cov->dsts[0]->flags |= type_flags(dst_type);
+      struct ir3_instruction_rpt mask =
+         create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
+      struct ir3_instruction_rpt cov =
+         ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
+      set_dst_flags(cov.rpts, nrpt, type_flags(dst_type));
       return cov;
    }
 
@@ -291,15 +370,16 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
       assert(op == nir_op_u2f16 || op == nir_op_i2f16 || op == nir_op_u2f32 ||
              op == nir_op_i2f32);
 
-      struct ir3_instruction *cov;
+      struct ir3_instruction_rpt cov;
       if (op == nir_op_u2f16 || op == nir_op_u2f32) {
-         struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
-         cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
-         cov->dsts[0]->flags |= IR3_REG_HALF;
-         cov = ir3_COV(ctx->block, cov, TYPE_U16, dst_type);
+         struct ir3_instruction_rpt mask =
+            create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
+         cov = ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
+         set_dst_flags(cov.rpts, nrpt, IR3_REG_HALF);
+         cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_U16, dst_type);
       } else {
-         cov = ir3_COV(ctx->block, src, TYPE_U8, TYPE_S16);
-         cov = ir3_COV(ctx->block, cov, TYPE_S16, dst_type);
+         cov = ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U8, TYPE_S16);
+         cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_S16, dst_type);
       }
       return cov;
    }
@@ -312,17 +392,19 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
       assert(op == nir_op_f2u8 || op == nir_op_f2i8);
       type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
 
-      struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, intermediate_type);
-      cov = ir3_COV(ctx->block, cov, intermediate_type, TYPE_U8);
+      struct ir3_instruction_rpt cov =
+         ir3_COV_rpt(ctx->block, nrpt, src, src_type, intermediate_type);
+      cov = ir3_COV_rpt(ctx->block, nrpt, cov, intermediate_type, TYPE_U8);
       return cov;
    }
 
-   struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
+   struct ir3_instruction_rpt cov =
+      ir3_COV_rpt(ctx->block, nrpt, src, src_type, dst_type);
 
    if (op == nir_op_f2f16_rtne) {
-      cov->cat1.round = ROUND_EVEN;
+      set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
    } else if (op == nir_op_f2f16_rtz) {
-      cov->cat1.round = ROUND_ZERO;
+      set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
    } else if (dst_type == TYPE_F16 || dst_type == TYPE_F32) {
       unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
       nir_alu_type type =
@@ -330,23 +412,23 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
       nir_rounding_mode rounding_mode =
          nir_get_rounding_mode_from_float_controls(execution_mode, type);
       if (rounding_mode == nir_rounding_mode_rtne)
-         cov->cat1.round = ROUND_EVEN;
+         set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
       else if (rounding_mode == nir_rounding_mode_rtz)
-         cov->cat1.round = ROUND_ZERO;
+         set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
    }
 
    return cov;
 }
 
 /* For shift instructions NIR always has shift amount as 32 bit integer */
-static struct ir3_instruction *
-resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
-                    unsigned bs)
+static struct ir3_instruction_rpt
+resize_shift_amount(struct ir3_context *ctx, unsigned nrpt,
+                    struct ir3_instruction_rpt src, unsigned bs)
 {
    if (bs == 16)
-      return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
+      return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U16);
    else if (bs == 8)
-      return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U8);
+      return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U8);
    else
       return src;
 }
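
Background sketch (assumed, not part of the patch): the ir3_*_rpt builders
used throughout (ir3_AND_B_rpt, ir3_COV_rpt, ir3_MAX_F_rpt, ...) are not
defined in this diff. Judging from the call sites, such a wrapper plausibly
loops the scalar builder over the repeats, along these lines:

    static inline struct ir3_instruction_rpt
    ir3_COV_rpt(struct ir3_block *block, unsigned nrpt,
                struct ir3_instruction_rpt src, type_t src_type,
                type_t dst_type)
    {
       struct ir3_instruction_rpt dst;

       /* one scalar cov per repeat; later stages can then encode the
        * group as a single repeated ((rpt)) instruction */
       for (unsigned i = 0; i < nrpt; ++i)
          dst.rpts[i] = ir3_COV(block, src.rpts[i], src_type, dst_type);

       return dst;
    }
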
@@ -452,18 +534,52 @@ emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
    }
 }
 
+static bool
+all_sat_compatible(struct ir3_instruction *instrs[], unsigned n)
+{
+   for (unsigned i = 0; i < n; i++) {
+      if (!is_sat_compatible(instrs[i]->opc))
+         return false;
+   }
+
+   return true;
+}
+
+/* Is src the only use of its def, taking components into account. */
+static bool
+is_unique_use(nir_src *src)
+{
+   nir_def *def = src->ssa;
+
+   if (list_is_singular(&def->uses))
+      return true;
+
+   nir_component_mask_t src_read_mask = nir_src_components_read(src);
+
+   nir_foreach_use (use, def) {
+      if (use == src)
+         continue;
+
+      if (nir_src_components_read(use) & src_read_mask)
+         return false;
+   }
+
+   return true;
+}
+
 static void
 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
 {
    const nir_op_info *info = &nir_op_infos[alu->op];
-   struct ir3_instruction **dst, *src[info->num_inputs];
+   struct ir3_instruction_rpt dst, src[info->num_inputs];
    unsigned bs[info->num_inputs]; /* bit size */
    struct ir3_block *b = ctx->block;
-   unsigned dst_sz, wrmask;
+   unsigned dst_sz;
    type_t dst_type = type_uint_size(alu->def.bit_size);
 
    dst_sz = alu->def.num_components;
-   wrmask = (1 << dst_sz) - 1;
+   assert(dst_sz == 1 || ir3_supports_vectorized_nir_op(alu->op));
+   assert(dst_sz <= ARRAY_SIZE(src[0].rpts));
 
    bool use_shared = !alu->def.divergent && ctx->compiler->has_scalar_alu &&
@@ -477,7 +593,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
                      /* not supported in HW, we have to fall back to normal registers */
                      alu->op != nir_op_ffma;
 
-   dst = ir3_get_def(ctx, &alu->def, dst_sz);
+   struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);
 
    /* Vectors are special in that they have non-scalarized writemasks,
     * and just take the first swizzle channel for each argument in
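
Worked example (illustration, not part of the patch): is_unique_use() is
what still allows the fsat case below to fold the (sat) flag into the
producing instructions after vectorization. Given NIR like

    vec2 ssa_2 = fsat ssa_1.xy
    vec2 ssa_3 = fmul ssa_1.zw, ssa_0.xy

ssa_1 has two uses, but the fmul reads none of the components the fsat
reads, so is_unique_use() on the fsat source returns true and the (sat)
flag can safely be set on the scalar instructions producing ssa_1.x/y.
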
@@ -486,53 +602,39 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
    if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
        (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
        (alu->op == nir_op_vec16)) {
+      struct ir3_instruction_rpt src;
+      assert(info->num_inputs <= ARRAY_SIZE(src.rpts));
+
       for (int i = 0; i < info->num_inputs; i++) {
          nir_alu_src *asrc = &alu->src[i];
-
-         src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
-         if (!src[i])
-            src[i] = create_immed_typed_shared(ctx->block, 0, dst_type, use_shared);
-         dst[i] = ir3_MOV(b, src[i], dst_type);
+         src.rpts[i] =
+            ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
+         compile_assert(ctx, src.rpts[i]);
       }
 
+      dst = ir3_MOV_rpt(b, dst_sz, src, dst_type);
+      cp_instrs(def, dst.rpts, dst_sz);
       ir3_put_def(ctx, &alu->def);
       return;
    }
 
-   /* We also get mov's with more than one component for mov's so
-    * handle those specially:
-    */
-   if (alu->op == nir_op_mov) {
-      nir_alu_src *asrc = &alu->src[0];
-      struct ir3_instruction *const *src0 =
-         ir3_get_src_shared(ctx, &asrc->src, use_shared);
-
-      for (unsigned i = 0; i < dst_sz; i++) {
-         if (wrmask & (1 << i)) {
-            dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
-         } else {
-            dst[i] = NULL;
-         }
-      }
-
-      ir3_put_def(ctx, &alu->def);
-      return;
-   }
-
-   /* General case: We can just grab the one used channel per src.
-    */
-   assert(alu->def.num_components == 1);
-
    for (int i = 0; i < info->num_inputs; i++) {
       nir_alu_src *asrc = &alu->src[i];
-
-      src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
+      struct ir3_instruction *const *input_src =
+         ir3_get_src_shared(ctx, &asrc->src, use_shared);
       bs[i] = nir_src_bit_size(asrc->src);
-      compile_assert(ctx, src[i]);
+      for (unsigned rpt = 0; rpt < dst_sz; rpt++) {
+         src[i].rpts[rpt] = input_src[asrc->swizzle[rpt]];
+         compile_assert(ctx, src[i].rpts[rpt]);
+      }
    }
 
    switch (alu->op) {
+   case nir_op_mov:
+      dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
+      break;
+
    case nir_op_f2f32:
    case nir_op_f2f16_rtne:
    case nir_op_f2f16_rtz:
@@ -558,12 +660,13 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
    case nir_op_b2i8:
    case nir_op_b2i16:
    case nir_op_b2i32:
-      dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
+      dst = create_cov(ctx, dst_sz, src[0], bs[0], alu->op);
       break;
 
    case nir_op_fquantize2f16:
-      dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
-                          16, nir_op_f2f32);
+      dst = create_cov(ctx, dst_sz,
+                       create_cov(ctx, dst_sz, src[0], 32, nir_op_f2f16_rtne),
+                       16, nir_op_f2f32);
       break;
 
    case nir_op_b2b1:
@@ -574,7 +677,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
        *
        * A negate can turn those into a 1 or 0 for us.
        */
-      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
       break;
 
    case nir_op_b2b32:
@@ -583,20 +686,20 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
        *
        * A negate can turn those into a ~0 for us.
        */
-      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
       break;
 
    case nir_op_fneg:
-      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
+      dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FNEG);
       break;
    case nir_op_fabs:
-      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
+      dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FABS);
       break;
    case nir_op_fmax:
-      dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
+      dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_fmin:
-      dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
+      dst = ir3_MIN_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_fsat:
       /* if there is just a single use of the src, and it supports
@@ -604,309 +707,334 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
        * src instruction and create a mov.  This is easier for cp
        * to eliminate.
        */
-      if (is_sat_compatible(src[0]->opc) &&
-          (list_length(&alu->src[0].src.ssa->uses) == 1)) {
-         src[0]->flags |= IR3_INSTR_SAT;
-         dst[0] = ir3_MOV(b, src[0], dst_type);
+      if (all_sat_compatible(src[0].rpts, dst_sz) &&
+          is_unique_use(&alu->src[0].src)) {
+         set_instr_flags(src[0].rpts, dst_sz, IR3_INSTR_SAT);
+         dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
       } else {
         /* otherwise generate a max.f that saturates.. blob does
          * similar (generating a cat2 mov using max.f)
          */
-         dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
-         dst[0]->flags |= IR3_INSTR_SAT;
+         dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[0], 0);
+         set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
       }
       break;
    case nir_op_fmul:
-      dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
+      dst = ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_fadd:
-      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
+      dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_fsub:
-      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
+      dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
       break;
    case nir_op_ffma:
-      dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
+      dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
       break;
    case nir_op_flt:
-      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_LT;
+      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
       break;
    case nir_op_fge:
-      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_GE;
+      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
       break;
    case nir_op_feq:
-      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_EQ;
+      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
       break;
    case nir_op_fneu:
-      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_NE;
+      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
       break;
    case nir_op_fceil:
-      dst[0] = ir3_CEIL_F(b, src[0], 0);
+      dst = ir3_CEIL_F_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_ffloor:
-      dst[0] = ir3_FLOOR_F(b, src[0], 0);
+      dst = ir3_FLOOR_F_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_ftrunc:
-      dst[0] = ir3_TRUNC_F(b, src[0], 0);
+      dst = ir3_TRUNC_F_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_fround_even:
-      dst[0] = ir3_RNDNE_F(b, src[0], 0);
+      dst = ir3_RNDNE_F_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_fsign:
-      dst[0] = ir3_SIGN_F(b, src[0], 0);
+      dst = ir3_SIGN_F_rpt(b, dst_sz, src[0], 0);
       break;
 
    case nir_op_fsin:
-      dst[0] = ir3_SIN(b, src[0], 0);
+      dst = ir3_SIN_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_fcos:
-      dst[0] = ir3_COS(b, src[0], 0);
+      dst = ir3_COS_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_frsq:
-      dst[0] = ir3_RSQ(b, src[0], 0);
+      dst = ir3_RSQ_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_frcp:
-      dst[0] = ir3_RCP(b, src[0], 0);
+      assert(dst_sz == 1);
+      dst.rpts[0] = ir3_RCP(b, src[0].rpts[0], 0);
       break;
    case nir_op_flog2:
-      dst[0] = ir3_LOG2(b, src[0], 0);
+      dst = ir3_LOG2_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_fexp2:
-      dst[0] = ir3_EXP2(b, src[0], 0);
+      dst = ir3_EXP2_rpt(b, dst_sz, src[0], 0);
       break;
    case nir_op_fsqrt:
-      dst[0] = ir3_SQRT(b, src[0], 0);
+      dst = ir3_SQRT_rpt(b, dst_sz, src[0], 0);
       break;
 
    case nir_op_iabs:
-      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
+      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SABS);
       break;
    case nir_op_iadd:
-      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
+      dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_ihadd:
-      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
-      dst[0]->dsts[0]->flags |= IR3_REG_EI;
+      dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
       break;
    case nir_op_uhadd:
-      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
-      dst[0]->dsts[0]->flags |= IR3_REG_EI;
+      dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
       break;
    case nir_op_iand:
-      dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
+      dst = ir3_AND_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_imax:
-      dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
+      dst = ir3_MAX_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_umax:
-      dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+      dst = ir3_MAX_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_imin:
-      dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
+      dst = ir3_MIN_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_umin:
-      dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+      dst = ir3_MIN_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_umul_low:
-      dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
+      dst = ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
       break;
    case nir_op_imadsh_mix16:
      if (use_shared) {
-         struct ir3_instruction *sixteen = create_immed_shared(b, 16, true);
-         struct ir3_instruction *src1 = ir3_SHR_B(b, src[1], 0, sixteen, 0);
-         struct ir3_instruction *mul = ir3_MULL_U(b, src[0], 0, src1, 0);
-         dst[0] = ir3_ADD_U(b, ir3_SHL_B(b, mul, 0, sixteen, 0), 0, src[2], 0);
+         struct ir3_instruction_rpt sixteen =
+            create_immed_shared_rpt(b, dst_sz, 16, true);
+         struct ir3_instruction_rpt src1 =
+            ir3_SHR_B_rpt(b, dst_sz, src[1], 0, sixteen, 0);
+         struct ir3_instruction_rpt mul =
+            ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src1, 0);
+         dst = ir3_ADD_U_rpt(b, dst_sz,
+                             ir3_SHL_B_rpt(b, dst_sz, mul, 0, sixteen, 0), 0,
+                             src[2], 0);
      } else {
-         dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
+         dst = ir3_MADSH_M16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
      }
      break;
    case nir_op_imad24_ir3:
      if (use_shared) {
-         dst[0] = ir3_ADD_U(b, ir3_MUL_U24(b, src[0], 0, src[1], 0), 0, src[2], 0);
+         dst = ir3_ADD_U_rpt(b, dst_sz,
+                             ir3_MUL_U24_rpt(b, dst_sz, src[0], 0, src[1], 0),
+                             0, src[2], 0);
      } else {
-         dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
+         dst = ir3_MAD_S24_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
      }
      break;
    case nir_op_imul:
      compile_assert(ctx, alu->def.bit_size == 8 || alu->def.bit_size == 16);
-      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
+      dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
    case nir_op_imul24:
-      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
+      dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
    case nir_op_ineg:
-      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
      break;
    case nir_op_inot:
      if (bs[0] == 1) {
-         struct ir3_instruction *one =
-            create_immed_typed_shared(ctx->block, 1, ctx->compiler->bool_type,
-                                      use_shared);
-         dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
+         struct ir3_instruction_rpt one = create_immed_typed_shared_rpt(
+            ctx->block, dst_sz, 1, ctx->compiler->bool_type, use_shared);
+         dst = ir3_SUB_U_rpt(b, dst_sz, one, 0, src[0], 0);
      } else {
-         dst[0] = ir3_NOT_B(b, src[0], 0);
+         dst = ir3_NOT_B_rpt(ctx->block, dst_sz, src[0], 0);
      }
      break;
    case nir_op_ior:
-      dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
+      dst = ir3_OR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
    case nir_op_ishl:
-      dst[0] =
-         ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
+      dst = ir3_SHL_B_rpt(ctx->block, dst_sz, src[0], 0,
+                          resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
      break;
    case nir_op_ishr:
-      dst[0] =
-         ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
+      dst = ir3_ASHR_B_rpt(ctx->block, dst_sz, src[0], 0,
+                           resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
      break;
    case nir_op_isub:
-      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
+      dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
    case nir_op_ixor:
-      dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
+      dst = ir3_XOR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
    case nir_op_ushr:
-      dst[0] =
-         ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
+      dst = ir3_SHR_B_rpt(ctx->block, dst_sz, src[0], 0,
+                          resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
      break;
    case nir_op_ilt:
-      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_LT;
+      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
      break;
    case nir_op_ige:
-      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_GE;
+      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
      break;
    case nir_op_ieq:
-      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_EQ;
+      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
      break;
    case nir_op_ine:
-      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_NE;
+      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
      break;
    case nir_op_ult:
-      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_LT;
+      dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
      break;
    case nir_op_uge:
-      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-      dst[0]->cat2.condition = IR3_COND_GE;
+      dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
      break;
 
    case nir_op_bcsel: {
-      struct ir3_instruction *cond = ir3_get_cond_for_nonzero_compare(src[0]);
+      struct ir3_instruction_rpt conds;
 
      compile_assert(ctx, bs[1] == bs[2]);
 
-      /* The condition's size has to match the other two arguments' size, so
-       * convert down if necessary.
-       *
-       * Single hashtable is fine, because the conversion will either be
-       * 16->32 or 32->16, but never both
-       */
-      if (is_half(src[1]) != is_half(cond)) {
-         struct hash_entry *prev_entry =
-            _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
-         if (prev_entry) {
-            cond = prev_entry->data;
-         } else {
-            if (is_half(cond)) {
-               if (bs[0] == 8) {
-                  /* Zero-extension of an 8-bit value has to be done through masking,
-                   * as in create_cov.
-                   */
-                  struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
-                  cond = ir3_AND_B(b, cond, 0, mask, 0);
-               } else {
-                  cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
-               }
+      /* TODO: repeat the covs when possible. */
+      for (unsigned rpt = 0; rpt < dst_sz; ++rpt) {
+         struct ir3_instruction *cond =
+            ir3_get_cond_for_nonzero_compare(src[0].rpts[rpt]);
+
+         /* The condition's size has to match the other two arguments' size, so
+          * convert down if necessary.
+          *
+          * Single hashtable is fine, because the conversion will either be
+          * 16->32 or 32->16, but never both
+          */
+         if (is_half(src[1].rpts[rpt]) != is_half(cond)) {
+            struct hash_entry *prev_entry = _mesa_hash_table_search(
+               ctx->sel_cond_conversions, src[0].rpts[rpt]);
+            if (prev_entry) {
+               cond = prev_entry->data;
             } else {
-               cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+               if (is_half(cond)) {
+                  if (bs[0] == 8) {
+                     /* Zero-extension of an 8-bit value has to be done through
+                      * masking, as in create_cov.
+                      */
+                     struct ir3_instruction *mask =
+                        create_immed_typed(b, 0xff, TYPE_U8);
+                     cond = ir3_AND_B(b, cond, 0, mask, 0);
+                  } else {
+                     cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
+                  }
+               } else {
+                  cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+               }
+               _mesa_hash_table_insert(ctx->sel_cond_conversions,
+                                       src[0].rpts[rpt], cond);
             }
-            _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
          }
+         conds.rpts[rpt] = cond;
      }
 
-      if (is_half(src[1])) {
-         dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
-      } else {
-         dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
-      }
-
+      if (is_half(src[1].rpts[0]))
+         dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
+      else
+         dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
      break;
    }
    case nir_op_bit_count: {
-      if (ctx->compiler->gen < 5 || (src[0]->dsts[0]->flags & IR3_REG_HALF)) {
-         dst[0] = ir3_CBITS_B(b, src[0], 0);
+      if (ctx->compiler->gen < 5 ||
+          (src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {
+         dst = ir3_CBITS_B_rpt(b, dst_sz, src[0], 0);
         break;
      }
 
      // We need to do this 16b at a time on a5xx+a6xx.  Once half-precision
      // support is in place, this should probably move to a NIR lowering pass:
-      struct ir3_instruction *hi, *lo;
+      struct ir3_instruction_rpt hi, lo;
 
-      hi = ir3_COV(b,
-         ir3_SHR_B(b, src[0], 0, create_immed_shared(b, 16, use_shared), 0),
-         TYPE_U32, TYPE_U16);
-      lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
+      hi = ir3_COV_rpt(
+         b, dst_sz,
+         ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
+                       create_immed_shared_rpt(b, dst_sz, 16, use_shared), 0),
+         TYPE_U32, TYPE_U16);
+      lo = ir3_COV_rpt(b, dst_sz, src[0], TYPE_U32, TYPE_U16);
 
-      hi = ir3_CBITS_B(b, hi, 0);
-      lo = ir3_CBITS_B(b, lo, 0);
+      hi = ir3_CBITS_B_rpt(b, dst_sz, hi, 0);
+      lo = ir3_CBITS_B_rpt(b, dst_sz, lo, 0);
 
      // TODO maybe the builders should default to making dst half-precision
      // if the src's were half precision, to make this less awkward.. otoh
      // we should probably just do this lowering in NIR.
-      hi->dsts[0]->flags |= IR3_REG_HALF;
-      lo->dsts[0]->flags |= IR3_REG_HALF;
+      set_dst_flags(hi.rpts, dst_sz, IR3_REG_HALF);
+      set_dst_flags(lo.rpts, dst_sz, IR3_REG_HALF);
 
-      dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
-      dst[0]->dsts[0]->flags |= IR3_REG_HALF;
-      dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
+      dst = ir3_ADD_S_rpt(b, dst_sz, hi, 0, lo, 0);
+      set_dst_flags(dst.rpts, dst_sz, IR3_REG_HALF);
+      dst = ir3_COV_rpt(b, dst_sz, dst, TYPE_U16, TYPE_U32);
      break;
    }
    case nir_op_ifind_msb: {
-      struct ir3_instruction *cmp;
-      dst[0] = ir3_CLZ_S(b, src[0], 0);
-      cmp = ir3_CMPS_S(b, dst[0], 0, create_immed_shared(b, 0, use_shared), 0);
-      cmp->cat2.condition = IR3_COND_GE;
-      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
-                                        dst[0], 0),
-                           0, cmp, 0, dst[0], 0);
+      struct ir3_instruction_rpt cmp;
+      dst = ir3_CLZ_S_rpt(b, dst_sz, src[0], 0);
+      cmp =
+         ir3_CMPS_S_rpt(b, dst_sz, dst, 0,
+                        create_immed_shared_rpt(b, dst_sz, 0, use_shared), 0);
+      set_cat2_condition(cmp.rpts, dst_sz, IR3_COND_GE);
+      dst = ir3_SEL_B32_rpt(
+         b, dst_sz,
+         ir3_SUB_U_rpt(b, dst_sz,
+                       create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
+                       dst, 0),
+         0, cmp, 0, dst, 0);
      break;
    }
    case nir_op_ufind_msb:
-      dst[0] = ir3_CLZ_B(b, src[0], 0);
-      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
-                                        dst[0], 0),
-                           0, src[0], 0, dst[0], 0);
+      dst = ir3_CLZ_B_rpt(b, dst_sz, src[0], 0);
+      dst = ir3_SEL_B32_rpt(
+         b, dst_sz,
+         ir3_SUB_U_rpt(b, dst_sz,
+                       create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
+                       dst, 0),
+         0, src[0], 0, dst, 0);
      break;
    case nir_op_find_lsb:
-      dst[0] = ir3_BFREV_B(b, src[0], 0);
-      dst[0] = ir3_CLZ_B(b, dst[0], 0);
+      dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
+      dst = ir3_CLZ_B_rpt(b, dst_sz, dst, 0);
      break;
    case nir_op_bitfield_reverse:
-      dst[0] = ir3_BFREV_B(b, src[0], 0);
+      dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
      break;
 
    case nir_op_uadd_sat:
-      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
-      dst[0]->flags |= IR3_INSTR_SAT;
+      dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
    case nir_op_iadd_sat:
-      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
-      dst[0]->flags |= IR3_INSTR_SAT;
+      dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
    case nir_op_usub_sat:
-      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
-      dst[0]->flags |= IR3_INSTR_SAT;
+      dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
    case nir_op_isub_sat:
-      dst[0] = ir3_SUB_S(b, src[0], 0, src[1], 0);
-      dst[0]->flags |= IR3_INSTR_SAT;
+      dst = ir3_SUB_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
+      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
 
    case nir_op_udot_4x8_uadd:
@@ -915,10 +1043,15 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
    case nir_op_sdot_4x8_iadd_sat:
    case nir_op_sudot_4x8_iadd:
    case nir_op_sudot_4x8_iadd_sat: {
+      assert(dst_sz == 1);
+
+      struct ir3_instruction *src_rpt0[] = {src[0].rpts[0], src[1].rpts[0],
+                                            src[2].rpts[0]};
+
      if (ctx->compiler->has_dp4acc) {
-         emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
+         emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst.rpts, src_rpt0);
      } else if (ctx->compiler->has_dp2acc) {
-         emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
+         emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst.rpts, src_rpt0);
      } else {
         ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
                           nir_op_infos[alu->op].name);
@@ -935,12 +1068,12 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
 
    if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
       assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
-      assert(dst_sz == 1);
    } else {
       /* 1-bit values stored in 32-bit registers are only valid for certain
        * ALU ops.
        */
       switch (alu->op) {
+      case nir_op_mov:
       case nir_op_iand:
       case nir_op_ior:
       case nir_op_ixor:
@@ -952,6 +1085,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
       }
    }
 
+   cp_instrs(def, dst.rpts, dst_sz);
    ir3_put_def(ctx, &alu->def);
 }
 
@@ -2365,6 +2499,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    struct ir3_block *b = ctx->block;
    unsigned dest_components = nir_intrinsic_dest_components(intr);
    int idx;
+   bool create_rpt = false;
 
    if (info->has_dest) {
       dst = ir3_get_def(ctx, &intr->def, dest_components);
@@ -2443,6 +2578,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
                b, idx + i,
                intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
          }
+         create_rpt = true;
       } else {
          src = ctx->compiler->has_scalar_alu ?
                   ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
@@ -2708,6 +2844,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
             unsigned n = idx * 4 + i;
            dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
         }
+         create_rpt = true;
         break;
      case nir_intrinsic_load_front_face:
         if (!ctx->frag_face) {
@@ -2749,16 +2886,19 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
      }
+      create_rpt = true;
      break;
    case nir_intrinsic_load_num_workgroups:
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
      }
+      create_rpt = true;
      break;
    case nir_intrinsic_load_workgroup_size:
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
      }
+      create_rpt = true;
      break;
    case nir_intrinsic_load_subgroup_size: {
      assert(ctx->so->type == MESA_SHADER_COMPUTE ||
@@ -2784,11 +2924,13 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_OUTER_LEVEL_X + i);
      }
+      create_rpt = true;
      break;
    case nir_intrinsic_load_tess_level_inner_default:
      for (int i = 0; i < dest_components; i++) {
         dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_INNER_LEVEL_X + i);
      }
+      create_rpt = true;
      break;
    case nir_intrinsic_load_frag_invocation_count:
      dst[0] = create_driver_param(ctx, IR3_DP_FS_FRAG_INVOCATION_COUNT);
@@ -2803,6 +2945,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
         for (int i = 0; i < dest_components; i++) {
            dst[i] = create_driver_param(ctx, param + 4 * view + i);
         }
+         create_rpt = true;
      } else {
         struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
         for (int i = 0; i < dest_components; i++) {
@@ -3131,8 +3274,11 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
      break;
    }
 
-   if (info->has_dest)
+   if (info->has_dest) {
+      if (create_rpt)
+         ir3_instr_create_rpt(dst, dest_components);
      ir3_put_def(ctx, &intr->def);
+   }
 }
 
 static void
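
Illustration (assumed behavior, not part of the patch): driver params for a
given intrinsic land in consecutive const-file components, which is why the
per-component movs built above are eligible for repeat grouping. With
create_rpt set and dest_components == 3, three scalar movs such as

    mov.u32u32 r0.x, c4.x
    mov.u32u32 r0.y, c4.y
    mov.u32u32 r0.z, c4.z

can later be encoded as one repeated instruction, roughly
(rpt2)mov.u32u32 r0.x, (r)c4.x, which is what ir3_instr_create_rpt() sets
the group up for.
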
@@ -4042,16 +4188,17 @@ static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
  * instead of adding an explicit not.b/sub.u instruction.
  */
 static struct ir3_instruction *
-get_branch_condition(struct ir3_context *ctx, nir_src *src, bool *inv)
+get_branch_condition(struct ir3_context *ctx, nir_src *src, unsigned comp,
+                     bool *inv)
 {
-   struct ir3_instruction *condition = ir3_get_src(ctx, src)[0];
+   struct ir3_instruction *condition = ir3_get_src(ctx, src)[comp];
 
    if (src->ssa->parent_instr->type == nir_instr_type_alu) {
       nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
 
       if (nir_cond->op == nir_op_inot) {
-         struct ir3_instruction *inv_cond =
-            get_branch_condition(ctx, &nir_cond->src[0].src, inv);
+         struct ir3_instruction *inv_cond = get_branch_condition(
+            ctx, &nir_cond->src[0].src, nir_cond->src[0].swizzle[comp], inv);
          *inv = !*inv;
          return inv_cond;
       }
@@ -4087,10 +4234,10 @@ fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
       return NULL;
 
    bool inv1, inv2;
-   struct ir3_instruction *cond1 =
-      get_branch_condition(ctx, &alu_cond->src[0].src, &inv1);
-   struct ir3_instruction *cond2 =
-      get_branch_condition(ctx, &alu_cond->src[1].src, &inv2);
+   struct ir3_instruction *cond1 = get_branch_condition(
+      ctx, &alu_cond->src[0].src, alu_cond->src[0].swizzle[0], &inv1);
+   struct ir3_instruction *cond2 = get_branch_condition(
+      ctx, &alu_cond->src[1].src, alu_cond->src[1].swizzle[0], &inv2);
 
    struct ir3_instruction *branch;
    if (alu_cond->op == nir_op_iand) {
@@ -4224,7 +4371,7 @@ emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
 
    bool inv;
    struct ir3_instruction *condition =
-      get_branch_condition(ctx, &nif->condition, &inv);
+      get_branch_condition(ctx, &nif->condition, 0, &inv);
 
    struct ir3_instruction *pred, *pred_inv;
 
    if (!inv) {
@@ -4253,7 +4400,8 @@ emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
       return predicated;
 
    bool inv1;
-   struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1);
+   struct ir3_instruction *cond1 =
+      get_branch_condition(ctx, nir_cond, 0, &inv1);
 
    struct ir3_instruction *branch =
       ir3_BR(ctx->block, cond1, IR3_REG_PREDICATE);
    branch->cat0.inv1 = inv1;
diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c
index 695b5a6b27f..1edf61b2259 100644
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -121,9 +121,17 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
    if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch)
       NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
 
-   NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);
+   bool vectorized = false;
+   NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,
+            NULL);
 
-   NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
+   if (vectorized) {
+      NIR_PASS_V(ctx->s, nir_opt_undef);
+      NIR_PASS_V(ctx->s, nir_copy_prop);
+      NIR_PASS_V(ctx->s, nir_opt_dce);
+   }
+
+   NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);
 
    /* This has to go at the absolute end to make sure that all SSA defs are
     * correctly marked.
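
Usage note (sketch, not part of the patch): nir_opt_vectorize's filter
callback returns the maximum vector width to build for an instruction, or 0
to leave it scalar; ir3_nir_vectorize_filter (added in ir3_rpt.c below)
answers 4 for any ALU op that passes ir3_supports_vectorized_nir_op(). A
hypothetical variant that additionally kept 16-bit ALU scalar would follow
the same shape:

    static uint8_t
    vectorize_filter_example(const nir_instr *instr, const void *data)
    {
       if (instr->type != nir_instr_type_alu)
          return 0;

       struct nir_alu_instr *alu = nir_instr_as_alu(instr);

       /* hypothetical extra restriction, for illustration only */
       if (alu->def.bit_size == 16)
          return 0;

       return ir3_supports_vectorized_nir_op(alu->op) ? 4 : 0;
    }

The nir_opt_undef/nir_copy_prop/nir_opt_dce cleanup above runs only when
vectorization made progress, since rebuilding vectors can leave dead scalar
ops behind.
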
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 94b78a037bf..ba1cb751ed8 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -61,6 +61,9 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                              unsigned topology);
 void ir3_nir_lower_gs(nir_shader *shader);
 
+bool ir3_supports_vectorized_nir_op(nir_op op);
+uint8_t ir3_nir_vectorize_filter(const nir_instr *instr, const void *data);
+
 /*
  * 64b related lowering:
  */
diff --git a/src/freedreno/ir3/ir3_rpt.c b/src/freedreno/ir3/ir3_rpt.c
index 45b9c5d136d..597fe911599 100644
--- a/src/freedreno/ir3/ir3_rpt.c
+++ b/src/freedreno/ir3/ir3_rpt.c
@@ -5,6 +5,52 @@
 
 #include "ir3_nir.h"
 
+bool
+ir3_supports_vectorized_nir_op(nir_op op)
+{
+   switch (op) {
+   /* TODO: emitted as absneg which can often be folded away (e.g., into
+    * (neg)). This seems to often fail when repeated.
+    */
+   case nir_op_b2b1:
+
+   /* dsx/dsy don't seem to support repeat. */
+   case nir_op_fddx:
+   case nir_op_fddx_coarse:
+   case nir_op_fddx_fine:
+   case nir_op_fddy:
+   case nir_op_fddy_coarse:
+   case nir_op_fddy_fine:
+
+   /* dp2acc/dp4acc don't seem to support repeat. */
+   case nir_op_udot_4x8_uadd:
+   case nir_op_udot_4x8_uadd_sat:
+   case nir_op_sudot_4x8_iadd:
+   case nir_op_sudot_4x8_iadd_sat:
+
+   /* Among SFU instructions, only rcp doesn't seem to support repeat. */
+   case nir_op_frcp:
+      return false;
+
+   default:
+      return true;
+   }
+}
+
+uint8_t
+ir3_nir_vectorize_filter(const nir_instr *instr, const void *data)
+{
+   if (instr->type != nir_instr_type_alu)
+      return 0;
+
+   struct nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   if (!ir3_supports_vectorized_nir_op(alu->op))
+      return 0;
+
+   return 4;
+}
+
 static void
 rpt_list_split(struct list_head *list, struct list_head *at)
 {
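
End-to-end illustration (not part of the patch; the disassembly is
approximate): after nir_opt_vectorize rebuilds, say,

    vec4 32 ssa_2 = fadd ssa_0, ssa_1

emit_alu reaches it with dst_sz == 4, emits four scalar adds through
ir3_ADD_F_rpt, and groups them with ir3_instr_create_rpt, so that later
stages can encode the group as a single repeated instruction along the
lines of

    (rpt3)add.f r0.x, (r)r1.x, (r)r2.x

where each (r)-marked operand and the destination advance by one register
component per repeat.
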