ir3: lower vectorized NIR instructions

Use the new repeat group builders to lower vectorized NIR instructions.
Run a NIR vectorization pass right before lowering.

Support for repeated instructions is added over a number of different
commits. Here's how they all tie together:

ir3 is a scalar architecture and as such most instructions cannot be
vectorized. However, many instructions support the (rptN) modifier, which
allows us to mimic vector instructions. Whenever an instruction has the
(rptN) modifier set, it executes N more times, incrementing its
destination register for each repetition. Additionally, source registers
with the (r) flag set are incremented as well.

For example:

(rpt1)add.f r0.x, (r)r1.x, r2.x

is the same as:

add.f r0.x, r1.x, r2.x
add.f r0.y, r1.y, r2.x

The main benefit of using repeated instructions is a reduction in code
size. Since every iteration is still executed as a scalar instruction,
there's no direct benefit in terms of runtime. The only exception seems
to be for 3-source instructions pre-a7xx: if one of the sources is
constant (i.e., without the (r) flag), a repeated instruction executes
faster than the equivalent expanded sequence. Presumably, this is
because the ALU only has 2 register read ports. I have not been able to
measure this difference on a7xx though.

Support for repeated instructions consists of two parts. First, we need
to make sure NIR is (mostly) vectorized when translating to ir3. I have
not been able to find a way to keep NIR vectorized all the way and still
generate decent code. Therefore, I have taken the approach of
vectorizing the (scalarized) NIR right before translating it to ir3.
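
Concretely, the new step lives in ir3_context_init() (see the
ir3_context.c hunk below) and roughly boils down to the following; the
extra cleanup passes only run when something was actually vectorized:

bool vectorized = false;
NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,
         NULL);
if (vectorized) {
   /* vectorization can leave dead or undef-only scalar defs behind */
   NIR_PASS_V(ctx->s, nir_opt_undef);
   NIR_PASS_V(ctx->s, nir_copy_prop);
   NIR_PASS_V(ctx->s, nir_opt_dce);
}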

Second, ir3 needs to be adapted to ingest vectorized NIR and translate
it to repeated instructions. To this end, I have introduced the concept
of "repeat groups" to ir3. A repeat group is a group of instructions
that were produced from a vectorized NIR operation and linked together.
They are, however, still separate scalar instructions until quite late.
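
The builders pass such a group around as a small by-value struct. Its
definition is not part of this diff, but judging from its usage below
(dst.rpts[i], ARRAY_SIZE(src[0].rpts)) it is essentially an array of
per-component scalar instructions, along the lines of this sketch (the
array size is only a placeholder, not the real value):

struct ir3_instruction_rpt {
   /* one scalar instruction per vector component */
   struct ir3_instruction *rpts[16];
};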

More concretely:
1. Instruction emission: for every vectorized NIR operation, emit
   separate scalar instructions for its components and link them
   together in a repeat group. For every instruction builder ir3_X, a
   new repeat builder ir3_X_rpt has been added to facilitate this.
2. Optimization passes: for now, repeat groups are completely ignored by
   optimizations.
3. Pre-RA: clean up repeat groups that can never be merged into an
   actual rptN instruction (e.g., because their instructions are not
   consecutive anymore). This ensures no useless merge sets will be
   created in the next step.
4. RA: create merge sets for the sources and defs of the instructions in
   repeat groups. This way, RA will try to allocate consecutive
   registers for them. This is not forced, though, because we prefer
   splitting up repeat groups over creating movs to reorder registers.
5. Post-RA: create actual rptN instructions for repeat groups where the
   allocated registers allow it.

The idea behind step 2 is that we prefer any potential optimizations to
take precedence over creating rptN instructions, as the latter only
yield a code size benefit. However, it might be interesting to
investigate if we could make some optimizations repeat aware. For
example, the scheduler could try to schedule instructions of a repeat
group together.
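
As an illustration of step 1, lowering a vectorized fadd now goes
through a single repeat builder call (taken from the emit_alu() hunk
below). dst and src[i] are ir3_instruction_rpt values and the 0
arguments are per-source flags; the builder emits dst_sz scalar add.f
instructions and links them into one repeat group:

case nir_op_fadd:
   dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
   break;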

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28341>

@ -39,6 +39,83 @@
#include "ir3.h"
#include "ir3_context.h"
static struct ir3_instruction_rpt
rpt_instr(struct ir3_instruction *instr, unsigned nrpt)
{
struct ir3_instruction_rpt dst = {{0}};
for (unsigned i = 0; i < nrpt; ++i)
dst.rpts[i] = instr;
return dst;
}
static void
cp_instrs(struct ir3_instruction *dst[], struct ir3_instruction *instrs[],
unsigned n)
{
for (unsigned i = 0; i < n; ++i)
dst[i] = instrs[i];
}
static struct ir3_instruction_rpt
create_immed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val)
{
return rpt_instr(create_immed(block, val), nrpt);
}
static struct ir3_instruction_rpt
create_immed_shared_rpt(struct ir3_block *block, unsigned nrpt, uint32_t val,
bool shared)
{
return rpt_instr(create_immed_shared(block, val, shared), nrpt);
}
static struct ir3_instruction_rpt
create_immed_typed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val,
type_t type)
{
return rpt_instr(create_immed_typed(block, val, type), nrpt);
}
static inline struct ir3_instruction_rpt
create_immed_typed_shared_rpt(struct ir3_block *block, unsigned nrpt,
uint32_t val, type_t type, bool shared)
{
return rpt_instr(create_immed_typed_shared(block, val, type, shared), nrpt);
}
static void
set_instr_flags(struct ir3_instruction *instrs[], unsigned n,
ir3_instruction_flags flags)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->flags |= flags;
}
static void
set_cat1_round(struct ir3_instruction *instrs[], unsigned n, round_t round)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->cat1.round = round;
}
static void
set_cat2_condition(struct ir3_instruction *instrs[], unsigned n,
unsigned condition)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->cat2.condition = condition;
}
static void
set_dst_flags(struct ir3_instruction *instrs[], unsigned n,
ir3_register_flags flags)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->dsts[0]->flags |= flags;
}
void
ir3_handle_nonuniform(struct ir3_instruction *instr,
nir_intrinsic_instr *intrin)
@ -135,9 +212,9 @@ create_driver_param_indirect(struct ir3_context *ctx, enum ir3_driver_param dp,
* alu/sfu instructions:
*/
static struct ir3_instruction *
create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
unsigned src_bitsize, nir_op op)
static struct ir3_instruction_rpt
create_cov(struct ir3_context *ctx, unsigned nrpt,
struct ir3_instruction_rpt src, unsigned src_bitsize, nir_op op)
{
type_t src_type, dst_type;
@ -277,9 +354,11 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
* is used to achieve the result.
*/
if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
struct ir3_instruction *cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
cov->dsts[0]->flags |= type_flags(dst_type);
struct ir3_instruction_rpt mask =
create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
struct ir3_instruction_rpt cov =
ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
set_dst_flags(cov.rpts, nrpt, type_flags(dst_type));
return cov;
}
@ -291,15 +370,16 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
op == nir_op_u2f32 || op == nir_op_i2f32);
struct ir3_instruction *cov;
struct ir3_instruction_rpt cov;
if (op == nir_op_u2f16 || op == nir_op_u2f32) {
struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
cov->dsts[0]->flags |= IR3_REG_HALF;
cov = ir3_COV(ctx->block, cov, TYPE_U16, dst_type);
struct ir3_instruction_rpt mask =
create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
cov = ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
set_dst_flags(cov.rpts, nrpt, IR3_REG_HALF);
cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_U16, dst_type);
} else {
cov = ir3_COV(ctx->block, src, TYPE_U8, TYPE_S16);
cov = ir3_COV(ctx->block, cov, TYPE_S16, dst_type);
cov = ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U8, TYPE_S16);
cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_S16, dst_type);
}
return cov;
}
@ -312,17 +392,19 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
assert(op == nir_op_f2u8 || op == nir_op_f2i8);
type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, intermediate_type);
cov = ir3_COV(ctx->block, cov, intermediate_type, TYPE_U8);
struct ir3_instruction_rpt cov =
ir3_COV_rpt(ctx->block, nrpt, src, src_type, intermediate_type);
cov = ir3_COV_rpt(ctx->block, nrpt, cov, intermediate_type, TYPE_U8);
return cov;
}
struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
struct ir3_instruction_rpt cov =
ir3_COV_rpt(ctx->block, nrpt, src, src_type, dst_type);
if (op == nir_op_f2f16_rtne) {
cov->cat1.round = ROUND_EVEN;
set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
} else if (op == nir_op_f2f16_rtz) {
cov->cat1.round = ROUND_ZERO;
set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
} else if (dst_type == TYPE_F16 || dst_type == TYPE_F32) {
unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
nir_alu_type type =
@ -330,23 +412,23 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
nir_rounding_mode rounding_mode =
nir_get_rounding_mode_from_float_controls(execution_mode, type);
if (rounding_mode == nir_rounding_mode_rtne)
cov->cat1.round = ROUND_EVEN;
set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
else if (rounding_mode == nir_rounding_mode_rtz)
cov->cat1.round = ROUND_ZERO;
set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
}
return cov;
}
/* For shift instructions NIR always has shift amount as 32 bit integer */
static struct ir3_instruction *
resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
unsigned bs)
static struct ir3_instruction_rpt
resize_shift_amount(struct ir3_context *ctx, unsigned nrpt,
struct ir3_instruction_rpt src, unsigned bs)
{
if (bs == 16)
return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U16);
else if (bs == 8)
return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U8);
return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U8);
else
return src;
}
@ -452,18 +534,52 @@ emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
}
}
static bool
all_sat_compatible(struct ir3_instruction *instrs[], unsigned n)
{
for (unsigned i = 0; i < n; i++) {
if (!is_sat_compatible(instrs[i]->opc))
return false;
}
return true;
}
/* Is src the only use of its def, taking components into account. */
static bool
is_unique_use(nir_src *src)
{
nir_def *def = src->ssa;
if (list_is_singular(&def->uses))
return true;
nir_component_mask_t src_read_mask = nir_src_components_read(src);
nir_foreach_use (use, def) {
if (use == src)
continue;
if (nir_src_components_read(use) & src_read_mask)
return false;
}
return true;
}
static void
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
{
const nir_op_info *info = &nir_op_infos[alu->op];
struct ir3_instruction **dst, *src[info->num_inputs];
struct ir3_instruction_rpt dst, src[info->num_inputs];
unsigned bs[info->num_inputs]; /* bit size */
struct ir3_block *b = ctx->block;
unsigned dst_sz, wrmask;
unsigned dst_sz;
type_t dst_type = type_uint_size(alu->def.bit_size);
dst_sz = alu->def.num_components;
wrmask = (1 << dst_sz) - 1;
assert(dst_sz == 1 || ir3_supports_vectorized_nir_op(alu->op));
assert(dst_sz <= ARRAY_SIZE(src[0].rpts));
bool use_shared = !alu->def.divergent &&
ctx->compiler->has_scalar_alu &&
@ -477,7 +593,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
/* not supported in HW, we have to fall back to normal registers */
alu->op != nir_op_ffma;
dst = ir3_get_def(ctx, &alu->def, dst_sz);
struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);
/* Vectors are special in that they have non-scalarized writemasks,
* and just take the first swizzle channel for each argument in
@ -486,53 +602,39 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
(alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
(alu->op == nir_op_vec16)) {
struct ir3_instruction_rpt src;
assert(info->num_inputs <= ARRAY_SIZE(src.rpts));
for (int i = 0; i < info->num_inputs; i++) {
nir_alu_src *asrc = &alu->src[i];
src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
if (!src[i])
src[i] = create_immed_typed_shared(ctx->block, 0, dst_type, use_shared);
dst[i] = ir3_MOV(b, src[i], dst_type);
src.rpts[i] =
ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
compile_assert(ctx, src.rpts[i]);
}
dst = ir3_MOV_rpt(b, dst_sz, src, dst_type);
cp_instrs(def, dst.rpts, dst_sz);
ir3_put_def(ctx, &alu->def);
return;
}
/* We also get mov's with more than one component for mov's so
* handle those specially:
*/
if (alu->op == nir_op_mov) {
nir_alu_src *asrc = &alu->src[0];
struct ir3_instruction *const *src0 =
for (int i = 0; i < info->num_inputs; i++) {
nir_alu_src *asrc = &alu->src[i];
struct ir3_instruction *const *input_src =
ir3_get_src_shared(ctx, &asrc->src, use_shared);
for (unsigned i = 0; i < dst_sz; i++) {
if (wrmask & (1 << i)) {
dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
} else {
dst[i] = NULL;
}
}
ir3_put_def(ctx, &alu->def);
return;
}
/* General case: We can just grab the one used channel per src. */
assert(alu->def.num_components == 1);
for (int i = 0; i < info->num_inputs; i++) {
nir_alu_src *asrc = &alu->src[i];
src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
bs[i] = nir_src_bit_size(asrc->src);
compile_assert(ctx, src[i]);
for (unsigned rpt = 0; rpt < dst_sz; rpt++) {
src[i].rpts[rpt] = input_src[asrc->swizzle[rpt]];
compile_assert(ctx, src[i].rpts[rpt]);
}
}
switch (alu->op) {
case nir_op_mov:
dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
break;
case nir_op_f2f32:
case nir_op_f2f16_rtne:
case nir_op_f2f16_rtz:
@ -558,11 +660,12 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
dst = create_cov(ctx, dst_sz, src[0], bs[0], alu->op);
break;
case nir_op_fquantize2f16:
dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
dst = create_cov(ctx, dst_sz,
create_cov(ctx, dst_sz, src[0], 32, nir_op_f2f16_rtne),
16, nir_op_f2f32);
break;
@ -574,7 +677,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
*
* A negate can turn those into a 1 or 0 for us.
*/
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
break;
case nir_op_b2b32:
@ -583,20 +686,20 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
*
* A negate can turn those into a ~0 for us.
*/
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
break;
case nir_op_fneg:
dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FNEG);
break;
case nir_op_fabs:
dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FABS);
break;
case nir_op_fmax:
dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fmin:
dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
dst = ir3_MIN_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fsat:
/* if there is just a single use of the src, and it supports
@ -604,222 +707,235 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
* src instruction and create a mov. This is easier for cp
* to eliminate.
*/
if (is_sat_compatible(src[0]->opc) &&
(list_length(&alu->src[0].src.ssa->uses) == 1)) {
src[0]->flags |= IR3_INSTR_SAT;
dst[0] = ir3_MOV(b, src[0], dst_type);
if (all_sat_compatible(src[0].rpts, dst_sz) &&
is_unique_use(&alu->src[0].src)) {
set_instr_flags(src[0].rpts, dst_sz, IR3_INSTR_SAT);
dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
} else {
/* otherwise generate a max.f that saturates.. blob does
* similar (generating a cat2 mov using max.f)
*/
dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[0], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
}
break;
case nir_op_fmul:
dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
dst = ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fadd:
dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fsub:
dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
break;
case nir_op_ffma:
dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
break;
case nir_op_flt:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_LT;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
break;
case nir_op_fge:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_GE;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break;
case nir_op_feq:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_EQ;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
break;
case nir_op_fneu:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_NE;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
break;
case nir_op_fceil:
dst[0] = ir3_CEIL_F(b, src[0], 0);
dst = ir3_CEIL_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_ffloor:
dst[0] = ir3_FLOOR_F(b, src[0], 0);
dst = ir3_FLOOR_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_ftrunc:
dst[0] = ir3_TRUNC_F(b, src[0], 0);
dst = ir3_TRUNC_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fround_even:
dst[0] = ir3_RNDNE_F(b, src[0], 0);
dst = ir3_RNDNE_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fsign:
dst[0] = ir3_SIGN_F(b, src[0], 0);
dst = ir3_SIGN_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fsin:
dst[0] = ir3_SIN(b, src[0], 0);
dst = ir3_SIN_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fcos:
dst[0] = ir3_COS(b, src[0], 0);
dst = ir3_COS_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_frsq:
dst[0] = ir3_RSQ(b, src[0], 0);
dst = ir3_RSQ_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_frcp:
dst[0] = ir3_RCP(b, src[0], 0);
assert(dst_sz == 1);
dst.rpts[0] = ir3_RCP(b, src[0].rpts[0], 0);
break;
case nir_op_flog2:
dst[0] = ir3_LOG2(b, src[0], 0);
dst = ir3_LOG2_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fexp2:
dst[0] = ir3_EXP2(b, src[0], 0);
dst = ir3_EXP2_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fsqrt:
dst[0] = ir3_SQRT(b, src[0], 0);
dst = ir3_SQRT_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_iabs:
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SABS);
break;
case nir_op_iadd:
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ihadd:
dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
dst[0]->dsts[0]->flags |= IR3_REG_EI;
dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
break;
case nir_op_uhadd:
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
dst[0]->dsts[0]->flags |= IR3_REG_EI;
dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
break;
case nir_op_iand:
dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
dst = ir3_AND_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imax:
dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
dst = ir3_MAX_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_umax:
dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
dst = ir3_MAX_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imin:
dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
dst = ir3_MIN_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_umin:
dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
dst = ir3_MIN_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_umul_low:
dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
dst = ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imadsh_mix16:
if (use_shared) {
struct ir3_instruction *sixteen = create_immed_shared(b, 16, true);
struct ir3_instruction *src1 = ir3_SHR_B(b, src[1], 0, sixteen, 0);
struct ir3_instruction *mul = ir3_MULL_U(b, src[0], 0, src1, 0);
dst[0] = ir3_ADD_U(b, ir3_SHL_B(b, mul, 0, sixteen, 0), 0, src[2], 0);
struct ir3_instruction_rpt sixteen =
create_immed_shared_rpt(b, dst_sz, 16, true);
struct ir3_instruction_rpt src1 =
ir3_SHR_B_rpt(b, dst_sz, src[1], 0, sixteen, 0);
struct ir3_instruction_rpt mul =
ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src1, 0);
dst = ir3_ADD_U_rpt(b, dst_sz,
ir3_SHL_B_rpt(b, dst_sz, mul, 0, sixteen, 0), 0,
src[2], 0);
} else {
dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
dst = ir3_MADSH_M16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
}
break;
case nir_op_imad24_ir3:
if (use_shared) {
dst[0] = ir3_ADD_U(b, ir3_MUL_U24(b, src[0], 0, src[1], 0), 0, src[2], 0);
dst = ir3_ADD_U_rpt(b, dst_sz,
ir3_MUL_U24_rpt(b, dst_sz, src[0], 0, src[1], 0),
0, src[2], 0);
} else {
dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
dst = ir3_MAD_S24_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
}
break;
case nir_op_imul:
compile_assert(ctx, alu->def.bit_size == 8 || alu->def.bit_size == 16);
dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imul24:
dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ineg:
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
break;
case nir_op_inot:
if (bs[0] == 1) {
struct ir3_instruction *one =
create_immed_typed_shared(ctx->block, 1, ctx->compiler->bool_type,
use_shared);
dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
struct ir3_instruction_rpt one = create_immed_typed_shared_rpt(
ctx->block, dst_sz, 1, ctx->compiler->bool_type, use_shared);
dst = ir3_SUB_U_rpt(b, dst_sz, one, 0, src[0], 0);
} else {
dst[0] = ir3_NOT_B(b, src[0], 0);
dst = ir3_NOT_B_rpt(ctx->block, dst_sz, src[0], 0);
}
break;
case nir_op_ior:
dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
dst = ir3_OR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ishl:
dst[0] =
ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
dst = ir3_SHL_B_rpt(ctx->block, dst_sz, src[0], 0,
resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
break;
case nir_op_ishr:
dst[0] =
ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
dst = ir3_ASHR_B_rpt(ctx->block, dst_sz, src[0], 0,
resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
break;
case nir_op_isub:
dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ixor:
dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
dst = ir3_XOR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ushr:
dst[0] =
ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
dst = ir3_SHR_B_rpt(ctx->block, dst_sz, src[0], 0,
resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
break;
case nir_op_ilt:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_LT;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
break;
case nir_op_ige:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_GE;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break;
case nir_op_ieq:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_EQ;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
break;
case nir_op_ine:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_NE;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
break;
case nir_op_ult:
dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_LT;
dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
break;
case nir_op_uge:
dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_GE;
dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break;
case nir_op_bcsel: {
struct ir3_instruction *cond = ir3_get_cond_for_nonzero_compare(src[0]);
struct ir3_instruction_rpt conds;
compile_assert(ctx, bs[1] == bs[2]);
/* TODO: repeat the covs when possible. */
for (unsigned rpt = 0; rpt < dst_sz; ++rpt) {
struct ir3_instruction *cond =
ir3_get_cond_for_nonzero_compare(src[0].rpts[rpt]);
/* The condition's size has to match the other two arguments' size, so
* convert down if necessary.
*
* Single hashtable is fine, because the conversion will either be
* 16->32 or 32->16, but never both
*/
if (is_half(src[1]) != is_half(cond)) {
struct hash_entry *prev_entry =
_mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
if (is_half(src[1].rpts[rpt]) != is_half(cond)) {
struct hash_entry *prev_entry = _mesa_hash_table_search(
ctx->sel_cond_conversions, src[0].rpts[rpt]);
if (prev_entry) {
cond = prev_entry->data;
} else {
if (is_half(cond)) {
if (bs[0] == 8) {
/* Zero-extension of an 8-bit value has to be done through masking,
* as in create_cov.
/* Zero-extension of an 8-bit value has to be done through
* masking, as in create_cov.
*/
struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
struct ir3_instruction *mask =
create_immed_typed(b, 0xff, TYPE_U8);
cond = ir3_AND_B(b, cond, 0, mask, 0);
} else {
cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
@ -827,86 +943,98 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
} else {
cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
}
_mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
_mesa_hash_table_insert(ctx->sel_cond_conversions,
src[0].rpts[rpt], cond);
}
}
conds.rpts[rpt] = cond;
}
if (is_half(src[1])) {
dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
} else {
dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
}
if (is_half(src[1].rpts[0]))
dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
else
dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
break;
}
case nir_op_bit_count: {
if (ctx->compiler->gen < 5 || (src[0]->dsts[0]->flags & IR3_REG_HALF)) {
dst[0] = ir3_CBITS_B(b, src[0], 0);
if (ctx->compiler->gen < 5 ||
(src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {
dst = ir3_CBITS_B_rpt(b, dst_sz, src[0], 0);
break;
}
// We need to do this 16b at a time on a5xx+a6xx. Once half-precision
// support is in place, this should probably move to a NIR lowering pass:
struct ir3_instruction *hi, *lo;
struct ir3_instruction_rpt hi, lo;
hi = ir3_COV(b,
ir3_SHR_B(b, src[0], 0, create_immed_shared(b, 16, use_shared), 0),
hi = ir3_COV_rpt(
b, dst_sz,
ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
create_immed_shared_rpt(b, dst_sz, 16, use_shared), 0),
TYPE_U32, TYPE_U16);
lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
lo = ir3_COV_rpt(b, dst_sz, src[0], TYPE_U32, TYPE_U16);
hi = ir3_CBITS_B(b, hi, 0);
lo = ir3_CBITS_B(b, lo, 0);
hi = ir3_CBITS_B_rpt(b, dst_sz, hi, 0);
lo = ir3_CBITS_B_rpt(b, dst_sz, lo, 0);
// TODO maybe the builders should default to making dst half-precision
// if the src's were half precision, to make this less awkward.. otoh
// we should probably just do this lowering in NIR.
hi->dsts[0]->flags |= IR3_REG_HALF;
lo->dsts[0]->flags |= IR3_REG_HALF;
set_dst_flags(hi.rpts, dst_sz, IR3_REG_HALF);
set_dst_flags(lo.rpts, dst_sz, IR3_REG_HALF);
dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
dst[0]->dsts[0]->flags |= IR3_REG_HALF;
dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
dst = ir3_ADD_S_rpt(b, dst_sz, hi, 0, lo, 0);
set_dst_flags(dst.rpts, dst_sz, IR3_REG_HALF);
dst = ir3_COV_rpt(b, dst_sz, dst, TYPE_U16, TYPE_U32);
break;
}
case nir_op_ifind_msb: {
struct ir3_instruction *cmp;
dst[0] = ir3_CLZ_S(b, src[0], 0);
cmp = ir3_CMPS_S(b, dst[0], 0, create_immed_shared(b, 0, use_shared), 0);
cmp->cat2.condition = IR3_COND_GE;
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
dst[0], 0),
0, cmp, 0, dst[0], 0);
struct ir3_instruction_rpt cmp;
dst = ir3_CLZ_S_rpt(b, dst_sz, src[0], 0);
cmp =
ir3_CMPS_S_rpt(b, dst_sz, dst, 0,
create_immed_shared_rpt(b, dst_sz, 0, use_shared), 0);
set_cat2_condition(cmp.rpts, dst_sz, IR3_COND_GE);
dst = ir3_SEL_B32_rpt(
b, dst_sz,
ir3_SUB_U_rpt(b, dst_sz,
create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
dst, 0),
0, cmp, 0, dst, 0);
break;
}
case nir_op_ufind_msb:
dst[0] = ir3_CLZ_B(b, src[0], 0);
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
dst[0], 0),
0, src[0], 0, dst[0], 0);
dst = ir3_CLZ_B_rpt(b, dst_sz, src[0], 0);
dst = ir3_SEL_B32_rpt(
b, dst_sz,
ir3_SUB_U_rpt(b, dst_sz,
create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
dst, 0),
0, src[0], 0, dst, 0);
break;
case nir_op_find_lsb:
dst[0] = ir3_BFREV_B(b, src[0], 0);
dst[0] = ir3_CLZ_B(b, dst[0], 0);
dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
dst = ir3_CLZ_B_rpt(b, dst_sz, dst, 0);
break;
case nir_op_bitfield_reverse:
dst[0] = ir3_BFREV_B(b, src[0], 0);
dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_uadd_sat:
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_iadd_sat:
dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_usub_sat:
dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_isub_sat:
dst[0] = ir3_SUB_S(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_SUB_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_udot_4x8_uadd:
@ -915,10 +1043,15 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
case nir_op_sdot_4x8_iadd_sat:
case nir_op_sudot_4x8_iadd:
case nir_op_sudot_4x8_iadd_sat: {
assert(dst_sz == 1);
struct ir3_instruction *src_rpt0[] = {src[0].rpts[0], src[1].rpts[0],
src[2].rpts[0]};
if (ctx->compiler->has_dp4acc) {
emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst.rpts, src_rpt0);
} else if (ctx->compiler->has_dp2acc) {
emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst.rpts, src_rpt0);
} else {
ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
nir_op_infos[alu->op].name);
@ -935,12 +1068,12 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
assert(dst_sz == 1);
} else {
/* 1-bit values stored in 32-bit registers are only valid for certain
* ALU ops.
*/
switch (alu->op) {
case nir_op_mov:
case nir_op_iand:
case nir_op_ior:
case nir_op_ixor:
@ -952,6 +1085,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
}
}
cp_instrs(def, dst.rpts, dst_sz);
ir3_put_def(ctx, &alu->def);
}
@ -2365,6 +2499,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
struct ir3_block *b = ctx->block;
unsigned dest_components = nir_intrinsic_dest_components(intr);
int idx;
bool create_rpt = false;
if (info->has_dest) {
dst = ir3_get_def(ctx, &intr->def, dest_components);
@ -2443,6 +2578,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
b, idx + i,
intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
}
create_rpt = true;
} else {
src = ctx->compiler->has_scalar_alu ?
ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
@ -2708,6 +2844,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
unsigned n = idx * 4 + i;
dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
}
create_rpt = true;
break;
case nir_intrinsic_load_front_face:
if (!ctx->frag_face) {
@ -2749,16 +2886,19 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_num_workgroups:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_workgroup_size:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_subgroup_size: {
assert(ctx->so->type == MESA_SHADER_COMPUTE ||
@ -2784,11 +2924,13 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_OUTER_LEVEL_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_tess_level_inner_default:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_INNER_LEVEL_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_frag_invocation_count:
dst[0] = create_driver_param(ctx, IR3_DP_FS_FRAG_INVOCATION_COUNT);
@ -2803,6 +2945,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, param + 4 * view + i);
}
create_rpt = true;
} else {
struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
for (int i = 0; i < dest_components; i++) {
@ -3131,8 +3274,11 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
break;
}
if (info->has_dest)
if (info->has_dest) {
if (create_rpt)
ir3_instr_create_rpt(dst, dest_components);
ir3_put_def(ctx, &intr->def);
}
}
static void
@ -4042,16 +4188,17 @@ static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
* instead of adding an explicit not.b/sub.u instruction.
*/
static struct ir3_instruction *
get_branch_condition(struct ir3_context *ctx, nir_src *src, bool *inv)
get_branch_condition(struct ir3_context *ctx, nir_src *src, unsigned comp,
bool *inv)
{
struct ir3_instruction *condition = ir3_get_src(ctx, src)[0];
struct ir3_instruction *condition = ir3_get_src(ctx, src)[comp];
if (src->ssa->parent_instr->type == nir_instr_type_alu) {
nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
if (nir_cond->op == nir_op_inot) {
struct ir3_instruction *inv_cond =
get_branch_condition(ctx, &nir_cond->src[0].src, inv);
struct ir3_instruction *inv_cond = get_branch_condition(
ctx, &nir_cond->src[0].src, nir_cond->src[0].swizzle[comp], inv);
*inv = !*inv;
return inv_cond;
}
@ -4087,10 +4234,10 @@ fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
return NULL;
bool inv1, inv2;
struct ir3_instruction *cond1 =
get_branch_condition(ctx, &alu_cond->src[0].src, &inv1);
struct ir3_instruction *cond2 =
get_branch_condition(ctx, &alu_cond->src[1].src, &inv2);
struct ir3_instruction *cond1 = get_branch_condition(
ctx, &alu_cond->src[0].src, alu_cond->src[0].swizzle[0], &inv1);
struct ir3_instruction *cond2 = get_branch_condition(
ctx, &alu_cond->src[1].src, alu_cond->src[1].swizzle[0], &inv2);
struct ir3_instruction *branch;
if (alu_cond->op == nir_op_iand) {
@ -4224,7 +4371,7 @@ emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
bool inv;
struct ir3_instruction *condition =
get_branch_condition(ctx, &nif->condition, &inv);
get_branch_condition(ctx, &nif->condition, 0, &inv);
struct ir3_instruction *pred, *pred_inv;
if (!inv) {
@ -4253,7 +4400,8 @@ emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
return predicated;
bool inv1;
struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1);
struct ir3_instruction *cond1 =
get_branch_condition(ctx, nir_cond, 0, &inv1);
struct ir3_instruction *branch =
ir3_BR(ctx->block, cond1, IR3_REG_PREDICATE);
branch->cat0.inv1 = inv1;

@ -121,9 +121,17 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch)
NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);
bool vectorized = false;
NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,
NULL);
NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
if (vectorized) {
NIR_PASS_V(ctx->s, nir_opt_undef);
NIR_PASS_V(ctx->s, nir_copy_prop);
NIR_PASS_V(ctx->s, nir_opt_dce);
}
NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);
/* This has to go at the absolute end to make sure that all SSA defs are
* correctly marked.

@ -61,6 +61,9 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
unsigned topology);
void ir3_nir_lower_gs(nir_shader *shader);
bool ir3_supports_vectorized_nir_op(nir_op op);
uint8_t ir3_nir_vectorize_filter(const nir_instr *instr, const void *data);
/*
* 64b related lowering:
*/

@ -5,6 +5,52 @@
#include "ir3_nir.h"
bool
ir3_supports_vectorized_nir_op(nir_op op)
{
switch (op) {
/* TODO: emitted as absneg which can often be folded away (e.g., into
* (neg)). This seems to often fail when repeated.
*/
case nir_op_b2b1:
/* dsx/dsy don't seem to support repeat. */
case nir_op_fddx:
case nir_op_fddx_coarse:
case nir_op_fddx_fine:
case nir_op_fddy:
case nir_op_fddy_coarse:
case nir_op_fddy_fine:
/* dp2acc/dp4acc don't seem to support repeat. */
case nir_op_udot_4x8_uadd:
case nir_op_udot_4x8_uadd_sat:
case nir_op_sudot_4x8_iadd:
case nir_op_sudot_4x8_iadd_sat:
/* Among SFU instructions, only rcp doesn't seem to support repeat. */
case nir_op_frcp:
return false;
default:
return true;
}
}
uint8_t
ir3_nir_vectorize_filter(const nir_instr *instr, const void *data)
{
if (instr->type != nir_instr_type_alu)
return 0;
struct nir_alu_instr *alu = nir_instr_as_alu(instr);
if (!ir3_supports_vectorized_nir_op(alu->op))
return 0;
return 4;
}
static void
rpt_list_split(struct list_head *list, struct list_head *at)
{