mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 04:38:03 +02:00
ir3: Use scalar ALU instructions when possible
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22075>
This commit is contained in:
parent
32308fe9f1
commit
b4874aa5cf
2 changed files with 75 additions and 18 deletions
|
|
@ -385,6 +385,23 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
dst_sz = alu->def.num_components;
|
dst_sz = alu->def.num_components;
|
||||||
wrmask = (1 << dst_sz) - 1;
|
wrmask = (1 << dst_sz) - 1;
|
||||||
|
|
||||||
|
bool use_shared = !alu->def.divergent &&
|
||||||
|
ctx->compiler->has_scalar_alu &&
|
||||||
|
/* not ALU ops */
|
||||||
|
alu->op != nir_op_fddx &&
|
||||||
|
alu->op != nir_op_fddx_fine &&
|
||||||
|
alu->op != nir_op_fddx_coarse &&
|
||||||
|
alu->op != nir_op_fddy &&
|
||||||
|
alu->op != nir_op_fddy_fine &&
|
||||||
|
alu->op != nir_op_fddy_coarse &&
|
||||||
|
/* it probably isn't worth emulating these with scalar-only ops */
|
||||||
|
alu->op != nir_op_udot_4x8_uadd &&
|
||||||
|
alu->op != nir_op_udot_4x8_uadd_sat &&
|
||||||
|
alu->op != nir_op_sudot_4x8_iadd &&
|
||||||
|
alu->op != nir_op_sudot_4x8_iadd_sat &&
|
||||||
|
/* not supported in HW, we have to fall back to normal registers */
|
||||||
|
alu->op != nir_op_ffma;
|
||||||
|
|
||||||
dst = ir3_get_def(ctx, &alu->def, dst_sz);
|
dst = ir3_get_def(ctx, &alu->def, dst_sz);
|
||||||
|
|
||||||
/* Vectors are special in that they have non-scalarized writemasks,
|
/* Vectors are special in that they have non-scalarized writemasks,
|
||||||
|
|
@ -398,9 +415,9 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
for (int i = 0; i < info->num_inputs; i++) {
|
for (int i = 0; i < info->num_inputs; i++) {
|
||||||
nir_alu_src *asrc = &alu->src[i];
|
nir_alu_src *asrc = &alu->src[i];
|
||||||
|
|
||||||
src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
|
src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
|
||||||
if (!src[i])
|
if (!src[i])
|
||||||
src[i] = create_immed_typed(ctx->block, 0, dst_type);
|
src[i] = create_immed_typed_shared(ctx->block, 0, dst_type, use_shared);
|
||||||
dst[i] = ir3_MOV(b, src[i], dst_type);
|
dst[i] = ir3_MOV(b, src[i], dst_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -413,7 +430,8 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
*/
|
*/
|
||||||
if (alu->op == nir_op_mov) {
|
if (alu->op == nir_op_mov) {
|
||||||
nir_alu_src *asrc = &alu->src[0];
|
nir_alu_src *asrc = &alu->src[0];
|
||||||
struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
|
struct ir3_instruction *const *src0 =
|
||||||
|
ir3_get_src_shared(ctx, &asrc->src, use_shared);
|
||||||
|
|
||||||
for (unsigned i = 0; i < dst_sz; i++) {
|
for (unsigned i = 0; i < dst_sz; i++) {
|
||||||
if (wrmask & (1 << i)) {
|
if (wrmask & (1 << i)) {
|
||||||
|
|
@ -433,7 +451,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
for (int i = 0; i < info->num_inputs; i++) {
|
for (int i = 0; i < info->num_inputs; i++) {
|
||||||
nir_alu_src *asrc = &alu->src[i];
|
nir_alu_src *asrc = &alu->src[i];
|
||||||
|
|
||||||
src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
|
src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
|
||||||
bs[i] = nir_src_bit_size(asrc->src);
|
bs[i] = nir_src_bit_size(asrc->src);
|
||||||
|
|
||||||
compile_assert(ctx, src[i]);
|
compile_assert(ctx, src[i]);
|
||||||
|
|
@ -641,10 +659,21 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
|
dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
|
||||||
break;
|
break;
|
||||||
case nir_op_imadsh_mix16:
|
case nir_op_imadsh_mix16:
|
||||||
dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
|
if (use_shared) {
|
||||||
|
struct ir3_instruction *sixteen = create_immed_shared(b, 16, true);
|
||||||
|
struct ir3_instruction *src1 = ir3_SHR_B(b, src[1], 0, sixteen, 0);
|
||||||
|
struct ir3_instruction *mul = ir3_MULL_U(b, src[0], 0, src1, 0);
|
||||||
|
dst[0] = ir3_ADD_U(b, ir3_SHL_B(b, mul, 0, sixteen, 0), 0, src[2], 0);
|
||||||
|
} else {
|
||||||
|
dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case nir_op_imad24_ir3:
|
case nir_op_imad24_ir3:
|
||||||
dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
|
if (use_shared) {
|
||||||
|
dst[0] = ir3_ADD_U(b, ir3_MUL_U24(b, src[0], 0, src[1], 0), 0, src[2], 0);
|
||||||
|
} else {
|
||||||
|
dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case nir_op_imul:
|
case nir_op_imul:
|
||||||
compile_assert(ctx, alu->def.bit_size == 16);
|
compile_assert(ctx, alu->def.bit_size == 16);
|
||||||
|
|
@ -659,7 +688,8 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
case nir_op_inot:
|
case nir_op_inot:
|
||||||
if (bs[0] == 1) {
|
if (bs[0] == 1) {
|
||||||
struct ir3_instruction *one =
|
struct ir3_instruction *one =
|
||||||
create_immed_typed(ctx->block, 1, ctx->compiler->bool_type);
|
create_immed_typed_shared(ctx->block, 1, ctx->compiler->bool_type,
|
||||||
|
use_shared);
|
||||||
dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
|
dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
|
||||||
} else {
|
} else {
|
||||||
dst[0] = ir3_NOT_B(b, src[0], 0);
|
dst[0] = ir3_NOT_B(b, src[0], 0);
|
||||||
|
|
@ -755,8 +785,9 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
// support is in place, this should probably move to a NIR lowering pass:
|
// support is in place, this should probably move to a NIR lowering pass:
|
||||||
struct ir3_instruction *hi, *lo;
|
struct ir3_instruction *hi, *lo;
|
||||||
|
|
||||||
hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
|
hi = ir3_COV(b,
|
||||||
TYPE_U16);
|
ir3_SHR_B(b, src[0], 0, create_immed_shared(b, 16, use_shared), 0),
|
||||||
|
TYPE_U32, TYPE_U16);
|
||||||
lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
|
lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
|
||||||
|
|
||||||
hi = ir3_CBITS_B(b, hi, 0);
|
hi = ir3_CBITS_B(b, hi, 0);
|
||||||
|
|
@ -776,15 +807,17 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||||
case nir_op_ifind_msb: {
|
case nir_op_ifind_msb: {
|
||||||
struct ir3_instruction *cmp;
|
struct ir3_instruction *cmp;
|
||||||
dst[0] = ir3_CLZ_S(b, src[0], 0);
|
dst[0] = ir3_CLZ_S(b, src[0], 0);
|
||||||
cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
|
cmp = ir3_CMPS_S(b, dst[0], 0, create_immed_shared(b, 0, use_shared), 0);
|
||||||
cmp->cat2.condition = IR3_COND_GE;
|
cmp->cat2.condition = IR3_COND_GE;
|
||||||
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
|
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
|
||||||
|
dst[0], 0),
|
||||||
0, cmp, 0, dst[0], 0);
|
0, cmp, 0, dst[0], 0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_op_ufind_msb:
|
case nir_op_ufind_msb:
|
||||||
dst[0] = ir3_CLZ_B(b, src[0], 0);
|
dst[0] = ir3_CLZ_B(b, src[0], 0);
|
||||||
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
|
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
|
||||||
|
dst[0], 0),
|
||||||
0, src[0], 0, dst[0], 0);
|
0, src[0], 0, dst[0], 0);
|
||||||
break;
|
break;
|
||||||
case nir_op_find_lsb:
|
case nir_op_find_lsb:
|
||||||
|
|
@ -881,6 +914,12 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||||
ctx->so->bindless_ubo = true;
|
ctx->so->bindless_ubo = true;
|
||||||
ir3_handle_nonuniform(ldc, intr);
|
ir3_handle_nonuniform(ldc, intr);
|
||||||
|
|
||||||
|
if (!intr->def.divergent &&
|
||||||
|
ctx->compiler->has_scalar_alu) {
|
||||||
|
ldc->dsts[0]->flags |= IR3_REG_SHARED;
|
||||||
|
ldc->flags |= IR3_INSTR_U;
|
||||||
|
}
|
||||||
|
|
||||||
ir3_split_dest(b, dst, ldc, 0, ncomp);
|
ir3_split_dest(b, dst, ldc, 0, ncomp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2205,12 +2244,20 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||||
intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
|
intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
src = ir3_get_src(ctx, &intr->src[0]);
|
src = ctx->compiler->has_scalar_alu ?
|
||||||
|
ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
|
||||||
|
ir3_get_src(ctx, &intr->src[0]);
|
||||||
for (int i = 0; i < dest_components; i++) {
|
for (int i = 0; i < dest_components; i++) {
|
||||||
dst[i] = create_uniform_indirect(
|
dst[i] = create_uniform_indirect(
|
||||||
b, idx + i,
|
b, idx + i,
|
||||||
intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32,
|
intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32,
|
||||||
ir3_get_addr0(ctx, src[0], 1));
|
ir3_get_addr0(ctx, src[0], 1));
|
||||||
|
/* Since this may not be foldable into conversions into shared
|
||||||
|
* registers, manually make it shared. Optimizations can undo this if
|
||||||
|
* the user can't use shared regs.
|
||||||
|
*/
|
||||||
|
if (ctx->compiler->has_scalar_alu && !intr->def.divergent)
|
||||||
|
dst[i]->dsts[0]->flags |= IR3_REG_SHARED;
|
||||||
}
|
}
|
||||||
/* NOTE: if relative addressing is used, we set
|
/* NOTE: if relative addressing is used, we set
|
||||||
* constlen in the compiler (to worst-case value)
|
* constlen in the compiler (to worst-case value)
|
||||||
|
|
@ -2767,7 +2814,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||||
unsigned dst_hi = dst >> 8;
|
unsigned dst_hi = dst >> 8;
|
||||||
|
|
||||||
struct ir3_instruction *src =
|
struct ir3_instruction *src =
|
||||||
ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), components);
|
ir3_create_collect(b, ir3_get_src_shared(ctx, &intr->src[0],
|
||||||
|
ctx->compiler->has_scalar_alu),
|
||||||
|
components);
|
||||||
struct ir3_instruction *a1 = NULL;
|
struct ir3_instruction *a1 = NULL;
|
||||||
if (dst_hi) {
|
if (dst_hi) {
|
||||||
/* Encode only the high part of the destination in a1.x to increase the
|
/* Encode only the high part of the destination in a1.x to increase the
|
||||||
|
|
@ -3565,6 +3614,10 @@ emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
|
||||||
__ssa_dst(phi);
|
__ssa_dst(phi);
|
||||||
phi->phi.nphi = nphi;
|
phi->phi.nphi = nphi;
|
||||||
|
|
||||||
|
if (ctx->compiler->has_scalar_alu &&
|
||||||
|
!nphi->def.divergent)
|
||||||
|
phi->dsts[0]->flags |= IR3_REG_SHARED;
|
||||||
|
|
||||||
dst[0] = phi;
|
dst[0] = phi;
|
||||||
|
|
||||||
ir3_put_def(ctx, &nphi->def);
|
ir3_put_def(ctx, &nphi->def);
|
||||||
|
|
@ -3603,7 +3656,9 @@ read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
|
||||||
/* We need to insert the move at the end of the block */
|
/* We need to insert the move at the end of the block */
|
||||||
struct ir3_block *old_block = ctx->block;
|
struct ir3_block *old_block = ctx->block;
|
||||||
ctx->block = blk;
|
ctx->block = blk;
|
||||||
struct ir3_instruction *src = ir3_get_src(ctx, &nsrc->src)[0];
|
struct ir3_instruction *src =
|
||||||
|
ir3_get_src_shared(ctx, &nsrc->src,
|
||||||
|
phi->dsts[0]->flags & IR3_REG_SHARED)[0];
|
||||||
ctx->block = old_block;
|
ctx->block = old_block;
|
||||||
return src;
|
return src;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -423,6 +423,7 @@ create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
|
||||||
struct ir3_instruction *instr, *immed;
|
struct ir3_instruction *instr, *immed;
|
||||||
|
|
||||||
instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
|
instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
|
||||||
|
bool shared = (src->dsts[0]->flags & IR3_REG_SHARED);
|
||||||
|
|
||||||
switch (align) {
|
switch (align) {
|
||||||
case 1:
|
case 1:
|
||||||
|
|
@ -430,17 +431,17 @@ create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
/* src *= 2 => src <<= 1: */
|
/* src *= 2 => src <<= 1: */
|
||||||
immed = create_immed_typed(block, 1, TYPE_S16);
|
immed = create_immed_typed_shared(block, 1, TYPE_S16, shared);
|
||||||
instr = ir3_SHL_B(block, instr, 0, immed, 0);
|
instr = ir3_SHL_B(block, instr, 0, immed, 0);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
/* src *= 3: */
|
/* src *= 3: */
|
||||||
immed = create_immed_typed(block, 3, TYPE_S16);
|
immed = create_immed_typed_shared(block, 3, TYPE_S16, shared);
|
||||||
instr = ir3_MULL_U(block, instr, 0, immed, 0);
|
instr = ir3_MULL_U(block, instr, 0, immed, 0);
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
/* src *= 4 => src <<= 2: */
|
/* src *= 4 => src <<= 2: */
|
||||||
immed = create_immed_typed(block, 2, TYPE_S16);
|
immed = create_immed_typed_shared(block, 2, TYPE_S16, shared);
|
||||||
instr = ir3_SHL_B(block, instr, 0, immed, 0);
|
instr = ir3_SHL_B(block, instr, 0, immed, 0);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|
@ -452,6 +453,7 @@ create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
|
||||||
|
|
||||||
instr = ir3_MOV(block, instr, TYPE_S16);
|
instr = ir3_MOV(block, instr, TYPE_S16);
|
||||||
instr->dsts[0]->num = regid(REG_A0, 0);
|
instr->dsts[0]->num = regid(REG_A0, 0);
|
||||||
|
instr->dsts[0]->flags &= ~IR3_REG_SHARED;
|
||||||
|
|
||||||
return instr;
|
return instr;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue