Mirror of https://gitlab.freedesktop.org/mesa/mesa.git
aco/isel: remove uniform reduce/scan optimization
This is now done in NIR, with the exception of exclusive min/max/and/or
scans. But those are not really useful, and if we ever come across them we
can optimize them in NIR using write_invocation_amd.

No Foz-DB changes on Navi21.

Acked-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38902>
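As a rough illustration of that suggested follow-up (not part of this commit), such a scan could be lowered in NIR the same way the deleted ACO path below handles it: keep the source value and write the operation's identity into the first active lane. A minimal nir_builder sketch, assuming the first_invocation and write_invocation_amd intrinsics and a caller-supplied identity constant (the helper name is hypothetical):

/* Sketch only: lower an exclusive min/max/and/or scan of a subgroup-uniform
 * value. Every lane except the first active one sees the uniform source;
 * the first active lane sees the operation's identity. */
#include "nir_builder.h"

static nir_def *
lower_uniform_exclusive_scan(nir_builder *b, nir_def *src, uint64_t identity)
{
   /* Index of the first active invocation in the subgroup. */
   nir_def *first_lane = nir_first_invocation(b);
   nir_def *ident = nir_imm_intN_t(b, identity, src->bit_size);
   /* write_invocation_amd(src, value, lane) returns src with `value`
    * written into lane `lane`. */
   return nir_write_invocation_amd(b, src, ident, first_lane);
}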
parent 81245e262f
commit a2b70ce4ec

2 changed files with 0 additions and 173 deletions
src/amd/compiler/aco_validate.cpp
@@ -784,11 +784,6 @@ validate_ir(Program* program)
                check(instr->definitions[0].regClass().type() == RegType::sgpr ||
                         program->wave_size == 32,
                      "The result of unclustered reductions must go into an SGPR.", instr.get());
-            else
-               check(instr->definitions[0].regClass().type() == RegType::vgpr,
-                     "The result of scans and clustered reductions must go into a VGPR.",
-                     instr.get());
-
             break;
          }
          case Format::SMEM: {
src/amd/compiler/aco_instruction_selection.cpp
@@ -3440,156 +3440,6 @@ emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
    bld.copy(dst, src);
 }
 
-void
-emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
-{
-   Builder bld(ctx->program, ctx->block);
-   Temp src_tmp = get_ssa_temp(ctx, src.ssa);
-
-   if (op == nir_op_fadd) {
-      src_tmp = as_vgpr(ctx, src_tmp);
-      Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
-                                      : dst.getTemp();
-
-      if (src.ssa->bit_size == 16) {
-         count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
-         bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
-      } else {
-         assert(src.ssa->bit_size == 32);
-         count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
-         bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
-      }
-
-      if (tmp != dst.getTemp())
-         bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
-
-      return;
-   }
-
-   if (dst.regClass() == s1)
-      src_tmp = bld.as_uniform(src_tmp);
-
-   if (op == nir_op_ixor && count.type() == RegType::sgpr)
-      count =
-         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
-   else if (op == nir_op_ixor)
-      count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
-
-   assert(dst.getTemp().type() == count.type());
-
-   if (nir_src_is_const(src)) {
-      uint32_t imm = nir_src_as_uint(src);
-      if (imm == 1 && dst.bytes() <= 2)
-         bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
-      else if (imm == 1)
-         bld.copy(dst, count);
-      else if (imm == 0)
-         bld.copy(dst, Operand::zero(dst.bytes()));
-      else if (count.type() == RegType::vgpr)
-         bld.v_mul_imm(dst, count, imm, true, true);
-      else if (imm == 0xffffffff)
-         bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
-      else if (util_is_power_of_two_or_zero(imm))
-         bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
-                  Operand::c32(ffs(imm) - 1u));
-      else
-         bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
-   } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
-      bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
-   } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
-      bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
-   } else if (dst.getTemp().type() == RegType::vgpr) {
-      bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
-   } else {
-      bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
-   }
-}
-
-bool
-emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
-{
-   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
-   if (op == nir_op_imul || op == nir_op_fmul)
-      return false;
-
-   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
-      Builder bld(ctx->program, ctx->block);
-      Definition dst(get_ssa_temp(ctx, &instr->def));
-      unsigned bit_size = instr->src[0].ssa->bit_size;
-      if (bit_size > 32)
-         return false;
-
-      Temp thread_count =
-         bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
-      set_wqm(ctx);
-
-      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
-   } else {
-      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
-   }
-
-   return true;
-}
-
-bool
-emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
-{
-   Builder bld(ctx->program, ctx->block);
-   Definition dst(get_ssa_temp(ctx, &instr->def));
-   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
-   bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
-
-   if (op == nir_op_imul || op == nir_op_fmul)
-      return false;
-
-   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
-      if (instr->src[0].ssa->bit_size > 32)
-         return false;
-
-      Temp packed_tid;
-      if (inc)
-         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
-      else
-         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
-      set_wqm(ctx);
-
-      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
-      return true;
-   }
-
-   assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
-          op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
-
-   if (inc) {
-      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
-      return true;
-   }
-
-   /* Copy the source and write the reduction operation identity to the first lane. */
-   Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
-   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-   ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
-   if (dst.bytes() == 8) {
-      Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
-      uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
-      uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
-
-      lo =
-         bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
-      hi =
-         bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
-      bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
-   } else {
-      uint32_t identity = get_reduction_identity(reduce_op, 0);
-      bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
-                    as_vgpr(ctx, src));
-   }
-
-   set_wqm(ctx);
-   return true;
-}
-
 Temp
 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
                      Definition dst, Temp src)
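For context on what the deleted emit_addition_uniform_reduce computed: when the reduced value is uniform across the subgroup, an iadd reduce or scan collapses to a multiply by a lane count. A plain-C sketch of those equivalences (illustrative only; assumes a 64-bit exec mask and lane < 64; ixor would additionally keep only the low bit of the count):

#include <stdint.h>

/* Active lanes strictly below `lane` -- what v_mbcnt computes. */
static uint32_t mbcnt(uint64_t exec_mask, unsigned lane)
{
   return (uint32_t)__builtin_popcountll(exec_mask & ((UINT64_C(1) << lane) - 1));
}

/* reduce(iadd) of a uniform v: v times the active-lane count (s_bcnt1 of exec). */
static uint32_t uniform_iadd_reduce(uint32_t v, uint64_t exec_mask)
{
   return v * (uint32_t)__builtin_popcountll(exec_mask);
}

/* exclusive_scan(iadd): v times the active lanes below us; the inclusive
 * scan is the same with mbcnt(...) + 1 (the extra Operand::c32(1u) passed
 * to emit_mbcnt in the deleted code above). */
static uint32_t uniform_iadd_exclusive_scan(uint32_t v, uint64_t exec_mask, unsigned lane)
{
   return v * mbcnt(exec_mask, lane);
}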
src/amd/compiler/aco_instruction_selection.cpp
@@ -4498,24 +4348,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
       const unsigned bit_size = instr->src[0].ssa->bit_size;
       assert(bit_size != 1);
 
-      if (!nir_src_is_divergent(&instr->src[0])) {
-         /* We use divergence analysis to assign the regclass, so check if it's
-          * working as expected */
-         ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
-         if (instr->intrinsic == nir_intrinsic_inclusive_scan ||
-             cluster_size != ctx->program->wave_size)
-            expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor ||
-                                 op == nir_op_imul || op == nir_op_fmul;
-         assert(instr->def.divergent == expected_divergent);
-
-         if (instr->intrinsic == nir_intrinsic_reduce) {
-            if (!instr->def.divergent && emit_uniform_reduce(ctx, instr))
-               break;
-         } else if (emit_uniform_scan(ctx, instr)) {
-            break;
-         }
-      }
-
       src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
       ReduceOp reduce_op = get_reduce_op(op, bit_size);
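The assertion block removed from visit_intrinsic encoded when one of these ops on a subgroup-uniform source still yields a divergent result. Restated as a stand-alone predicate (a sketch, not a Mesa helper):

#include <stdbool.h>

/* Idempotent ops (min/max/and/or) give every non-empty prefix the same
 * value, so inclusive scans and clustered reduces of them stay uniform.
 * add/xor/mul results depend on the prefix length, so they stay divergent.
 * Exclusive scans are always divergent: the first active lane gets the
 * identity instead of the source value. */
static bool
uniform_src_result_divergent(bool exclusive_scan, bool inclusive_scan,
                             bool clustered, bool op_is_add_xor_mul)
{
   if (inclusive_scan || clustered)
      return op_is_add_xor_mul;
   return exclusive_scan;
}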