mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-22 13:30:12 +01:00
aco: don't insert a copy when emitting p_wqm
Totals from 351 (0.46% of 76572) affected shaders: (GFX11)
Instrs: 709202 -> 709600 (+0.06%); split: -0.02%, +0.08%
CodeSize: 3606364 -> 3608040 (+0.05%); split: -0.01%, +0.06%
Latency: 3589841 -> 3590756 (+0.03%); split: -0.01%, +0.03%
InvThroughput: 463303 -> 463324 (+0.00%)
SClause: 28147 -> 28201 (+0.19%); split: -0.02%, +0.22%
Copies: 43243 -> 43204 (-0.09%); split: -0.24%, +0.15%
PreSGPRs: 21028 -> 21042 (+0.07%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25038>
This commit is contained in:
parent
040142684c
commit
28904839da
1 changed file with 118 additions and 148 deletions
|
|
@ -168,20 +168,13 @@ emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base =
|
||||||
return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
|
return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp
|
inline void
|
||||||
emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
|
emit_wqm(Builder& bld, bool program_needs_wqm = false)
|
||||||
{
|
{
|
||||||
if (dst.id())
|
|
||||||
bld.copy(Definition(dst), src);
|
|
||||||
else
|
|
||||||
dst = src;
|
|
||||||
|
|
||||||
if (bld.program->stage == fragment_fs) {
|
if (bld.program->stage == fragment_fs) {
|
||||||
bld.pseudo(aco_opcode::p_wqm);
|
bld.pseudo(aco_opcode::p_wqm);
|
||||||
bld.program->needs_wqm |= program_needs_wqm;
|
bld.program->needs_wqm |= program_needs_wqm;
|
||||||
}
|
}
|
||||||
|
|
||||||
return dst;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static Temp
|
static Temp
|
||||||
|
|
@ -3836,13 +3829,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
|
||||||
Temp tmp;
|
Temp tmp;
|
||||||
if (ctx->program->gfx_level >= GFX8) {
|
if (ctx->program->gfx_level >= GFX8) {
|
||||||
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
|
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
|
||||||
tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
|
bld.vop2_dpp(aco_opcode::v_sub_f32, Definition(dst), src, tl, dpp_ctrl2);
|
||||||
} else {
|
} else {
|
||||||
Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
|
Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
|
||||||
Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
|
Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
|
||||||
tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
|
bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
|
||||||
}
|
}
|
||||||
emit_wqm(bld, tmp, dst, true);
|
emit_wqm(bld, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: isel_err(&instr->instr, "Unknown NIR ALU instr");
|
default: isel_err(&instr->instr, "Unknown NIR ALU instr");
|
||||||
|
|
@ -5315,15 +5308,13 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
|
||||||
Temp p10 =
|
Temp p10 =
|
||||||
bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
|
bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
|
||||||
res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
|
res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
|
||||||
|
emit_extract_vector(ctx, res, 0, dst);
|
||||||
} else {
|
} else {
|
||||||
Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
|
Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
|
||||||
res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), p, coord2, p10);
|
bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
|
||||||
}
|
}
|
||||||
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
|
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
|
||||||
if (dst.regClass() != v2b)
|
emit_wqm(bld, true);
|
||||||
emit_wqm(bld, res, dst, true);
|
|
||||||
else
|
|
||||||
emit_extract_vector(ctx, emit_wqm(bld, res, Temp(0, s1), true), 0, dst);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -5389,13 +5380,14 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
|
||||||
} else {
|
} else {
|
||||||
Temp p =
|
Temp p =
|
||||||
bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
|
bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
|
||||||
|
if (dst.regClass() == v2b) {
|
||||||
Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
|
Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
|
||||||
|
emit_extract_vector(ctx, res, 0, dst);
|
||||||
|
} else {
|
||||||
|
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
|
||||||
|
}
|
||||||
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
|
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
|
||||||
if (dst.regClass() != v2b)
|
emit_wqm(bld, true);
|
||||||
emit_wqm(bld, res, dst, true);
|
|
||||||
else
|
|
||||||
emit_extract_vector(ctx, emit_wqm(bld, res, Temp(0, s1), true), 0, dst);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
|
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
|
||||||
|
|
@ -5943,12 +5935,11 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
|
||||||
|
|
||||||
bool has_dst = dst.id() != 0;
|
bool has_dst = dst.id() != 0;
|
||||||
assert(!needs_wqm || has_dst);
|
assert(!needs_wqm || has_dst);
|
||||||
Temp tmp_dst = needs_wqm ? bld.tmp(dst.regClass()) : dst;
|
|
||||||
|
|
||||||
aco_ptr<MIMG_instruction> mimg{
|
aco_ptr<MIMG_instruction> mimg{
|
||||||
create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
|
create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
|
||||||
if (has_dst)
|
if (has_dst)
|
||||||
mimg->definitions[0] = Definition(tmp_dst);
|
mimg->definitions[0] = Definition(dst);
|
||||||
mimg->operands[0] = Operand(rsrc);
|
mimg->operands[0] = Operand(rsrc);
|
||||||
mimg->operands[1] = samp;
|
mimg->operands[1] = samp;
|
||||||
mimg->operands[2] = vdata;
|
mimg->operands[2] = vdata;
|
||||||
|
|
@ -5959,7 +5950,7 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
|
||||||
MIMG_instruction* res = mimg.get();
|
MIMG_instruction* res = mimg.get();
|
||||||
bld.insert(std::move(mimg));
|
bld.insert(std::move(mimg));
|
||||||
if (needs_wqm)
|
if (needs_wqm)
|
||||||
emit_wqm(bld, tmp_dst, dst, true);
|
emit_wqm(bld, true);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -7528,24 +7519,25 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp
|
void
|
||||||
emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
|
emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src, Temp dst)
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
assert(dst.regClass() == bld.lm);
|
||||||
|
|
||||||
if (cluster_size == 1) {
|
if (cluster_size == 1) {
|
||||||
return src;
|
bld.copy(Definition(dst), src);
|
||||||
}
|
}
|
||||||
if (op == nir_op_iand && cluster_size == 4) {
|
if (op == nir_op_iand && cluster_size == 4) {
|
||||||
/* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */
|
/* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */
|
||||||
Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
|
Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
|
||||||
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
|
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
|
||||||
return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
|
bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc),
|
||||||
bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
|
bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
|
||||||
} else if (op == nir_op_ior && cluster_size == 4) {
|
} else if (op == nir_op_ior && cluster_size == 4) {
|
||||||
/* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
|
/* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
|
||||||
return bld.sop1(
|
bld.sop1(
|
||||||
Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
|
Builder::s_wqm, Definition(dst), bld.def(s1, scc),
|
||||||
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
|
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
|
||||||
} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
|
} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
|
||||||
/* subgroupAnd(val) -> (~val & exec) == 0 */
|
/* subgroupAnd(val) -> (~val & exec) == 0 */
|
||||||
|
|
@ -7553,15 +7545,15 @@ emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp sr
|
||||||
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
|
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
|
||||||
.def(1)
|
.def(1)
|
||||||
.getTemp();
|
.getTemp();
|
||||||
Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
|
Temp cond = bool_to_vector_condition(ctx, tmp);
|
||||||
return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
|
bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
|
||||||
} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
|
} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
|
||||||
/* subgroupOr(val) -> (val & exec) != 0 */
|
/* subgroupOr(val) -> (val & exec) != 0 */
|
||||||
Temp tmp =
|
Temp tmp =
|
||||||
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
|
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
|
||||||
.def(1)
|
.def(1)
|
||||||
.getTemp();
|
.getTemp();
|
||||||
return bool_to_vector_condition(ctx, tmp);
|
bool_to_vector_condition(ctx, tmp, dst);
|
||||||
} else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
|
} else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
|
||||||
/* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
|
/* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
|
||||||
Temp tmp =
|
Temp tmp =
|
||||||
|
|
@ -7570,7 +7562,7 @@ emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp sr
|
||||||
tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
|
tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
|
||||||
.def(1)
|
.def(1)
|
||||||
.getTemp();
|
.getTemp();
|
||||||
return bool_to_vector_condition(ctx, tmp);
|
bool_to_vector_condition(ctx, tmp, dst);
|
||||||
} else {
|
} else {
|
||||||
/* subgroupClustered{And,Or,Xor}(val, n):
|
/* subgroupClustered{And,Or,Xor}(val, n):
|
||||||
* lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
|
* lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
|
||||||
|
|
@ -7607,22 +7599,19 @@ emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp sr
|
||||||
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
|
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
|
||||||
|
|
||||||
if (op == nir_op_iand) {
|
if (op == nir_op_iand) {
|
||||||
return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::c32(cluster_mask),
|
bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::c32(cluster_mask), tmp);
|
||||||
tmp);
|
|
||||||
} else if (op == nir_op_ior) {
|
} else if (op == nir_op_ior) {
|
||||||
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
|
||||||
} else if (op == nir_op_ixor) {
|
} else if (op == nir_op_ixor) {
|
||||||
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
|
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
|
||||||
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
|
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
|
||||||
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
|
||||||
}
|
}
|
||||||
assert(false);
|
|
||||||
return Temp();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp
|
void
|
||||||
emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
|
emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src, Temp dst)
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
assert(src.regClass() == bld.lm);
|
assert(src.regClass() == bld.lm);
|
||||||
|
|
@ -7640,19 +7629,16 @@ emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
|
||||||
Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
|
Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
|
||||||
|
|
||||||
if (op == nir_op_iand)
|
if (op == nir_op_iand)
|
||||||
return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), mbcnt);
|
bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), mbcnt);
|
||||||
else if (op == nir_op_ior)
|
else if (op == nir_op_ior)
|
||||||
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), mbcnt);
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), mbcnt);
|
||||||
else if (op == nir_op_ixor)
|
else if (op == nir_op_ixor)
|
||||||
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(),
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(),
|
||||||
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
|
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
|
||||||
|
|
||||||
assert(false);
|
|
||||||
return Temp();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp
|
void
|
||||||
emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
|
emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src, Temp dst)
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
|
||||||
|
|
@ -7660,16 +7646,14 @@ emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
|
||||||
* subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
|
* subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
|
||||||
* subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
|
* subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
|
||||||
*/
|
*/
|
||||||
Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
|
Temp tmp = bld.tmp(bld.lm);
|
||||||
|
emit_boolean_exclusive_scan(ctx, op, src, tmp);
|
||||||
if (op == nir_op_iand)
|
if (op == nir_op_iand)
|
||||||
return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
|
bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, src);
|
||||||
else if (op == nir_op_ior)
|
else if (op == nir_op_ior)
|
||||||
return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
|
bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), tmp, src);
|
||||||
else if (op == nir_op_ixor)
|
else if (op == nir_op_ixor)
|
||||||
return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
|
bld.sop2(Builder::s_xor, Definition(dst), bld.def(s1, scc), tmp, src);
|
||||||
|
|
||||||
assert(false);
|
|
||||||
return Temp();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ReduceOp
|
ReduceOp
|
||||||
|
|
@ -7790,7 +7774,7 @@ emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
|
|
||||||
Temp thread_count =
|
Temp thread_count =
|
||||||
bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
|
bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
|
||||||
thread_count = emit_wqm(bld, thread_count, Temp(0, s1), nir_intrinsic_include_helpers(instr));
|
emit_wqm(bld, nir_intrinsic_include_helpers(instr));
|
||||||
|
|
||||||
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
|
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -7820,7 +7804,7 @@ emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
|
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
|
||||||
else
|
else
|
||||||
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
|
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
|
||||||
packed_tid = emit_wqm(bld, packed_tid);
|
emit_wqm(bld);
|
||||||
|
|
||||||
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
|
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -7855,6 +7839,7 @@ emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
as_vgpr(ctx, src));
|
as_vgpr(ctx, src));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
emit_wqm(bld);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -7915,14 +7900,17 @@ emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp
|
Temp
|
||||||
inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Temp scan, Temp src)
|
inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
|
||||||
|
Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
|
||||||
|
bld.def(dst.regClass()), src);
|
||||||
|
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case iadd8:
|
case iadd8:
|
||||||
case iadd16:
|
case iadd16:
|
||||||
case iadd32: return bld.vsub32(bld.def(scan.regClass()), scan, src);
|
case iadd32: return bld.vsub32(dst, scan, src);
|
||||||
case ixor64:
|
case ixor64:
|
||||||
case iadd64: {
|
case iadd64: {
|
||||||
Temp src00 = bld.tmp(v1);
|
Temp src00 = bld.tmp(v1);
|
||||||
|
|
@ -7941,11 +7929,11 @@ inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Temp scan, Temp src)
|
||||||
bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
|
bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
|
||||||
bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
|
bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
|
||||||
}
|
}
|
||||||
return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lower, upper);
|
return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
|
||||||
}
|
}
|
||||||
case ixor8:
|
case ixor8:
|
||||||
case ixor16:
|
case ixor16:
|
||||||
case ixor32: return bld.vop2(aco_opcode::v_xor_b32, bld.def(scan.regClass()), scan, src);
|
case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
|
||||||
default: unreachable("Unsupported op");
|
default: unreachable("Unsupported op");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -7991,11 +7979,8 @@ emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
|
||||||
Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
|
Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
|
||||||
tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
|
tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
|
||||||
tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
|
tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
|
||||||
Temp wqm1 = bld.tmp(v1);
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
|
||||||
emit_wqm(bld, tmp1, wqm1, true);
|
emit_wqm(bld, true);
|
||||||
Temp wqm2 = bld.tmp(v1);
|
|
||||||
emit_wqm(bld, tmp2, wqm2, true);
|
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -8314,14 +8299,14 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
|
|
||||||
/* Make sure that all inactive lanes return zero.
|
/* Make sure that all inactive lanes return zero.
|
||||||
* Value-numbering might remove the comparison above */
|
* Value-numbering might remove the comparison above */
|
||||||
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
|
||||||
|
src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
|
||||||
if (dst.size() != bld.lm.size()) {
|
if (dst.size() != bld.lm.size()) {
|
||||||
/* Wave32 with ballot size set to 64 */
|
/* Wave32 with ballot size set to 64 */
|
||||||
src =
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
|
||||||
bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
emit_wqm(bld, src, dst);
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_shuffle:
|
case nir_intrinsic_shuffle:
|
||||||
|
|
@ -8341,25 +8326,26 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
|
|
||||||
if (src.regClass() == v1b || src.regClass() == v2b) {
|
if (src.regClass() == v1b || src.regClass() == v2b) {
|
||||||
Temp tmp = bld.tmp(v1);
|
Temp tmp = bld.tmp(v1);
|
||||||
tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
|
tmp = emit_bpermute(ctx, bld, tid, src);
|
||||||
if (dst.type() == RegType::vgpr)
|
if (dst.type() == RegType::vgpr)
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
|
bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
|
||||||
bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
|
bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
|
||||||
else
|
else
|
||||||
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
|
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
|
||||||
} else if (src.regClass() == v1) {
|
} else if (src.regClass() == v1) {
|
||||||
emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
|
Temp tmp = emit_bpermute(ctx, bld, tid, src);
|
||||||
|
bld.copy(Definition(dst), tmp);
|
||||||
} else if (src.regClass() == v2) {
|
} else if (src.regClass() == v2) {
|
||||||
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
||||||
lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
|
lo = emit_bpermute(ctx, bld, tid, lo);
|
||||||
hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
|
hi = emit_bpermute(ctx, bld, tid, hi);
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
||||||
emit_split_vector(ctx, dst, 2);
|
emit_split_vector(ctx, dst, 2);
|
||||||
} else if (instr->def.bit_size == 1 && tid.regClass() == s1) {
|
} else if (instr->def.bit_size == 1 && tid.regClass() == s1) {
|
||||||
assert(src.regClass() == bld.lm);
|
assert(src.regClass() == bld.lm);
|
||||||
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
|
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
|
||||||
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
|
bool_to_vector_condition(ctx, tmp, dst);
|
||||||
} else if (instr->def.bit_size == 1 && tid.regClass() == v1) {
|
} else if (instr->def.bit_size == 1 && tid.regClass() == v1) {
|
||||||
assert(src.regClass() == bld.lm);
|
assert(src.regClass() == bld.lm);
|
||||||
Temp tmp;
|
Temp tmp;
|
||||||
|
|
@ -8371,11 +8357,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
|
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
|
||||||
tmp = emit_extract_vector(ctx, tmp, 0, v1);
|
tmp = emit_extract_vector(ctx, tmp, 0, v1);
|
||||||
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
|
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
|
||||||
emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
|
||||||
dst);
|
|
||||||
} else {
|
} else {
|
||||||
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
|
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
|
||||||
}
|
}
|
||||||
|
emit_wqm(bld);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
@ -8388,22 +8374,23 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
|
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
|
||||||
Temp dst = get_ssa_temp(ctx, &instr->def);
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
||||||
if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
|
if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
|
||||||
emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
|
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
|
||||||
} else if (src.regClass() == v2) {
|
} else if (src.regClass() == v2) {
|
||||||
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
||||||
lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
|
lo = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo);
|
||||||
hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
|
hi = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi);
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
||||||
emit_split_vector(ctx, dst, 2);
|
emit_split_vector(ctx, dst, 2);
|
||||||
} else if (instr->def.bit_size == 1) {
|
} else if (instr->def.bit_size == 1) {
|
||||||
assert(src.regClass() == bld.lm);
|
assert(src.regClass() == bld.lm);
|
||||||
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
|
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
|
||||||
bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
|
bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
|
||||||
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
|
bool_to_vector_condition(ctx, tmp, dst);
|
||||||
} else {
|
} else {
|
||||||
bld.copy(Definition(dst), src);
|
bld.copy(Definition(dst), src);
|
||||||
}
|
}
|
||||||
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_vote_all: {
|
case nir_intrinsic_vote_all: {
|
||||||
|
|
@ -8416,8 +8403,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
|
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
|
||||||
.def(1)
|
.def(1)
|
||||||
.getTemp();
|
.getTemp();
|
||||||
Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
|
Temp cond = bool_to_vector_condition(ctx, tmp);
|
||||||
bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
|
bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
|
||||||
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_vote_any: {
|
case nir_intrinsic_vote_any: {
|
||||||
|
|
@ -8427,7 +8415,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
assert(dst.regClass() == bld.lm);
|
assert(dst.regClass() == bld.lm);
|
||||||
|
|
||||||
Temp tmp = bool_to_scalar_condition(ctx, src);
|
Temp tmp = bool_to_scalar_condition(ctx, src);
|
||||||
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
|
bool_to_vector_condition(ctx, tmp, dst);
|
||||||
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_reduce:
|
case nir_intrinsic_reduce:
|
||||||
|
|
@ -8470,15 +8459,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
|
assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
|
||||||
|
|
||||||
switch (instr->intrinsic) {
|
switch (instr->intrinsic) {
|
||||||
case nir_intrinsic_reduce:
|
case nir_intrinsic_reduce: emit_boolean_reduce(ctx, op, cluster_size, src, dst); break;
|
||||||
emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst, create_helpers);
|
case nir_intrinsic_exclusive_scan: emit_boolean_exclusive_scan(ctx, op, src, dst); break;
|
||||||
break;
|
case nir_intrinsic_inclusive_scan: emit_boolean_inclusive_scan(ctx, op, src, dst); break;
|
||||||
case nir_intrinsic_exclusive_scan:
|
|
||||||
emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
|
|
||||||
break;
|
|
||||||
case nir_intrinsic_inclusive_scan:
|
|
||||||
emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
|
|
||||||
break;
|
|
||||||
default: assert(false);
|
default: assert(false);
|
||||||
}
|
}
|
||||||
} else if (cluster_size == 1) {
|
} else if (cluster_size == 1) {
|
||||||
|
|
@ -8503,16 +8486,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
(op == nir_op_iadd || op == nir_op_ixor) &&
|
(op == nir_op_iadd || op == nir_op_ixor) &&
|
||||||
dst.type() == RegType::vgpr;
|
dst.type() == RegType::vgpr;
|
||||||
if (use_inclusive_for_exclusive)
|
if (use_inclusive_for_exclusive)
|
||||||
aco_op = aco_opcode::p_inclusive_scan;
|
inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
|
||||||
|
else
|
||||||
Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
|
emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
|
||||||
bld.def(dst.regClass()), src);
|
|
||||||
|
|
||||||
if (use_inclusive_for_exclusive)
|
|
||||||
tmp_dst = inclusive_scan_to_exclusive(ctx, reduce_op, tmp_dst, src);
|
|
||||||
|
|
||||||
emit_wqm(bld, tmp_dst, dst, create_helpers);
|
|
||||||
}
|
}
|
||||||
|
emit_wqm(bld, create_helpers);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_quad_broadcast:
|
case nir_intrinsic_quad_broadcast:
|
||||||
|
|
@ -8524,6 +8502,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
|
|
||||||
if (!instr->def.divergent) {
|
if (!instr->def.divergent) {
|
||||||
emit_uniform_subgroup(ctx, instr, src);
|
emit_uniform_subgroup(ctx, instr, src);
|
||||||
|
emit_wqm(bld, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -8548,7 +8527,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp dst = get_ssa_temp(ctx, &instr->def);
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
||||||
Temp tmp(dst);
|
|
||||||
|
|
||||||
/* Setup source. */
|
/* Setup source. */
|
||||||
if (bool_use_valu)
|
if (bool_use_valu)
|
||||||
|
|
@ -8557,15 +8535,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
else if (instr->def.bit_size != 1)
|
else if (instr->def.bit_size != 1)
|
||||||
src = as_vgpr(ctx, src);
|
src = as_vgpr(ctx, src);
|
||||||
|
|
||||||
/* Setup temporary destination. */
|
|
||||||
if (bool_use_valu)
|
|
||||||
tmp = bld.tmp(v1);
|
|
||||||
else if (ctx->program->stage == fragment_fs)
|
|
||||||
tmp = bld.tmp(dst.regClass());
|
|
||||||
|
|
||||||
if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
|
if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
|
||||||
/* Special case for quad broadcast using SALU only. */
|
/* Special case for quad broadcast using SALU only. */
|
||||||
assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);
|
assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
|
||||||
|
|
||||||
uint32_t half_mask = 0x11111111u << lane;
|
uint32_t half_mask = 0x11111111u << lane;
|
||||||
Operand mask_tmp = bld.lm.bytes() == 4
|
Operand mask_tmp = bld.lm.bytes() == 4
|
||||||
|
|
@ -8576,10 +8548,10 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
src =
|
src =
|
||||||
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
||||||
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
|
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
|
||||||
bld.sop1(Builder::s_wqm, Definition(tmp), src);
|
bld.sop1(Builder::s_wqm, Definition(dst), src);
|
||||||
} else if (instr->def.bit_size <= 32 || bool_use_valu) {
|
} else if (instr->def.bit_size <= 32 || bool_use_valu) {
|
||||||
unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
|
unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
|
||||||
Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);
|
Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
|
||||||
|
|
||||||
if (ctx->program->gfx_level >= GFX8)
|
if (ctx->program->gfx_level >= GFX8)
|
||||||
bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
|
bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
|
||||||
|
|
@ -8587,8 +8559,10 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
|
bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
|
||||||
|
|
||||||
if (excess_bytes)
|
if (excess_bytes)
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
|
bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
|
||||||
bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
|
bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
|
||||||
|
if (bool_use_valu)
|
||||||
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
|
||||||
} else if (instr->def.bit_size == 64) {
|
} else if (instr->def.bit_size == 64) {
|
||||||
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
||||||
|
|
@ -8601,20 +8575,14 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
|
hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
||||||
emit_split_vector(ctx, tmp, 2);
|
emit_split_vector(ctx, dst, 2);
|
||||||
} else {
|
} else {
|
||||||
isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
|
isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tmp.id() != dst.id()) {
|
|
||||||
if (bool_use_valu)
|
|
||||||
tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
|
|
||||||
|
|
||||||
/* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
|
/* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
|
||||||
emit_wqm(bld, tmp, dst, true);
|
emit_wqm(bld, true);
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_masked_swizzle_amd: {
|
case nir_intrinsic_masked_swizzle_amd: {
|
||||||
|
|
@ -8634,26 +8602,26 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
|
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
|
||||||
Operand::c32(-1), src);
|
Operand::c32(-1), src);
|
||||||
src = emit_masked_swizzle(ctx, bld, src, mask);
|
src = emit_masked_swizzle(ctx, bld, src, mask);
|
||||||
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
|
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
|
||||||
emit_wqm(bld, tmp, dst);
|
|
||||||
} else if (dst.regClass() == v1b) {
|
} else if (dst.regClass() == v1b) {
|
||||||
Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
|
Temp tmp = emit_masked_swizzle(ctx, bld, src, mask);
|
||||||
emit_extract_vector(ctx, tmp, 0, dst);
|
emit_extract_vector(ctx, tmp, 0, dst);
|
||||||
} else if (dst.regClass() == v2b) {
|
} else if (dst.regClass() == v2b) {
|
||||||
Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
|
Temp tmp = emit_masked_swizzle(ctx, bld, src, mask);
|
||||||
emit_extract_vector(ctx, tmp, 0, dst);
|
emit_extract_vector(ctx, tmp, 0, dst);
|
||||||
} else if (dst.regClass() == v1) {
|
} else if (dst.regClass() == v1) {
|
||||||
emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
|
bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask));
|
||||||
} else if (dst.regClass() == v2) {
|
} else if (dst.regClass() == v2) {
|
||||||
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
||||||
lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
|
lo = emit_masked_swizzle(ctx, bld, lo, mask);
|
||||||
hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
|
hi = emit_masked_swizzle(ctx, bld, hi, mask);
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
||||||
emit_split_vector(ctx, dst, 2);
|
emit_split_vector(ctx, dst, 2);
|
||||||
} else {
|
} else {
|
||||||
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
|
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
|
||||||
}
|
}
|
||||||
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_write_invocation_amd: {
|
case nir_intrinsic_write_invocation_amd: {
|
||||||
|
|
@ -8663,14 +8631,14 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
Temp dst = get_ssa_temp(ctx, &instr->def);
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
||||||
if (dst.regClass() == v1) {
|
if (dst.regClass() == v1) {
|
||||||
/* src2 is ignored for writelane. RA assigns the same reg for dst */
|
/* src2 is ignored for writelane. RA assigns the same reg for dst */
|
||||||
emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
|
bld.writelane(Definition(dst), val, lane, src);
|
||||||
} else if (dst.regClass() == v2) {
|
} else if (dst.regClass() == v2) {
|
||||||
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
|
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
|
||||||
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
|
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
|
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
|
||||||
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
|
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
|
||||||
Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
|
Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_hi);
|
||||||
Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
|
Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
||||||
emit_split_vector(ctx, dst, 2);
|
emit_split_vector(ctx, dst, 2);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -8684,8 +8652,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
Temp dst = get_ssa_temp(ctx, &instr->def);
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
||||||
/* Fit 64-bit mask for wave32 */
|
/* Fit 64-bit mask for wave32 */
|
||||||
src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
|
src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
|
||||||
Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
|
emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
|
||||||
emit_wqm(bld, wqm_tmp, dst);
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_lane_permute_16_amd: {
|
case nir_intrinsic_lane_permute_16_amd: {
|
||||||
|
|
@ -8757,15 +8725,16 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_first_invocation: {
|
case nir_intrinsic_first_invocation: {
|
||||||
emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
|
bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
|
||||||
get_ssa_temp(ctx, &instr->def));
|
Operand(exec, bld.lm));
|
||||||
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_last_invocation: {
|
case nir_intrinsic_last_invocation: {
|
||||||
Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
|
Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
|
||||||
Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
|
bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
|
||||||
Operand::c32(ctx->program->wave_size - 1u), flbit);
|
Operand::c32(ctx->program->wave_size - 1u), flbit);
|
||||||
emit_wqm(bld, last, get_ssa_temp(ctx, &instr->def));
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_elect: {
|
case nir_intrinsic_elect: {
|
||||||
|
|
@ -8773,8 +8742,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
* Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
|
* Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
|
||||||
* two p_elect with different exec masks as the same.
|
* two p_elect with different exec masks as the same.
|
||||||
*/
|
*/
|
||||||
Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
|
bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
|
||||||
emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->def));
|
Operand(exec, bld.lm));
|
||||||
|
emit_wqm(bld);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_shader_clock: {
|
case nir_intrinsic_shader_clock: {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue