mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-22 00:30:13 +01:00
aco/gfx11: fix FS input loads in quad-divergent control flow
This is not ideal and it would be great to somehow make it better some day.

fossil-db (gfx1100):
Totals from 5208 (3.86% of 135032) affected shaders:
MaxWaves: 127058 -> 126962 (-0.08%); split: +0.01%, -0.09%
Instrs: 3983440 -> 4072736 (+2.24%); split: -0.00%, +2.24%
CodeSize: 21872468 -> 22230852 (+1.64%); split: -0.00%, +1.64%
VGPRs: 206688 -> 206984 (+0.14%); split: -0.05%, +0.20%
Latency: 37447383 -> 37491197 (+0.12%); split: -0.05%, +0.17%
InvThroughput: 6421955 -> 6422348 (+0.01%); split: -0.03%, +0.03%
VClause: 71579 -> 71545 (-0.05%); split: -0.09%, +0.04%
SClause: 148289 -> 147146 (-0.77%); split: -0.84%, +0.07%
Copies: 259011 -> 258084 (-0.36%); split: -0.61%, +0.25%
Branches: 101366 -> 101314 (-0.05%); split: -0.10%, +0.05%
PreSGPRs: 223482 -> 223460 (-0.01%); split: -0.21%, +0.20%
PreVGPRs: 184448 -> 184744 (+0.16%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19370>
This commit is contained in:
parent
16d2c7ad55
commit
6113ee650a
9 changed files with 147 additions and 33 deletions
|
|
@ -522,7 +522,7 @@ public:
|
||||||
}
|
}
|
||||||
<%
|
<%
|
||||||
import itertools
|
import itertools
|
||||||
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8)]),
|
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6)]),
|
||||||
("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]),
|
("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]),
|
||||||
("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
|
("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
|
||||||
("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
|
("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
|
||||||
|
|
|
||||||
|
|
@ -72,6 +72,8 @@ struct if_context {
|
||||||
bool divergent_old;
|
bool divergent_old;
|
||||||
bool exec_potentially_empty_discard_old;
|
bool exec_potentially_empty_discard_old;
|
||||||
bool exec_potentially_empty_break_old;
|
bool exec_potentially_empty_break_old;
|
||||||
|
bool had_divergent_discard_old;
|
||||||
|
bool had_divergent_discard_then;
|
||||||
uint16_t exec_potentially_empty_break_depth_old;
|
uint16_t exec_potentially_empty_break_depth_old;
|
||||||
|
|
||||||
unsigned BB_if_idx;
|
unsigned BB_if_idx;
|
||||||
|
|
@ -5306,6 +5308,13 @@ visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns true if the block currently being emitted has a non-zero loop nest
 * depth, is inside a divergent if (cf_info.parent_if.is_divergent), or if
 * cf_info.had_divergent_discard is set. */
bool
|
||||||
|
in_exec_divergent_or_in_loop(isel_context* ctx)
|
||||||
|
{
|
||||||
|
return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
|
||||||
|
ctx->cf_info.had_divergent_discard;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
|
emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
|
||||||
Temp prim_mask)
|
Temp prim_mask)
|
||||||
|
|
@ -5315,7 +5324,16 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
|
||||||
|
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
|
||||||
//TODO: this doesn't work in quad-divergent control flow
|
if (in_exec_divergent_or_in_loop(ctx)) {
|
||||||
|
Operand prim_mask_op = bld.m0(prim_mask);
|
||||||
|
prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
|
||||||
|
Operand coord2_op(coord2);
|
||||||
|
coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
|
||||||
|
bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
|
||||||
|
Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component), coord1,
|
||||||
|
coord2_op, prim_mask_op);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
|
Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
|
||||||
|
|
||||||
|
|
@ -5385,13 +5403,22 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
if (ctx->options->gfx_level >= GFX11) {
|
if (ctx->options->gfx_level >= GFX11) {
|
||||||
//TODO: this doesn't work in quad-divergent control flow and ignores vertex_id
|
// TODO: this ignores vertex_id
|
||||||
Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
|
|
||||||
uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0);
|
uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0);
|
||||||
Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
|
if (in_exec_divergent_or_in_loop(ctx)) {
|
||||||
|
Operand prim_mask_op = bld.m0(prim_mask);
|
||||||
|
prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
|
||||||
|
bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
|
||||||
|
Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component),
|
||||||
|
Operand::c32(dpp_ctrl), prim_mask_op);
|
||||||
|
} else {
|
||||||
|
Temp p =
|
||||||
|
bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
|
||||||
|
Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
|
||||||
|
|
||||||
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
|
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
|
||||||
emit_wqm(bld, res, dst, true);
|
emit_wqm(bld, res, dst, true);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
|
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
|
||||||
bld.m0(prim_mask), idx, component);
|
bld.m0(prim_mask), idx, component);
|
||||||
|
|
@ -5825,7 +5852,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
unsigned chan_component = (component + i) % 4;
|
unsigned chan_component = (component + i) % 4;
|
||||||
unsigned chan_idx = idx + (component + i) / 4;
|
unsigned chan_idx = idx + (component + i) / 4;
|
||||||
vec->operands[i] = Operand(bld.tmp(instr->dest.ssa.bit_size == 16 ? v2b : v1));
|
vec->operands[i] = Operand(bld.tmp(instr->dest.ssa.bit_size == 16 ? v2b : v1));
|
||||||
emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(), prim_mask);
|
emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id,
|
||||||
|
vec->operands[i].getTemp(), prim_mask);
|
||||||
}
|
}
|
||||||
vec->definitions[0] = Definition(dst);
|
vec->definitions[0] = Definition(dst);
|
||||||
bld.insert(std::move(vec));
|
bld.insert(std::move(vec));
|
||||||
|
|
@ -8980,6 +9008,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
|
|
||||||
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
|
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
|
||||||
ctx->cf_info.exec_potentially_empty_discard = true;
|
ctx->cf_info.exec_potentially_empty_discard = true;
|
||||||
|
|
||||||
ctx->block->kind |= block_kind_uses_discard;
|
ctx->block->kind |= block_kind_uses_discard;
|
||||||
ctx->program->needs_exact = true;
|
ctx->program->needs_exact = true;
|
||||||
break;
|
break;
|
||||||
|
|
@ -8992,6 +9021,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
|
|
||||||
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
|
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
|
||||||
ctx->cf_info.exec_potentially_empty_discard = true;
|
ctx->cf_info.exec_potentially_empty_discard = true;
|
||||||
|
|
||||||
ctx->block->kind |= block_kind_uses_discard;
|
ctx->block->kind |= block_kind_uses_discard;
|
||||||
ctx->program->needs_exact = true;
|
ctx->program->needs_exact = true;
|
||||||
break;
|
break;
|
||||||
|
|
@ -9007,12 +9037,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||||
assert(src.regClass() == bld.lm);
|
assert(src.regClass() == bld.lm);
|
||||||
cond =
|
cond =
|
||||||
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
||||||
|
|
||||||
|
ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
bld.pseudo(aco_opcode::p_discard_if, cond);
|
bld.pseudo(aco_opcode::p_discard_if, cond);
|
||||||
|
|
||||||
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
|
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
|
||||||
ctx->cf_info.exec_potentially_empty_discard = true;
|
ctx->cf_info.exec_potentially_empty_discard = true;
|
||||||
|
ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
|
||||||
ctx->block->kind |= block_kind_uses_discard;
|
ctx->block->kind |= block_kind_uses_discard;
|
||||||
ctx->program->needs_exact = true;
|
ctx->program->needs_exact = true;
|
||||||
break;
|
break;
|
||||||
|
|
@ -10554,6 +10587,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
|
||||||
ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
|
ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
|
||||||
ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
|
ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
|
||||||
ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
|
ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
|
||||||
|
ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
|
||||||
ctx->cf_info.parent_if.is_divergent = true;
|
ctx->cf_info.parent_if.is_divergent = true;
|
||||||
|
|
||||||
/* divergent branches use cbranch_execz */
|
/* divergent branches use cbranch_execz */
|
||||||
|
|
@ -10621,6 +10655,9 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic,
|
||||||
ctx->cf_info.exec_potentially_empty_break = false;
|
ctx->cf_info.exec_potentially_empty_break = false;
|
||||||
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
|
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
|
||||||
|
|
||||||
|
ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
|
||||||
|
ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
|
||||||
|
|
||||||
/** emit logical else block */
|
/** emit logical else block */
|
||||||
ctx->program->next_divergent_if_logical_depth++;
|
ctx->program->next_divergent_if_logical_depth++;
|
||||||
Block* BB_else_logical = ctx->program->create_and_insert_block();
|
Block* BB_else_logical = ctx->program->create_and_insert_block();
|
||||||
|
|
@ -10683,6 +10720,7 @@ end_divergent_if(isel_context* ctx, if_context* ic)
|
||||||
ctx->cf_info.exec_potentially_empty_break = false;
|
ctx->cf_info.exec_potentially_empty_break = false;
|
||||||
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
|
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
|
||||||
}
|
}
|
||||||
|
ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
@ -10709,6 +10747,8 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
|
||||||
ctx->cf_info.has_branch = false;
|
ctx->cf_info.has_branch = false;
|
||||||
ctx->cf_info.parent_loop.has_divergent_branch = false;
|
ctx->cf_info.parent_loop.has_divergent_branch = false;
|
||||||
|
|
||||||
|
ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
|
||||||
|
|
||||||
/** emit then block */
|
/** emit then block */
|
||||||
ctx->program->next_uniform_if_depth++;
|
ctx->program->next_uniform_if_depth++;
|
||||||
Block* BB_then = ctx->program->create_and_insert_block();
|
Block* BB_then = ctx->program->create_and_insert_block();
|
||||||
|
|
@ -10742,6 +10782,9 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic)
|
||||||
ctx->cf_info.has_branch = false;
|
ctx->cf_info.has_branch = false;
|
||||||
ctx->cf_info.parent_loop.has_divergent_branch = false;
|
ctx->cf_info.parent_loop.has_divergent_branch = false;
|
||||||
|
|
||||||
|
ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
|
||||||
|
ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
|
||||||
|
|
||||||
/** emit else block */
|
/** emit else block */
|
||||||
Block* BB_else = ctx->program->create_and_insert_block();
|
Block* BB_else = ctx->program->create_and_insert_block();
|
||||||
add_edge(ic->BB_if_idx, BB_else);
|
add_edge(ic->BB_if_idx, BB_else);
|
||||||
|
|
@ -10770,6 +10813,7 @@ end_uniform_if(isel_context* ctx, if_context* ic)
|
||||||
|
|
||||||
ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
|
ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
|
||||||
ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
|
ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
|
||||||
|
ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
|
||||||
|
|
||||||
/** emit endif merge block */
|
/** emit endif merge block */
|
||||||
ctx->program->next_uniform_if_depth--;
|
ctx->program->next_uniform_if_depth--;
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,7 @@ struct isel_context {
|
||||||
struct {
|
struct {
|
||||||
bool is_divergent = false;
|
bool is_divergent = false;
|
||||||
} parent_if;
|
} parent_if;
|
||||||
|
bool had_divergent_discard = false;
|
||||||
bool exec_potentially_empty_discard =
|
bool exec_potentially_empty_discard =
|
||||||
false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
|
false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
|
||||||
uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
|
uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
|
||||||
|
|
|
||||||
|
|
@ -2378,6 +2378,54 @@ lower_to_hw_instr(Program* program)
|
||||||
bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
|
bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case aco_opcode::p_interp_gfx11: {
|
||||||
|
assert(instr->definitions[0].regClass() == v1 ||
|
||||||
|
instr->definitions[0].regClass() == v2b);
|
||||||
|
assert(instr->definitions[1].regClass() == bld.lm);
|
||||||
|
assert(instr->operands[0].regClass() == v1.as_linear());
|
||||||
|
assert(instr->operands[1].isConstant());
|
||||||
|
assert(instr->operands[2].isConstant());
|
||||||
|
assert(instr->operands.back().physReg() == m0);
|
||||||
|
Definition dst = instr->definitions[0];
|
||||||
|
PhysReg exec_tmp = instr->definitions[1].physReg();
|
||||||
|
PhysReg lin_vgpr = instr->operands[0].physReg();
|
||||||
|
unsigned attribute = instr->operands[1].constantValue();
|
||||||
|
unsigned component = instr->operands[2].constantValue();
|
||||||
|
uint16_t dpp_ctrl = 0;
|
||||||
|
Operand coord1, coord2;
|
||||||
|
if (instr->operands.size() == 6) {
|
||||||
|
assert(instr->operands[3].regClass() == v1);
|
||||||
|
assert(instr->operands[4].regClass() == v1);
|
||||||
|
coord1 = instr->operands[3];
|
||||||
|
coord2 = instr->operands[4];
|
||||||
|
} else {
|
||||||
|
assert(instr->operands[3].isConstant());
|
||||||
|
dpp_ctrl = instr->operands[3].constantValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
bld.sop1(Builder::s_mov, Definition(exec_tmp, bld.lm), Operand(exec, bld.lm));
|
||||||
|
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), Operand(exec, bld.lm));
|
||||||
|
bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
|
||||||
|
attribute, component);
|
||||||
|
bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(exec_tmp, bld.lm));
|
||||||
|
|
||||||
|
Operand p(lin_vgpr, v1);
|
||||||
|
Operand dst_op(dst.physReg(), v1);
|
||||||
|
if (instr->operands.size() == 5) {
|
||||||
|
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
|
||||||
|
} else if (dst.regClass() == v2b) {
|
||||||
|
bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
|
||||||
|
coord1, p);
|
||||||
|
bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
|
||||||
|
coord2, dst_op);
|
||||||
|
} else {
|
||||||
|
bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
|
||||||
|
p);
|
||||||
|
bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
|
||||||
|
dst_op);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
} else if (instr->isBranch()) {
|
} else if (instr->isBranch()) {
|
||||||
|
|
|
||||||
|
|
@ -335,6 +335,11 @@ opcode("p_init_scratch")
|
||||||
# jumps to a shader epilog
|
# jumps to a shader epilog
|
||||||
opcode("p_jump_to_epilog")
|
opcode("p_jump_to_epilog")
|
||||||
|
|
||||||
|
# loads and interpolates a fragment shader input with a correct exec mask
|
||||||
|
# interpolating form (6 operands): dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=coord1, src4=coord2, src5=m0
|
||||||
|
# non-interpolating form (5 operands, result moved with DPP): dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
|
||||||
|
opcode("p_interp_gfx11")
|
||||||
|
|
||||||
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
|
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
|
||||||
SOP2 = {
|
SOP2 = {
|
||||||
# GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
|
# GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
|
||||||
|
|
|
||||||
|
|
@ -673,6 +673,7 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
|
||||||
case aco_opcode::v_readfirstlane_b32:
|
case aco_opcode::v_readfirstlane_b32:
|
||||||
case aco_opcode::p_extract:
|
case aco_opcode::p_extract:
|
||||||
case aco_opcode::p_insert: return operand != 0;
|
case aco_opcode::p_insert: return operand != 0;
|
||||||
|
case aco_opcode::p_interp_gfx11: return false;
|
||||||
default: return true;
|
default: return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -45,11 +45,13 @@ setup_reduce_temp(Program* program)
|
||||||
std::vector<bool> hasReductions(program->blocks.size());
|
std::vector<bool> hasReductions(program->blocks.size());
|
||||||
for (Block& block : program->blocks) {
|
for (Block& block : program->blocks) {
|
||||||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||||
if (instr->format != Format::PSEUDO_REDUCTION)
|
if (instr->opcode == aco_opcode::p_interp_gfx11) {
|
||||||
continue;
|
maxSize = MAX2(maxSize, 1);
|
||||||
|
hasReductions[block.index] = true;
|
||||||
maxSize = MAX2(maxSize, instr->operands[0].size());
|
} else if (instr->format == Format::PSEUDO_REDUCTION) {
|
||||||
hasReductions[block.index] = true;
|
maxSize = MAX2(maxSize, instr->operands[0].size());
|
||||||
|
hasReductions[block.index] = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -92,10 +94,10 @@ setup_reduce_temp(Program* program)
|
||||||
std::vector<aco_ptr<Instruction>>::iterator it;
|
std::vector<aco_ptr<Instruction>>::iterator it;
|
||||||
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
|
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
|
||||||
Instruction* instr = (*it).get();
|
Instruction* instr = (*it).get();
|
||||||
if (instr->format != Format::PSEUDO_REDUCTION)
|
if (instr->format != Format::PSEUDO_REDUCTION &&
|
||||||
|
instr->opcode != aco_opcode::p_interp_gfx11)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
ReduceOp op = instr->reduction().reduce_op;
|
|
||||||
reduceTmp_in_loop |= block.loop_nest_depth > 0;
|
reduceTmp_in_loop |= block.loop_nest_depth > 0;
|
||||||
|
|
||||||
if ((int)last_top_level_block_idx != inserted_at) {
|
if ((int)last_top_level_block_idx != inserted_at) {
|
||||||
|
|
@ -122,22 +124,26 @@ setup_reduce_temp(Program* program)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* same as before, except for the vector temporary instead of the reduce temporary */
|
/* same as before, except for the vector temporary instead of the reduce temporary */
|
||||||
unsigned cluster_size = instr->reduction().cluster_size;
|
bool need_vtmp = false;
|
||||||
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
|
if (instr->isReduction()) {
|
||||||
op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
|
ReduceOp op = instr->reduction().reduce_op;
|
||||||
op == imax64 || op == imul64;
|
unsigned cluster_size = instr->reduction().cluster_size;
|
||||||
bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
|
need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
|
||||||
op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
|
op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
|
||||||
op == iadd64;
|
op == imax64 || op == imul64;
|
||||||
|
bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
|
||||||
|
op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
|
||||||
|
op == iadd64;
|
||||||
|
|
||||||
if (program->gfx_level >= GFX10 && cluster_size == 64)
|
if (program->gfx_level >= GFX10 && cluster_size == 64)
|
||||||
need_vtmp = true;
|
need_vtmp = true;
|
||||||
if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
|
if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
|
||||||
need_vtmp = true;
|
need_vtmp = true;
|
||||||
if (program->gfx_level <= GFX7)
|
if (program->gfx_level <= GFX7)
|
||||||
need_vtmp = true;
|
need_vtmp = true;
|
||||||
|
|
||||||
need_vtmp |= cluster_size == 32;
|
need_vtmp |= cluster_size == 32;
|
||||||
|
}
|
||||||
|
|
||||||
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
|
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
|
||||||
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
|
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
|
||||||
|
|
@ -158,9 +164,15 @@ setup_reduce_temp(Program* program)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
instr->operands[1] = Operand(reduceTmp);
|
if (instr->isReduction()) {
|
||||||
if (need_vtmp)
|
instr->operands[1] = Operand(reduceTmp);
|
||||||
instr->operands[2] = Operand(vtmp);
|
if (need_vtmp)
|
||||||
|
instr->operands[2] = Operand(vtmp);
|
||||||
|
} else {
|
||||||
|
assert(instr->opcode == aco_opcode::p_interp_gfx11);
|
||||||
|
instr->operands[0] = Operand(reduceTmp);
|
||||||
|
instr->operands[0].setLateKill(true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -607,7 +607,9 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr
|
||||||
amd_gfx_level gfx_level = program->gfx_level;
|
amd_gfx_level gfx_level = program->gfx_level;
|
||||||
|
|
||||||
if (instr->isPseudo()) {
|
if (instr->isPseudo()) {
|
||||||
if (gfx_level >= GFX8)
|
if (instr->opcode == aco_opcode::p_interp_gfx11)
|
||||||
|
return std::make_pair(4u, 4u);
|
||||||
|
else if (gfx_level >= GFX8)
|
||||||
return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
|
return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
|
||||||
else
|
else
|
||||||
return std::make_pair(4, rc.size() * 4u);
|
return std::make_pair(4, rc.size() * 4u);
|
||||||
|
|
|
||||||
|
|
@ -262,6 +262,7 @@ validate_ir(Program* program)
|
||||||
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
|
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
|
||||||
instr->opcode == aco_opcode::p_create_vector ||
|
instr->opcode == aco_opcode::p_create_vector ||
|
||||||
instr->opcode == aco_opcode::p_jump_to_epilog ||
|
instr->opcode == aco_opcode::p_jump_to_epilog ||
|
||||||
|
(instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
|
||||||
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
|
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
|
||||||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
|
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
|
||||||
(instr->isScratch() && i == 0);
|
(instr->isScratch() && i == 0);
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue