intel/brw: Remove Gfx8- code from backend passes

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
Author: Caio Oliveira, 2024-02-15 13:19:08 -08:00 (committed by Marge Bot)
parent 9569ea82a8
commit 7ac5696157
10 changed files with 64 additions and 276 deletions

View file

@@ -2341,10 +2341,10 @@ fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *f
    if (inst->conditional_mod) {
       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
       if (!inst->predicate &&
-          (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
-                                inst->opcode != BRW_OPCODE_CSEL &&
-                                inst->opcode != BRW_OPCODE_IF &&
-                                inst->opcode != BRW_OPCODE_WHILE))) {
+          (inst->opcode != BRW_OPCODE_SEL &&
+           inst->opcode != BRW_OPCODE_CSEL &&
+           inst->opcode != BRW_OPCODE_IF &&
+           inst->opcode != BRW_OPCODE_WHILE)) {
          fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                  inst->flag_subreg % 2);
       }
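For reference, a minimal standalone sketch of the flag naming printed above: flag_subreg counts 16-bit flag subregisters linearly, and the ".f%d.%d" format splits it into register and subregister (the mapping is assumed from the fprintf call itself).

   #include <cstdio>

   int main()
   {
      /* flag_subreg / 2 selects the flag register, flag_subreg % 2 the
       * 16-bit half, giving f0.0, f0.1, f1.0, f1.1. */
      for (unsigned flag_subreg = 0; flag_subreg < 4; flag_subreg++)
         printf("flag_subreg %u -> .f%u.%u\n",
                flag_subreg, flag_subreg / 2, flag_subreg % 2);
      return 0;
   }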

View file

@@ -549,8 +549,7 @@ namespace {
        * Register allocation ensures that, so don't move 127 around to avoid
        * breaking that property.
        */
-      if (v->devinfo->ver >= 8)
-         constrained[p.atom_of_reg(127)] = true;
+      constrained[p.atom_of_reg(127)] = true;

       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
          /* Assume that anything referenced via fixed GRFs is baked into the
@@ -567,24 +566,14 @@ namespace {
                constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
          }

-         /* Preserve the original allocation of VGRFs used by the barycentric
-          * source of the LINTERP instruction on Gfx6, since pair-aligned
-          * barycentrics allow the PLN instruction to be used.
-          */
-         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
-             inst->opcode == FS_OPCODE_LINTERP)
-            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
-
          /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end. Don't attempt to move them around.
          */
-         if (v->devinfo->ver >= 7) {
-            assert(inst->dst.file != MRF);
+         assert(inst->dst.file != MRF);

-            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
-               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
-               constrained[p.atom_of_reg(reg)] = true;
-            }
+         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
+            const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
+            constrained[p.atom_of_reg(reg)] = true;
+         }
-         }
@@ -600,10 +589,10 @@ namespace {
    is_conflict_optimized_out(const intel_device_info *devinfo,
                              const fs_inst *inst)
    {
-      return devinfo->ver >= 9 &&
-         ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
-                                    reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
-          reg_of(inst->src[1]) == reg_of(inst->src[2]));
+      return
+         (is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
+                                   reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
+         reg_of(inst->src[1]) == reg_of(inst->src[2]);
    }
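As an aside, the degenerate cases this helper detects can be checked with plain register numbers. A reduced model (helper name and the dropped is_grf() qualification are simplifications, not the real Mesa code):

   #include <cassert>

   /* If any two of a ternary instruction's sources read the same GRF, the
    * register file needs fewer distinct fetches, so no bank conflict occurs. */
   static bool conflict_optimized_out(unsigned src0, unsigned src1, unsigned src2)
   {
      return src0 == src1 || src0 == src2 || src1 == src2;
   }

   int main()
   {
      assert(conflict_optimized_out(10, 10, 12));  /* repeated source */
      assert(!conflict_optimized_out(10, 11, 12)); /* three distinct GRFs */
      return 0;
   }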
/**
@@ -915,10 +904,6 @@ brw_fs_opt_bank_conflicts(fs_visitor &s)
    if (s.devinfo->ver >= 20)
       return false;

-   /* No ternary instructions -- No bank conflicts. */
-   if (s.devinfo->ver < 6)
-      return false;
-
    const partitioning p = shader_reg_partitioning(&s);
    const bool *constrained = shader_reg_constraints(&s, p);
    const weight_vector_type *conflicts =

View file

@@ -451,18 +451,10 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
             break;
          }
       } else if (scan_inst->conditional_mod == inst->conditional_mod) {
-         /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
-          * flags value is not based on the result stored in the
-          * destination. On all other platforms sel.cond will not
-          * write the flags, so execution will not get to this point.
-          */
-         if (scan_inst->opcode == BRW_OPCODE_SEL) {
-            assert(devinfo->ver <= 5);
-         } else {
-            inst->remove(block, true);
-            progress = true;
-         }
+         /* sel.cond will not write the flags. */
+         assert(scan_inst->opcode != BRW_OPCODE_SEL);
+         inst->remove(block, true);
+         progress = true;
          break;
       } else if (!read_flag && scan_inst->can_do_cmod()) {
          scan_inst->conditional_mod = inst->conditional_mod;
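The case being simplified here is the classic redundant-compare pattern. A reduced standalone model of the check (the enum and struct are stand-ins for the real IR types, not Mesa's):

   #include <cassert>

   enum cmod { CMOD_NONE, CMOD_G, CMOD_L };
   struct model_inst { cmod conditional_mod; };

   /* A later cmp with the same conditional mod as an earlier flag writer is
    * redundant and can be removed, which is what inst->remove() does above. */
   static bool redundant(const model_inst &scan_inst, const model_inst &inst)
   {
      return scan_inst.conditional_mod != CMOD_NONE &&
             scan_inst.conditional_mod == inst.conditional_mod;
   }

   int main()
   {
      const model_inst producer = { CMOD_G }; /* e.g. add.g.f0.0 dst, a, b */
      const model_inst compare  = { CMOD_G }; /* e.g. cmp.g.f0.0 null, dst, 0 */
      assert(redundant(producer, compare));
      return 0;
   }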

View file

@@ -764,30 +764,6 @@ brw_combine_constants(struct value *candidates, unsigned num_candidates)
    return combine_constants_greedy(candidates, num_candidates);
 }

-/* Returns whether an instruction could co-issue if its immediate source were
- * replaced with a GRF source.
- */
-static bool
-could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst)
-{
-   assert(inst->opcode == BRW_OPCODE_MOV ||
-          inst->opcode == BRW_OPCODE_CMP ||
-          inst->opcode == BRW_OPCODE_ADD ||
-          inst->opcode == BRW_OPCODE_MUL);
-
-   if (devinfo->ver != 7)
-      return false;
-
-   /* Only float instructions can coissue. We don't have a great
-    * understanding of whether or not something like float(int(a) + int(b))
-    * would be considered float (based on the destination type) or integer
-    * (based on the source types), so we take the conservative choice of
-    * only promoting when both destination and source are float.
-    */
-   return inst->dst.type == BRW_REGISTER_TYPE_F &&
-          inst->src[0].type == BRW_REGISTER_TYPE_F;
-}
/**
* Box for storing fs_inst and some other necessary data
*
@@ -1346,12 +1322,6 @@ brw_fs_opt_combine_constants(fs_visitor &s)
          add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                  devinfo, const_ctx);
       }

-      if (inst->src[1].file == IMM && devinfo->ver < 8) {
-         add_candidate_immediate(&table, inst, ip, 1, true, false, block,
-                                 devinfo, const_ctx);
-      }
-
       break;

    case BRW_OPCODE_ADD3:
@@ -1418,24 +1388,6 @@ brw_fs_opt_combine_constants(fs_visitor &s)
       }
       break;

-   case BRW_OPCODE_MOV:
-      if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) {
-         add_candidate_immediate(&table, inst, ip, 0, false, false, block,
-                                 devinfo, const_ctx);
-      }
-      break;
-
-   case BRW_OPCODE_CMP:
-   case BRW_OPCODE_ADD:
-   case BRW_OPCODE_MUL:
-      assert(inst->src[0].file != IMM);
-      if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) {
-         add_candidate_immediate(&table, inst, ip, 1, false, false, block,
-                                 devinfo, const_ctx);
-      }
-      break;
-
    default:
       break;
    }
@@ -1552,47 +1504,21 @@ brw_fs_opt_combine_constants(fs_visitor &s)
    if (s.cfg->num_blocks != 1)
       qsort(table.imm, table.len, sizeof(struct imm), compare);

-   if (devinfo->ver > 7) {
-      struct register_allocation *regs =
-         (struct register_allocation *) calloc(table.len, sizeof(regs[0]));
+   struct register_allocation *regs =
+      (struct register_allocation *) calloc(table.len, sizeof(regs[0]));

-      for (int i = 0; i < table.len; i++) {
-         regs[i].nr = UINT_MAX;
-         regs[i].avail = 0xffff;
-      }
-
-      foreach_block(block, s.cfg) {
-         parcel_out_registers(table.imm, table.len, block, regs, table.len,
-                              s.alloc, devinfo->ver);
-      }
-      free(regs);
-   } else {
-      fs_reg reg(VGRF, s.alloc.allocate(1));
-      reg.stride = 0;
-
-      for (int i = 0; i < table.len; i++) {
-         struct imm *imm = &table.imm[i];
-
-         /* Put the immediate in an offset aligned to its size. Some
-          * instructions seem to have additional alignment requirements, so
-          * account for that too.
-          */
-         reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
-
-         /* Ensure we have enough space in the register to copy the immediate */
-         if (reg.offset + imm->size > REG_SIZE) {
-            reg.nr = s.alloc.allocate(1);
-            reg.offset = 0;
-         }
-
-         imm->nr = reg.nr;
-         imm->subreg_offset = reg.offset;
-
-         reg.offset += imm->size;
-      }
-   }
+   for (int i = 0; i < table.len; i++) {
+      regs[i].nr = UINT_MAX;
+      regs[i].avail = 0xffff;
+   }
+
+   foreach_block(block, s.cfg) {
+      parcel_out_registers(table.imm, table.len, block, regs, table.len,
+                           s.alloc, devinfo->ver);
+   }
+   free(regs);

    bool rebuild_cfg = false;

    /* Insert MOVs to load the constant values into GRFs. */
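The deleted pre-Gfx8 path above packed immediates into GRFs by hand; its placement rule can be sketched standalone (ALIGN and REG_SIZE mirror the macros used by the pass, and the immediate sizes are made-up inputs):

   #include <cstdio>

   #define REG_SIZE 32
   #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

   int main()
   {
      const unsigned sizes[] = { 2, 4, 2, 8, 4 }; /* hypothetical immediates */
      unsigned nr = 0, offset = 0;

      for (unsigned size : sizes) {
         /* Place each immediate at an offset aligned to its own size. */
         offset = ALIGN(offset, size);
         if (offset + size > REG_SIZE) { /* start a fresh 32-byte register */
            nr++;
            offset = 0;
         }
         printf("%u-byte imm -> g%u.%u\n", size, nr, offset);
         offset += size;
      }
      return 0;
   }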
@@ -1661,7 +1587,7 @@ brw_fs_opt_combine_constants(fs_visitor &s)
     * replicating the single one we want. To avoid this, we always populate
     * both HF slots within a DWord with the constant.
     */
-   const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
+   const uint32_t width = 1;
    const fs_builder ibld = fs_builder(&s, width).at(insert_block, n).exec_all();

    fs_reg reg(VGRF, imm->nr);
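The Gfx8 workaround being dropped here duplicated the half-float constant into both 16-bit slots of a DWord; the resulting bit pattern is easy to see standalone (half-float bit pattern hard-coded for illustration):

   #include <cstdint>
   #include <cstdio>

   int main()
   {
      const uint16_t half_bits = 0x3c00; /* 1.0 as an IEEE half-float */
      /* Populate both HF slots of the DWord with the same constant. */
      const uint32_t dword = (uint32_t)half_bits << 16 | half_bits;
      printf("packed DWord: 0x%08x\n", dword); /* 0x3c003c00 */
      return 0;
   }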

View file

@@ -630,14 +630,8 @@ can_take_stride(fs_inst *inst, brw_reg_type dst_type,
     * are sends, so the sources are moved to MRF's and there are no
     * restrictions.
     */
-   if (inst->is_math()) {
-      if (devinfo->ver == 6 || devinfo->ver == 7) {
-         assert(inst->dst.stride == 1);
-         return stride == 1 || stride == 0;
-      } else if (devinfo->ver >= 8) {
-         return stride == inst->dst.stride || stride == 0;
-      }
-   }
+   if (inst->is_math())
+      return stride == inst->dst.stride || stride == 0;

    return true;
 }
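For context, stride here is the element spacing of a register region. A reduced model of the two strides the math path now accepts — the destination's own stride, or a stride-0 scalar broadcast:

   #include <cassert>

   /* Channel i of a region with the given stride reads element i * stride. */
   static int read_channel(const int *reg, unsigned stride, unsigned channel)
   {
      return reg[channel * stride];
   }

   int main()
   {
      const int reg[8] = { 7, 1, 2, 3, 4, 5, 6, 9 };
      assert(read_channel(reg, 1, 3) == 3); /* stride 1: contiguous */
      assert(read_channel(reg, 0, 3) == 7); /* stride 0: broadcast reg[0] */
      return 0;
   }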
@@ -725,15 +719,6 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
       }
    }

-   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
-    * of a LINTERP instruction on platforms where the PLN instruction has
-    * register alignment restrictions.
-    */
-   if (devinfo->has_pln && devinfo->ver <= 6 &&
-       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
-       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
-      return false;
-
    /* we can't generally copy-propagate UD negations because we
     * can end up accessing the resulting values as signed integers
     * instead. See also resolve_ud_negate() and comment in
@@ -750,8 +735,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
    /* Reject cases that would violate register regioning restrictions. */
    if ((entry->src.file == UNIFORM || !entry->src.is_contiguous()) &&
-       ((devinfo->ver == 6 && inst->is_math()) ||
-        inst->is_send_from_grf() ||
+       (inst->is_send_from_grf() ||
         inst->uses_indirect_addressing())) {
       return false;
    }
@@ -867,7 +851,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
        type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
       return false;

-   if (devinfo->ver >= 8 && (entry->src.negate || entry->src.abs) &&
+   if ((entry->src.negate || entry->src.abs) &&
        is_logic_op(inst->opcode)) {
       return false;
    }
@@ -946,7 +930,6 @@
 static bool
 try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
                        acp_entry *entry, int arg)
 {
-   const struct intel_device_info *devinfo = compiler->devinfo;
    bool progress = false;

    if (type_sz(entry->src.type) > 4)
@@ -1002,14 +985,14 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
    val.type = inst->src[arg].type;

    if (inst->src[arg].abs) {
-      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+      if (is_logic_op(inst->opcode) ||
           !brw_abs_immediate(val.type, &val.as_brw_reg())) {
          return false;
       }
    }

    if (inst->src[arg].negate) {
-      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+      if (is_logic_op(inst->opcode) ||
           !brw_negate_immediate(val.type, &val.as_brw_reg())) {
          return false;
       }
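The is_logic_op() rejection kept above exists because, on the affected instructions, a negate source modifier is (as assumed here from the Gen docs) a bitwise NOT rather than arithmetic negation, so folding it into the immediate arithmetically would change results:

   #include <cassert>
   #include <cstdint>

   int main()
   {
      const int32_t imm = 42;
      assert(-imm != ~imm);     /* arithmetic negate vs. bitwise NOT: -42 vs -43 */
      assert(~imm == -imm - 1); /* the two's-complement relation between them */
      return 0;
   }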
@@ -1024,13 +1007,6 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
       break;

    case SHADER_OPCODE_POW:
-      /* Allow constant propagation into src1 (except on Gen 6 which
-       * doesn't support scalar source math), and let constant combining
-       * promote the constant on Gen < 8.
-       */
-      if (devinfo->ver == 6)
-         break;
-
       if (arg == 1) {
          inst->src[arg] = val;
          progress = true;
@@ -1190,15 +1166,6 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
    case SHADER_OPCODE_INT_QUOTIENT:
    case SHADER_OPCODE_INT_REMAINDER:
-      /* Allow constant propagation into either source (except on Gen 6
-       * which doesn't support scalar source math). Constant combining
-       * promote the src1 constant on Gen < 8, and it will promote the src0
-       * constant on all platforms.
-       */
-      if (devinfo->ver == 6)
-         break;
-      FALLTHROUGH;
-
    case BRW_OPCODE_AND:
    case BRW_OPCODE_ASR:
    case BRW_OPCODE_BFE:

View file

@@ -334,13 +334,12 @@
 bool
 brw_fs_lower_barycentrics(fs_visitor &s)
 {
    const intel_device_info *devinfo = s.devinfo;
-   const bool has_interleaved_layout = devinfo->has_pln ||
-      (devinfo->ver >= 7 && devinfo->ver < 20);
-   bool progress = false;

-   if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
+   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
       return false;

+   bool progress = false;
+
    foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
       if (inst->exec_size < 16)
          continue;
@@ -461,9 +460,6 @@ brw_fs_lower_find_live_channel(fs_visitor &s)
 {
    bool progress = false;

-   if (s.devinfo->ver < 8)
-      return false;
-
    bool packed_dispatch =
       brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                     s.stage_prog_data);
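For reference, a reduced model of what FIND_LIVE_CHANNEL computes and why packed dispatch makes it trivial (assumed semantics: the result is the lowest enabled channel, which is always 0 for a dense mask):

   #include <bit>
   #include <cassert>

   static unsigned find_live_channel(unsigned exec_mask)
   {
      return std::countr_zero(exec_mask); /* index of the lowest set bit */
   }

   int main()
   {
      assert(find_live_channel(0b11111111u) == 0); /* packed dispatch */
      assert(find_live_channel(0b10110000u) == 4); /* sparse channel mask */
      return 0;
   }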

View file

@@ -150,23 +150,16 @@ brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
     */
    if (inst->src[1].file == IMM &&
        (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
-      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
-       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
-       * src1 are used.
+      /* The MUL instruction isn't commutative. On Gen >= 7 only
+       * the low 16-bits of src1 are used.
        *
        * If multiplying by an immediate value that fits in 16-bits, do a
        * single MUL instruction with that value in the proper location.
        */
       const bool ud = (inst->src[1].d >= 0);
-      if (devinfo->ver < 7) {
-         fs_reg imm(VGRF, s.alloc.allocate(s.dispatch_width / 8), inst->dst.type);
-         ibld.MOV(imm, inst->src[1]);
-         ibld.MUL(inst->dst, imm, inst->src[0]);
-      } else {
-         ibld.MUL(inst->dst, inst->src[0],
-                  ud ? brw_imm_uw(inst->src[1].ud)
-                     : brw_imm_w(inst->src[1].d));
-      }
+      ibld.MUL(inst->dst, inst->src[0],
+               ud ? brw_imm_uw(inst->src[1].ud)
+                  : brw_imm_w(inst->src[1].d));
    } else {
       /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
        * do 32-bit integer multiplication in one instruction, but instead
@@ -239,7 +232,7 @@ brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
       high.offset = inst->dst.offset % REG_SIZE;

       bool do_addition = true;
-      if (devinfo->ver >= 7) {
+      {
          /* From Wa_1604601757:
          *
          * "When multiplying a DW and any lower precision integer, source modifier
@@ -294,14 +287,6 @@ brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
             ibld.MUL(high, inst->src[0],
                      subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
          }
-      } else {
-         if (inst->src[0].abs)
-            lower_src_modifiers(&s, block, inst, 0);
-
-         ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
-                  inst->src[1]);
-
-         ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
-                  inst->src[1]);
       }

       if (do_addition) {
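The low/high split performed by this lowering is the standard 32x16 partial-product decomposition; a standalone check of the identity it relies on:

   #include <cassert>
   #include <cstdint>

   int main()
   {
      const uint32_t a = 0xdeadbeef, b = 0x12345678;
      const uint32_t low  = a * (b & 0xffff); /* a * lo16(b) */
      const uint32_t high = a * (b >> 16);    /* a * hi16(b) */
      /* The "do_addition" step: fold the high partial product back in.
       * Unsigned arithmetic wraps mod 2^32, so the identity holds exactly. */
      assert(low + (high << 16) == a * b);
      return 0;
   }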
@@ -399,7 +384,7 @@ brw_fs_lower_mulh_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
     *    mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
     *    mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
     */
-   if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
+   if (inst->src[1].negate || inst->src[1].abs)
       lower_src_modifiers(&s, block, inst, 1);
/* Should have been lowered to 8-wide. */
@@ -408,47 +393,23 @@ brw_fs_lower_mulh_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
    const fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type),
                                 inst->group % acc_width);

    fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
-   fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+   ibld.MACH(inst->dst, inst->src[0], inst->src[1]);

-   if (devinfo->ver >= 8) {
-      /* Until Gfx8, integer multiplies read 32-bits from one source,
-       * and 16-bits from the other, and relying on the MACH instruction
-       * to generate the high bits of the result.
-       *
-       * On Gfx8, the multiply instruction does a full 32x32-bit
-       * multiply, but in order to do a 64-bit multiply we can simulate
-       * the previous behavior and then use a MACH instruction.
-       */
-      assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-             mul->src[1].type == BRW_REGISTER_TYPE_UD);
-      mul->src[1].type = BRW_REGISTER_TYPE_UW;
-      mul->src[1].stride *= 2;
+   /* Until Gfx8, integer multiplies read 32-bits from one source,
+    * and 16-bits from the other, and relying on the MACH instruction
+    * to generate the high bits of the result.
+    *
+    * On Gfx8, the multiply instruction does a full 32x32-bit
+    * multiply, but in order to do a 64-bit multiply we can simulate
+    * the previous behavior and then use a MACH instruction.
+    */
+   assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+          mul->src[1].type == BRW_REGISTER_TYPE_UD);
+   mul->src[1].type = BRW_REGISTER_TYPE_UW;
+   mul->src[1].stride *= 2;

-      if (mul->src[1].file == IMM) {
-         mul->src[1] = brw_imm_uw(mul->src[1].ud);
-      }
-   } else if (devinfo->verx10 == 70 &&
-              inst->group > 0) {
-      /* Among other things the quarter control bits influence which
-       * accumulator register is used by the hardware for instructions
-       * that access the accumulator implicitly (e.g. MACH). A
-       * second-half instruction would normally map to acc1, which
-       * doesn't exist on Gfx7 and up (the hardware does emulate it for
-       * floating-point instructions *only* by taking advantage of the
-       * extra precision of acc0 not normally used for floating point
-       * arithmetic).
-       *
-       * HSW and up are careful enough not to try to access an
-       * accumulator register that doesn't exist, but on earlier Gfx7
-       * hardware we need to make sure that the quarter control bits are
-       * zero to avoid non-deterministic behaviour and emit an extra MOV
-       * to get the result masked correctly according to the current
-       * channel enables.
-       */
-      mach->group = 0;
-      mach->force_writemask_all = true;
-      mach->dst = ibld.vgrf(inst->dst.type);
-      ibld.MOV(inst->dst, mach->dst);
+   if (mul->src[1].file == IMM) {
+      mul->src[1] = brw_imm_uw(mul->src[1].ud);
    }
-   }
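A reduced model of what the mul/mach pair computes (assumed MACH semantics: it yields the high 32 bits of the full 64-bit product):

   #include <cassert>
   #include <cstdint>

   static uint32_t mulh(uint32_t a, uint32_t b)
   {
      return (uint32_t)(((uint64_t)a * b) >> 32);
   }

   int main()
   {
      assert(mulh(0xffffffffu, 0xffffffffu) == 0xfffffffeu);
      assert(mulh(0x00010000u, 0x00010000u) == 1u); /* 2^16 * 2^16 = 2^32 */
      return 0;
   }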
@@ -463,13 +424,8 @@ brw_fs_lower_integer_multiplication(fs_visitor &s)
       /* If the instruction is already in a form that does not need lowering,
        * return early.
        */
-      if (s.devinfo->ver >= 7) {
-         if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
-            continue;
-      } else {
-         if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
-            continue;
-      }
+      if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
+         continue;

       if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
            inst->dst.type == BRW_REGISTER_TYPE_UQ) &&

View file

@@ -64,13 +64,6 @@ brw_fs_lower_pack(fs_visitor &s)
          const uint32_t half = _mesa_float_to_half(inst->src[i].f);
          ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
                   brw_imm_uw(half));
-      } else if (i == 1 && s.devinfo->ver < 9) {
-         /* Pre-Skylake requires DWord aligned destinations */
-         fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
-         ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
-                      inst->src[i]);
-         ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
-                  subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
       } else {
          ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
                       inst->src[i]);
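The destination layout produced above — component i written to the i-th 16-bit word of the DWord, per the subscript(..., UW, i) destinations — matches GLSL's packHalf2x16. A standalone sketch of the bit placement, with half-float bit patterns hard-coded for illustration:

   #include <cstdint>
   #include <cstdio>

   static uint32_t pack_half_2x16(uint16_t x_bits, uint16_t y_bits)
   {
      /* Component 0 in the low word, component 1 in the high word. */
      return (uint32_t)y_bits << 16 | x_bits;
   }

   int main()
   {
      /* 0x3c00 = 1.0h, 0xc000 = -2.0h */
      printf("0x%08x\n", pack_half_2x16(0x3c00, 0xc000)); /* 0xc0003c00 */
      return 0;
   }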

View file

@@ -184,7 +184,6 @@ namespace {
        * support 64-bit types at all.
        */
       if ((!has_64bit || devinfo->verx10 >= 125 ||
-           devinfo->platform == INTEL_PLATFORM_CHV ||
            intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
          return BRW_REGISTER_TYPE_UD;
       else
@@ -192,9 +191,7 @@ namespace {
    case SHADER_OPCODE_BROADCAST:
    case SHADER_OPCODE_MOV_INDIRECT:
-      if (((devinfo->verx10 == 70 ||
-            devinfo->platform == INTEL_PLATFORM_CHV ||
-            intel_device_info_is_9lp(devinfo) ||
+      if (((intel_device_info_is_9lp(devinfo) ||
            devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
          (devinfo->verx10 >= 125 &&
           brw_reg_type_is_floating_point(inst->src[0].type)))
@@ -258,24 +255,6 @@ namespace {
          return false;
    }

-   /* Empirical testing shows that Broadwell has a bug affecting half-float
-    * MAD instructions when any of its sources has a non-zero offset, such
-    * as:
-    *
-    * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
-    *
-    * We used to generate code like this for SIMD8 executions where we
-    * used to pack components Y and W of a vector at offset 16B of a SIMD
-    * register. The problem doesn't occur if the stride of the source is 0.
-    */
-   if (devinfo->ver == 8 &&
-       inst->opcode == BRW_OPCODE_MAD &&
-       inst->src[i].type == BRW_REGISTER_TYPE_HF &&
-       reg_offset(inst->src[i]) % REG_SIZE > 0 &&
-       inst->src[i].stride != 0) {
-      return true;
-   }
-
    const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
    const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

View file

@@ -190,9 +190,6 @@ load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
 bool
 brw_fs_opt_zero_samples(fs_visitor &s)
 {
-   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
-   assert(s.devinfo->ver >= 7);
-
    bool progress = false;

    foreach_block_and_inst(block, fs_inst, send, s.cfg) {
@@ -268,9 +265,6 @@ brw_fs_opt_zero_samples(fs_visitor &s)
 bool
 brw_fs_opt_split_sends(fs_visitor &s)
 {
-   if (s.devinfo->ver < 9)
-      return false;
-
    bool progress = false;

    foreach_block_and_inst(block, fs_inst, send, s.cfg) {