mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 03:08:05 +02:00
i965/vec4: add a SIMD lowering pass
Generally, instructions in Align16 mode only ever write to a single register and don't need any form of SIMD splitting, that's why we have never had a SIMD splitting pass in the vec4 backend. However, double-precision instructions typically write 2 registers and in some cases they run into certain hardware bugs and limitations that we need to work around by splitting the instructions so we only write to 1 register at a time. This patch implements a SIMD splitting pass similar to the one in the scalar backend. Because we only use double-precision instructions in Align16 mode in gen7 (gen8+ is fully scalar and gens < 7 do not implement fp64) the pass should be a no-op on any other generation. For now the pass only handles the gen7 restriction where any instruction that writes 2 registers also needs to read 2 registers. This affects double-precision instructions reading uniforms, for example. Later patches will extend the lowering pass adding a few more cases. v2: - Move the simd lowering pass after the main optimization loop and run copy-propagation and dce if it reports progress (Curro) - Compute number of registers written instead of fixing it to 1 (Iago) - Use group from backend_instruction (Iago) - Drop assertion that checked that we only split 8-wide instructions into 4-wide. (Curro) - Don't assume that instructions can only be 8-wide, we might want to use 16-wide instructions in the future too (Curro) - Wrap gen7 workarounds in a conditional to ease adding workarounds for other gens in the future (Curro) - Handle dst/src overlap hazard (Curro) - Use the horiz_offset() helper to simplify the implementation (Curro) - Drop the assertion that checks that each split instruction writes exactly one register (Curro) - Use the copy constructor to generate split instructions with all the relevant fields initialized to the values in the original instruction instead of copying only a handful of them manually (Curro) v3 (Iago): - When copying to a temporary, allocate the number of registers required for the copy based on the size written of the lowered instruction instead of assuming that all lowered instructions produce single-register writes - Adapt to changes in offset() Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
parent
945269ab72
commit
58767f0fec
2 changed files with 161 additions and 0 deletions
|
|
@ -1977,6 +1977,160 @@ vec4_visitor::convert_to_hw_regs()
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the closest native SIMD width supported by the hardware for instruction
|
||||
* \p inst. The instruction will be left untouched by
|
||||
* vec4_visitor::lower_simd_width() if the returned value matches the
|
||||
* instruction's original execution size.
|
||||
*/
|
||||
static unsigned
|
||||
get_lowered_simd_width(const struct gen_device_info *devinfo,
|
||||
const vec4_instruction *inst)
|
||||
{
|
||||
unsigned lowered_width = MIN2(16, inst->exec_size);
|
||||
|
||||
/* We need to split some cases of double-precision instructions that write
|
||||
* 2 registers. We only need to care about this in gen7 because that is the
|
||||
* only hardware that implements fp64 in Align16.
|
||||
*/
|
||||
if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
|
||||
/* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
|
||||
* Register Addressing:
|
||||
*
|
||||
* "When destination spans two registers, the source MUST span two
|
||||
* registers."
|
||||
*/
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == BAD_FILE)
|
||||
continue;
|
||||
if (inst->size_read(i) <= REG_SIZE)
|
||||
lowered_width = MIN2(lowered_width, 4);
|
||||
}
|
||||
}
|
||||
|
||||
return lowered_width;
|
||||
}
|
||||
|
||||
static bool
|
||||
dst_src_regions_overlap(vec4_instruction *inst)
|
||||
{
|
||||
if (inst->size_written == 0)
|
||||
return false;
|
||||
|
||||
unsigned dst_start = inst->dst.offset;
|
||||
unsigned dst_end = dst_start + inst->size_written - 1;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == BAD_FILE)
|
||||
continue;
|
||||
|
||||
if (inst->dst.file != inst->src[i].file ||
|
||||
inst->dst.nr != inst->src[i].nr)
|
||||
continue;
|
||||
|
||||
unsigned src_start = inst->src[i].offset;
|
||||
unsigned src_end = src_start + inst->size_read(i) - 1;
|
||||
|
||||
if ((dst_start >= src_start && dst_start <= src_end) ||
|
||||
(dst_end >= src_start && dst_end <= src_end) ||
|
||||
(dst_start <= src_start && dst_end >= src_end)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::lower_simd_width()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
|
||||
const unsigned lowered_width = get_lowered_simd_width(devinfo, inst);
|
||||
assert(lowered_width <= inst->exec_size);
|
||||
if (lowered_width == inst->exec_size)
|
||||
continue;
|
||||
|
||||
/* We need to deal with source / destination overlaps when splitting.
|
||||
* The hardware supports reading from and writing to the same register
|
||||
* in the same instruction, but we need to be careful that each split
|
||||
* instruction we produce does not corrupt the source of the next.
|
||||
*
|
||||
* The easiest way to handle this is to make the split instructions write
|
||||
* to temporaries if there is an src/dst overlap and then move from the
|
||||
* temporaries to the original destination. We also need to consider
|
||||
* instructions that do partial writes via align1 opcodes, in which case
|
||||
* we need to make sure that the we initialize the temporary with the
|
||||
* value of the instruction's dst.
|
||||
*/
|
||||
bool needs_temp = dst_src_regions_overlap(inst);
|
||||
for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
|
||||
unsigned channel_offset = lowered_width * n;
|
||||
|
||||
unsigned size_written = lowered_width * type_sz(inst->dst.type);
|
||||
|
||||
/* Create the split instruction from the original so that we copy all
|
||||
* relevant instruction fields, then set the width and calculate the
|
||||
* new dst/src regions.
|
||||
*/
|
||||
vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
|
||||
linst->exec_size = lowered_width;
|
||||
linst->group = channel_offset;
|
||||
linst->size_written = size_written;
|
||||
|
||||
/* Compute split dst region */
|
||||
dst_reg dst;
|
||||
if (needs_temp) {
|
||||
unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
|
||||
dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
|
||||
inst->dst.type);
|
||||
if (inst->is_align1_partial_write()) {
|
||||
vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
|
||||
copy->exec_size = lowered_width;
|
||||
copy->group = channel_offset;
|
||||
copy->size_written = size_written;
|
||||
inst->insert_before(block, copy);
|
||||
}
|
||||
} else {
|
||||
dst = horiz_offset(inst->dst, channel_offset);
|
||||
}
|
||||
linst->dst = dst;
|
||||
|
||||
/* Compute split source regions */
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (linst->src[i].file == BAD_FILE)
|
||||
continue;
|
||||
|
||||
if (!is_uniform(linst->src[i]))
|
||||
linst->src[i] = horiz_offset(linst->src[i], channel_offset);
|
||||
}
|
||||
|
||||
inst->insert_before(block, linst);
|
||||
|
||||
/* If we used a temporary to store the result of the split
|
||||
* instruction, copy the result to the original destination
|
||||
*/
|
||||
if (needs_temp) {
|
||||
vec4_instruction *mov =
|
||||
MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
|
||||
mov->exec_size = lowered_width;
|
||||
mov->group = channel_offset;
|
||||
mov->size_written = size_written;
|
||||
mov->predicate = inst->predicate;
|
||||
inst->insert_before(block, mov);
|
||||
}
|
||||
}
|
||||
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
invalidate_live_intervals();
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::run()
|
||||
{
|
||||
|
|
@ -2068,6 +2222,11 @@ vec4_visitor::run()
|
|||
OPT(dead_code_eliminate);
|
||||
}
|
||||
|
||||
if (OPT(lower_simd_width)) {
|
||||
OPT(opt_copy_propagation);
|
||||
OPT(dead_code_eliminate);
|
||||
}
|
||||
|
||||
if (failed)
|
||||
return false;
|
||||
|
||||
|
|
|
|||
|
|
@ -161,6 +161,8 @@ public:
|
|||
void opt_schedule_instructions();
|
||||
void convert_to_hw_regs();
|
||||
|
||||
bool lower_simd_width();
|
||||
|
||||
vec4_instruction *emit(vec4_instruction *inst);
|
||||
|
||||
vec4_instruction *emit(enum opcode opcode);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue