mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 17:30:12 +01:00
intel/brw: Reduce scope of has_source_and_destination_hazard
This predicate at the moment is only relevant during register allocation, so move it there and the code can ignore virtual instructions that were already lowered previously. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30496>
This commit is contained in:
parent
bf9456753d
commit
4361a08254
2 changed files with 115 additions and 115 deletions
|
|
@ -274,120 +274,6 @@ fs_inst::is_payload(unsigned arg) const
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this instruction's sources and destinations cannot
|
||||
* safely be the same register.
|
||||
*
|
||||
* In most cases, a register can be written over safely by the same
|
||||
* instruction that is its last use. For a single instruction, the
|
||||
* sources are dereferenced before writing of the destination starts
|
||||
* (naturally).
|
||||
*
|
||||
* However, there are a few cases where this can be problematic:
|
||||
*
|
||||
* - Virtual opcodes that translate to multiple instructions in the
|
||||
* code generator: if src == dst and one instruction writes the
|
||||
* destination before a later instruction reads the source, then
|
||||
* src will have been clobbered.
|
||||
*
|
||||
* - SIMD16 compressed instructions with certain regioning (see below).
|
||||
*
|
||||
* The register allocator uses this information to set up conflicts between
|
||||
* GRF sources and the destination.
|
||||
*/
|
||||
bool
|
||||
fs_inst::has_source_and_destination_hazard() const
|
||||
{
|
||||
switch (opcode) {
|
||||
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
||||
/* Multiple partial writes to the destination */
|
||||
return true;
|
||||
case SHADER_OPCODE_SHUFFLE:
|
||||
/* This instruction returns an arbitrary channel from the source and
|
||||
* gets split into smaller instructions in the generator. It's possible
|
||||
* that one of the instructions will read from a channel corresponding
|
||||
* to an earlier instruction.
|
||||
*/
|
||||
case SHADER_OPCODE_SEL_EXEC:
|
||||
/* This is implemented as
|
||||
*
|
||||
* mov(16) g4<1>D 0D { align1 WE_all 1H };
|
||||
* mov(16) g4<1>D g5<8,8,1>D { align1 1H }
|
||||
*
|
||||
* Because the source is only read in the second instruction, the first
|
||||
* may stomp all over it.
|
||||
*/
|
||||
return true;
|
||||
case SHADER_OPCODE_QUAD_SWIZZLE:
|
||||
switch (src[1].ud) {
|
||||
case BRW_SWIZZLE_XXXX:
|
||||
case BRW_SWIZZLE_YYYY:
|
||||
case BRW_SWIZZLE_ZZZZ:
|
||||
case BRW_SWIZZLE_WWWW:
|
||||
case BRW_SWIZZLE_XXZZ:
|
||||
case BRW_SWIZZLE_YYWW:
|
||||
case BRW_SWIZZLE_XYXY:
|
||||
case BRW_SWIZZLE_ZWZW:
|
||||
/* These can be implemented as a single Align1 region on all
|
||||
* platforms, so there's never a hazard between source and
|
||||
* destination. C.f. fs_generator::generate_quad_swizzle().
|
||||
*/
|
||||
return false;
|
||||
default:
|
||||
return !is_uniform(src[0]);
|
||||
}
|
||||
case BRW_OPCODE_DPAS:
|
||||
/* This is overly conservative. The actual hazard is more complicated to
|
||||
* describe. When the repeat count is N, the single instruction behaves
|
||||
* like N instructions with a repeat count of one, but the destination
|
||||
* and source registers are incremented (in somewhat complex ways) for
|
||||
* each instruction.
|
||||
*
|
||||
* This means the source and destination register is actually a range of
|
||||
* registers. The hazard exists of an earlier iteration would write a
|
||||
* register that should be read by a later iteration.
|
||||
*
|
||||
* There may be some advantage to properly modeling this, but for now,
|
||||
* be overly conservative.
|
||||
*/
|
||||
return rcount > 1;
|
||||
default:
|
||||
/* The SIMD16 compressed instruction
|
||||
*
|
||||
* add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
|
||||
*
|
||||
* is actually decoded in hardware as:
|
||||
*
|
||||
* add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
|
||||
* add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
|
||||
*
|
||||
* Which is safe. However, if we have uniform accesses
|
||||
* happening, we get into trouble:
|
||||
*
|
||||
* add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
|
||||
* add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
|
||||
*
|
||||
* Now our destination for the first instruction overwrote the
|
||||
* second instruction's src0, and we get garbage for those 8
|
||||
* pixels. There's a similar issue for the pre-gfx6
|
||||
* pixel_x/pixel_y, which are registers of 16-bit values and thus
|
||||
* would get stomped by the first decode as well.
|
||||
*/
|
||||
if (exec_size == 16) {
|
||||
for (int i = 0; i < sources; i++) {
|
||||
if (src[i].file == VGRF && (src[i].stride == 0 ||
|
||||
src[i].type == BRW_TYPE_UW ||
|
||||
src[i].type == BRW_TYPE_W ||
|
||||
src[i].type == BRW_TYPE_UB ||
|
||||
src[i].type == BRW_TYPE_B)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
|
||||
{
|
||||
|
|
|
|||
|
|
@ -405,13 +405,127 @@ fs_reg_alloc::setup_live_interference(unsigned node,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this instruction's sources and destinations cannot
|
||||
* safely be the same register.
|
||||
*
|
||||
* In most cases, a register can be written over safely by the same
|
||||
* instruction that is its last use. For a single instruction, the
|
||||
* sources are dereferenced before writing of the destination starts
|
||||
* (naturally).
|
||||
*
|
||||
* However, there are a few cases where this can be problematic:
|
||||
*
|
||||
* - Virtual opcodes that translate to multiple instructions in the
|
||||
* code generator: if src == dst and one instruction writes the
|
||||
* destination before a later instruction reads the source, then
|
||||
* src will have been clobbered.
|
||||
*
|
||||
* - SIMD16 compressed instructions with certain regioning (see below).
|
||||
*
|
||||
* The register allocator uses this information to set up conflicts between
|
||||
* GRF sources and the destination.
|
||||
*/
|
||||
static bool
|
||||
brw_inst_has_source_and_destination_hazard(const fs_inst *inst)
|
||||
{
|
||||
switch (inst->opcode) {
|
||||
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
||||
/* Multiple partial writes to the destination */
|
||||
return true;
|
||||
case SHADER_OPCODE_SHUFFLE:
|
||||
/* This instruction returns an arbitrary channel from the source and
|
||||
* gets split into smaller instructions in the generator. It's possible
|
||||
* that one of the instructions will read from a channel corresponding
|
||||
* to an earlier instruction.
|
||||
*/
|
||||
case SHADER_OPCODE_SEL_EXEC:
|
||||
/* This is implemented as
|
||||
*
|
||||
* mov(16) g4<1>D 0D { align1 WE_all 1H };
|
||||
* mov(16) g4<1>D g5<8,8,1>D { align1 1H }
|
||||
*
|
||||
* Because the source is only read in the second instruction, the first
|
||||
* may stomp all over it.
|
||||
*/
|
||||
return true;
|
||||
case SHADER_OPCODE_QUAD_SWIZZLE:
|
||||
switch (inst->src[1].ud) {
|
||||
case BRW_SWIZZLE_XXXX:
|
||||
case BRW_SWIZZLE_YYYY:
|
||||
case BRW_SWIZZLE_ZZZZ:
|
||||
case BRW_SWIZZLE_WWWW:
|
||||
case BRW_SWIZZLE_XXZZ:
|
||||
case BRW_SWIZZLE_YYWW:
|
||||
case BRW_SWIZZLE_XYXY:
|
||||
case BRW_SWIZZLE_ZWZW:
|
||||
/* These can be implemented as a single Align1 region on all
|
||||
* platforms, so there's never a hazard between source and
|
||||
* destination. C.f. fs_generator::generate_quad_swizzle().
|
||||
*/
|
||||
return false;
|
||||
default:
|
||||
return !is_uniform(inst->src[0]);
|
||||
}
|
||||
case BRW_OPCODE_DPAS:
|
||||
/* This is overly conservative. The actual hazard is more complicated to
|
||||
* describe. When the repeat count is N, the single instruction behaves
|
||||
* like N instructions with a repeat count of one, but the destination
|
||||
* and source registers are incremented (in somewhat complex ways) for
|
||||
* each instruction.
|
||||
*
|
||||
* This means the source and destination register is actually a range of
|
||||
* registers. The hazard exists of an earlier iteration would write a
|
||||
* register that should be read by a later iteration.
|
||||
*
|
||||
* There may be some advantage to properly modeling this, but for now,
|
||||
* be overly conservative.
|
||||
*/
|
||||
return inst->rcount > 1;
|
||||
default:
|
||||
/* The SIMD16 compressed instruction
|
||||
*
|
||||
* add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
|
||||
*
|
||||
* is actually decoded in hardware as:
|
||||
*
|
||||
* add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
|
||||
* add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
|
||||
*
|
||||
* Which is safe. However, if we have uniform accesses
|
||||
* happening, we get into trouble:
|
||||
*
|
||||
* add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
|
||||
* add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
|
||||
*
|
||||
* Now our destination for the first instruction overwrote the
|
||||
* second instruction's src0, and we get garbage for those 8
|
||||
* pixels. There's a similar issue for the pre-gfx6
|
||||
* pixel_x/pixel_y, which are registers of 16-bit values and thus
|
||||
* would get stomped by the first decode as well.
|
||||
*/
|
||||
if (inst->exec_size == 16) {
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == VGRF && (inst->src[i].stride == 0 ||
|
||||
inst->src[i].type == BRW_TYPE_UW ||
|
||||
inst->src[i].type == BRW_TYPE_W ||
|
||||
inst->src[i].type == BRW_TYPE_UB ||
|
||||
inst->src[i].type == BRW_TYPE_B)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
|
||||
{
|
||||
/* Certain instructions can't safely use the same register for their
|
||||
* sources and destination. Add interference.
|
||||
*/
|
||||
if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
|
||||
if (inst->dst.file == VGRF && brw_inst_has_source_and_destination_hazard(inst)) {
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue