diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 62c2c1f5e5f..5c0cc0bbb2b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -274,120 +274,6 @@ fs_inst::is_payload(unsigned arg) const } } -/** - * Returns true if this instruction's sources and destinations cannot - * safely be the same register. - * - * In most cases, a register can be written over safely by the same - * instruction that is its last use. For a single instruction, the - * sources are dereferenced before writing of the destination starts - * (naturally). - * - * However, there are a few cases where this can be problematic: - * - * - Virtual opcodes that translate to multiple instructions in the - * code generator: if src == dst and one instruction writes the - * destination before a later instruction reads the source, then - * src will have been clobbered. - * - * - SIMD16 compressed instructions with certain regioning (see below). - * - * The register allocator uses this information to set up conflicts between - * GRF sources and the destination. - */ -bool -fs_inst::has_source_and_destination_hazard() const -{ - switch (opcode) { - case FS_OPCODE_PACK_HALF_2x16_SPLIT: - /* Multiple partial writes to the destination */ - return true; - case SHADER_OPCODE_SHUFFLE: - /* This instruction returns an arbitrary channel from the source and - * gets split into smaller instructions in the generator. It's possible - * that one of the instructions will read from a channel corresponding - * to an earlier instruction. - */ - case SHADER_OPCODE_SEL_EXEC: - /* This is implemented as - * - * mov(16) g4<1>D 0D { align1 WE_all 1H }; - * mov(16) g4<1>D g5<8,8,1>D { align1 1H } - * - * Because the source is only read in the second instruction, the first - * may stomp all over it. 
- */ - return true; - case SHADER_OPCODE_QUAD_SWIZZLE: - switch (src[1].ud) { - case BRW_SWIZZLE_XXXX: - case BRW_SWIZZLE_YYYY: - case BRW_SWIZZLE_ZZZZ: - case BRW_SWIZZLE_WWWW: - case BRW_SWIZZLE_XXZZ: - case BRW_SWIZZLE_YYWW: - case BRW_SWIZZLE_XYXY: - case BRW_SWIZZLE_ZWZW: - /* These can be implemented as a single Align1 region on all - * platforms, so there's never a hazard between source and - * destination. C.f. fs_generator::generate_quad_swizzle(). - */ - return false; - default: - return !is_uniform(src[0]); - } - case BRW_OPCODE_DPAS: - /* This is overly conservative. The actual hazard is more complicated to - * describe. When the repeat count is N, the single instruction behaves - * like N instructions with a repeat count of one, but the destination - * and source registers are incremented (in somewhat complex ways) for - * each instruction. - * - * This means the source and destination register is actually a range of - * registers. The hazard exists of an earlier iteration would write a - * register that should be read by a later iteration. - * - * There may be some advantage to properly modeling this, but for now, - * be overly conservative. - */ - return rcount > 1; - default: - /* The SIMD16 compressed instruction - * - * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F - * - * is actually decoded in hardware as: - * - * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F - * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F - * - * Which is safe. However, if we have uniform accesses - * happening, we get into trouble: - * - * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F - * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F - * - * Now our destination for the first instruction overwrote the - * second instruction's src0, and we get garbage for those 8 - * pixels. There's a similar issue for the pre-gfx6 - * pixel_x/pixel_y, which are registers of 16-bit values and thus - * would get stomped by the first decode as well. 
- */ - if (exec_size == 16) { - for (int i = 0; i < sources; i++) { - if (src[i].file == VGRF && (src[i].stride == 0 || - src[i].type == BRW_TYPE_UW || - src[i].type == BRW_TYPE_W || - src[i].type == BRW_TYPE_UB || - src[i].type == BRW_TYPE_B)) { - return true; - } - } - } - return false; - } -} - bool fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const { diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index ff62f74753f..13d4409605e 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -405,13 +405,127 @@ fs_reg_alloc::setup_live_interference(unsigned node, } } +/** + * Returns true if this instruction's sources and destinations cannot + * safely be the same register. + * + * In most cases, a register can be written over safely by the same + * instruction that is its last use. For a single instruction, the + * sources are dereferenced before writing of the destination starts + * (naturally). + * + * However, there are a few cases where this can be problematic: + * + * - Virtual opcodes that translate to multiple instructions in the + * code generator: if src == dst and one instruction writes the + * destination before a later instruction reads the source, then + * src will have been clobbered. + * + * - SIMD16 compressed instructions with certain regioning (see below). + * + * The register allocator uses this information to set up conflicts between + * GRF sources and the destination. + */ +static bool +brw_inst_has_source_and_destination_hazard(const fs_inst *inst) +{ + switch (inst->opcode) { + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + /* Multiple partial writes to the destination */ + return true; + case SHADER_OPCODE_SHUFFLE: + /* This instruction returns an arbitrary channel from the source and + * gets split into smaller instructions in the generator. 
It's possible + * that one of the instructions will read from a channel corresponding + * to an earlier instruction. + */ + case SHADER_OPCODE_SEL_EXEC: + /* This is implemented as + * + * mov(16) g4<1>D 0D { align1 WE_all 1H }; + * mov(16) g4<1>D g5<8,8,1>D { align1 1H } + * + * Because the source is only read in the second instruction, the first + * may stomp all over it. + */ + return true; + case SHADER_OPCODE_QUAD_SWIZZLE: + switch (inst->src[1].ud) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + /* These can be implemented as a single Align1 region on all + * platforms, so there's never a hazard between source and + * destination. C.f. fs_generator::generate_quad_swizzle(). + */ + return false; + default: + return !is_uniform(inst->src[0]); + } + case BRW_OPCODE_DPAS: + /* This is overly conservative. The actual hazard is more complicated to + * describe. When the repeat count is N, the single instruction behaves + * like N instructions with a repeat count of one, but the destination + * and source registers are incremented (in somewhat complex ways) for + * each instruction. + * + * This means the source and destination register is actually a range of + * registers. The hazard exists if an earlier iteration would write a + * register that should be read by a later iteration. + * + * There may be some advantage to properly modeling this, but for now, + * be overly conservative. + */ + return inst->rcount > 1; + default: + /* The SIMD16 compressed instruction + * + * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F + * + * is actually decoded in hardware as: + * + * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F + * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F + * + * Which is safe. 
However, if we have uniform accesses + * happening, we get into trouble: + * + * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F + * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F + * + * Now our destination for the first instruction overwrote the + * second instruction's src0, and we get garbage for those 8 + * pixels. There's a similar issue for the pre-gfx6 + * pixel_x/pixel_y, which are registers of 16-bit values and thus + * would get stomped by the first decode as well. + */ + if (inst->exec_size == 16) { + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF && (inst->src[i].stride == 0 || + inst->src[i].type == BRW_TYPE_UW || + inst->src[i].type == BRW_TYPE_W || + inst->src[i].type == BRW_TYPE_UB || + inst->src[i].type == BRW_TYPE_B)) { + return true; + } + } + } + return false; + } +} + void fs_reg_alloc::setup_inst_interference(const fs_inst *inst) { /* Certain instructions can't safely use the same register for their * sources and destination. Add interference. */ - if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { + if (inst->dst.file == VGRF && brw_inst_has_source_and_destination_hazard(inst)) { for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,