diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index add199388c1..2735a430ec3 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1116,9 +1116,13 @@ fs_inst::flags_read(const intel_device_info *devinfo) const
 }
 
 unsigned
-fs_inst::flags_written() const
+fs_inst::flags_written(const intel_device_info *devinfo) const
 {
-   if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
+   /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
+    * using separate cmpn and sel instructions. This lowering occurs in
+    * fs_visitor::lower_minmax which is called very, very late.
+    */
+   if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
                             opcode != BRW_OPCODE_CSEL &&
                             opcode != BRW_OPCODE_IF &&
                             opcode != BRW_OPCODE_WHILE)) ||
@@ -7609,7 +7613,7 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
             (inst->components_read(i) == 1 &&
              lbld.dispatch_width() <= inst->exec_size)) ||
-          (inst->flags_written() &
+          (inst->flags_written(lbld.shader->devinfo) &
            flag_mask(inst->src[i], type_sz(inst->src[i].type)));
 }
 
@@ -8730,7 +8734,7 @@ fs_visitor::fixup_nomask_control_flow()
       foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
          if (!inst->predicate && inst->exec_size >= 8)
-            flag_liveout &= ~inst->flags_written();
+            flag_liveout &= ~inst->flags_written(devinfo);
 
          switch (inst->opcode) {
          case BRW_OPCODE_DO:
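For readers unfamiliar with the Gfx4/Gfx5 path referenced in the new comment: fs_visitor::lower_minmax eventually turns a min/max SEL into a compare-and-select pair, so the flag register really is clobbered on those platforms. A schematic sketch of the lowered form, not part of the patch (exact operands and predication depend on the instruction being lowered):

   cmpn.ge.f0(8)  null  src0  src1    // writes f0 from src0 compared with src1
   (+f0) sel(8)   dst   src0  src1    // predicated select, no conditional mod

This is why flags_written() has to report a flag write for sel.cond when devinfo->ver <= 5, even though the SEL has not been lowered yet at the point the optimization passes run.

diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 435186c1de1..770e417e47f 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -55,7 +55,7 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
                           fs_inst *inst)
 {
    bool read_flag = false;
-   const unsigned flags_written = inst->flags_written();
+   const unsigned flags_written = inst->flags_written(devinfo);
 
    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
       if (scan_inst->opcode == BRW_OPCODE_ADD &&
@@ -89,8 +89,8 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
           * Perhaps (scan_inst->flags_written() & flags_written) !=
          * flags_written?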
          */
-         if (scan_inst->flags_written() != 0 &&
-             scan_inst->flags_written() != flags_written)
+         if (scan_inst->flags_written(devinfo) != 0 &&
+             scan_inst->flags_written(devinfo) != flags_written)
            goto not_match;
 
         /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
@@ -142,7 +142,7 @@ cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
       }
 
    not_match:
-      if ((scan_inst->flags_written() & flags_written) != 0)
+      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
         break;
 
       read_flag = read_flag ||
@@ -171,7 +171,7 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
 {
    const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
    bool read_flag = false;
-   const unsigned flags_written = inst->flags_written();
+   const unsigned flags_written = inst->flags_written(devinfo);
 
    if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
       return false;
@@ -195,8 +195,8 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
          * Perhaps (scan_inst->flags_written() & flags_written) !=
          * flags_written?
          */
-        if (scan_inst->flags_written() != 0 &&
-            scan_inst->flags_written() != flags_written)
+        if (scan_inst->flags_written(devinfo) != 0 &&
+            scan_inst->flags_written(devinfo) != flags_written)
           break;
 
        if (scan_inst->can_do_cmod() &&
@@ -209,7 +209,7 @@ cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
          break;
       }
 
-      if ((scan_inst->flags_written() & flags_written) != 0)
+      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
         break;
 
       read_flag = read_flag ||
@@ -285,7 +285,7 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
      }
 
      bool read_flag = false;
-     const unsigned flags_written = inst->flags_written();
+     const unsigned flags_written = inst->flags_written(devinfo);
 
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
@@ -296,8 +296,8 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
            * Perhaps (scan_inst->flags_written() & flags_written) !=
            * flags_written?
            */
-          if (scan_inst->flags_written() != 0 &&
-              scan_inst->flags_written() != flags_written)
+          if (scan_inst->flags_written(devinfo) != 0 &&
+              scan_inst->flags_written(devinfo) != flags_written)
             break;
 
          if (scan_inst->is_partial_write() ||
@@ -396,7 +396,7 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
             * between scan_inst and inst.
             */
            if (!inst->src[0].negate &&
-               scan_inst->flags_written()) {
+               scan_inst->flags_written(devinfo)) {
              if (scan_inst->opcode == BRW_OPCODE_CMP) {
                 if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
                     (inst->conditional_mod == BRW_CONDITIONAL_G &&
@@ -408,8 +408,18 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
                    break;
                 }
              } else if (scan_inst->conditional_mod == inst->conditional_mod) {
-                inst->remove(block, true);
-                progress = true;
+                /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
+                 * flags value is not based on the result stored in the
+                 * destination. On all other platforms sel.cond will not
+                 * write the flags, so execution will not get to this point.
+                 */
+                if (scan_inst->opcode == BRW_OPCODE_SEL) {
+                   assert(devinfo->ver <= 5);
+                } else {
+                   inst->remove(block, true);
+                   progress = true;
+                }
+
                 break;
              } else if (!read_flag) {
                 scan_inst->conditional_mod = inst->conditional_mod;
@@ -505,7 +515,7 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
         break;
      }
 
-     if ((scan_inst->flags_written() & flags_written) != 0)
+     if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
        break;
 
      read_flag = read_flag ||
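The practical effect of reporting a flag write for sel.cond on Gfx4/Gfx5 is that the backward scans above now stop at an intervening SEL. A sketch of the situation this guards against (assumed Gfx5; the IR listing style matches the unit tests added at the end of the patch):

   0: add(8)        dest1  src0   src1
   1: sel.ge(8)     dest2  src2   src3     // lowers to cmpn+sel later, clobbering f0
   2: cmp.ge.f0(8)  null   dest1  0.0f

Folding the cmp at 2 into the add at 0 and deleting it would leave f0 holding the sel's cmpn result at the end of the block instead of dest1 compared with zero, so the scan has to break at instruction 1.

diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
index 66c914bf93f..495f1f95fdd 100644
--- a/src/intel/compiler/brw_fs_cse.cpp
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -332,10 +332,10 @@ fs_visitor::opt_cse_local(const fs_live_variables &live, bblock_t *block, int &i
       /* Kill all AEB entries that write a different value to or read from
        * the flag register if we just wrote it.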
        */
-      if (inst->flags_written()) {
+      if (inst->flags_written(devinfo)) {
          bool negate; /* dummy */
         if (entry->generator->flags_read(devinfo) ||
-            (entry->generator->flags_written() &&
+            (entry->generator->flags_written(devinfo) &&
             !instructions_match(inst, entry->generator, &negate))) {
            entry->remove();
            ralloc_free(entry);
diff --git a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
index 21f6f903df0..ed7ab3ebdc9 100644
--- a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
+++ b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
@@ -40,11 +40,12 @@ using namespace brw;
  * Is it safe to eliminate the instruction?
  */
 static bool
-can_eliminate(const fs_inst *inst, BITSET_WORD *flag_live)
+can_eliminate(const intel_device_info *devinfo, const fs_inst *inst,
+              BITSET_WORD *flag_live)
 {
    return !inst->is_control_flow() &&
           !inst->has_side_effects() &&
-          !(flag_live[0] & inst->flags_written()) &&
+          !(flag_live[0] & inst->flags_written(devinfo)) &&
           !inst->writes_accumulator;
 }
 
@@ -96,14 +97,14 @@ fs_visitor::dead_code_eliminate()
             result_live |= BITSET_TEST(live, var + i);
 
          if (!result_live &&
-             (can_omit_write(inst) || can_eliminate(inst, flag_live))) {
+             (can_omit_write(inst) || can_eliminate(devinfo, inst, flag_live))) {
            inst->dst = fs_reg(spread(retype(brw_null_reg(), inst->dst.type),
                                      inst->dst.stride));
            progress = true;
         }
      }
 
-      if (inst->dst.is_null() && can_eliminate(inst, flag_live)) {
+      if (inst->dst.is_null() && can_eliminate(devinfo, inst, flag_live)) {
         inst->opcode = BRW_OPCODE_NOP;
         progress = true;
      }
@@ -118,7 +119,7 @@ fs_visitor::dead_code_eliminate()
      }
 
      if (!inst->predicate && inst->exec_size >= 8)
-        flag_live[0] &= ~inst->flags_written();
+        flag_live[0] &= ~inst->flags_written(devinfo);
 
      if (inst->opcode == BRW_OPCODE_NOP) {
         inst->remove(block, true);
diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
index 3a35cac3b76..5ad5b95976a 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -138,7 +138,7 @@ fs_live_variables::setup_def_use()
         }
 
         if (!inst->predicate && inst->exec_size >= 8)
-           bd->flag_def[0] |= inst->flags_written() & ~bd->flag_use[0];
+           bd->flag_def[0] |= inst->flags_written(devinfo) & ~bd->flag_use[0];
 
         ip++;
      }
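Dead code elimination, live variables, and the NoMask fixup in brw_fs.cpp all share the same flag-liveness bookkeeping, which is why each of them needs the new devinfo argument. A condensed sketch of that pattern, not literal Mesa code (the seed value and the surrounding pass structure are assumptions):

   /* Walking a block in reverse: a flag byte stops being live once an
    * unpredicated, full-width instruction overwrites it, and becomes live
    * again when an instruction reads it. */
   BITSET_WORD flag_live = block_flag_liveout;   /* hypothetical seed value */
   foreach_inst_in_block_reverse(fs_inst, inst, block) {
      if (!inst->predicate && inst->exec_size >= 8)
         flag_live &= ~inst->flags_written(devinfo);   /* definitely overwritten */

      flag_live |= inst->flags_read(devinfo);          /* needed by this read */
   }

diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index 5e0f8664d45..c9ce2a814db 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -355,7 +355,7 @@ namespace {
          if (!has_inconsistent_cmod(inst))
             inst->conditional_mod = BRW_CONDITIONAL_NONE;
 
-         assert(!inst->flags_written() || !mov->predicate);
+         assert(!inst->flags_written(v->devinfo) || !mov->predicate);
          return true;
       }
diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp
index 6de5211f56d..540c2c8c21f 100644
--- a/src/intel/compiler/brw_fs_sel_peephole.cpp
+++ b/src/intel/compiler/brw_fs_sel_peephole.cpp
@@ -63,13 +63,14 @@ using namespace brw;
  * returns 3.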
  */
 static int
-count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
+count_movs_from_if(const intel_device_info *devinfo,
+                   fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
                    bblock_t *then_block, bblock_t *else_block)
 {
    int then_movs = 0;
    foreach_inst_in_block(fs_inst, inst, then_block) {
       if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
-          inst->flags_written())
+          inst->flags_written(devinfo))
          break;
 
       then_mov[then_movs] = inst;
@@ -79,7 +80,7 @@ count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
    int else_movs = 0;
    foreach_inst_in_block(fs_inst, inst, else_block) {
       if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
-          inst->flags_written())
+          inst->flags_written(devinfo))
          break;
 
       else_mov[else_movs] = inst;
@@ -152,7 +153,7 @@ fs_visitor::opt_peephole_sel()
       if (else_block == NULL)
          continue;
 
-      int movs = count_movs_from_if(then_mov, else_mov, then_block, else_block);
+      int movs = count_movs_from_if(devinfo, then_mov, else_mov, then_block, else_block);
       if (movs == 0)
          continue;
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 214b0763d5e..a61e66303b6 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -378,7 +378,7 @@ public:
    * Return the subset of flag registers updated by the instruction (either
    * partially or fully) as a bitset with byte granularity.
    */
-   unsigned flags_written() const;
+   unsigned flags_written(const intel_device_info *devinfo) const;
 
    fs_reg dst;
    fs_reg *src;
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp
index edec07a909a..e6caebb5a26 100644
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -1375,7 +1375,7 @@ namespace {
                                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
       }
 
-      if (const unsigned mask = inst->flags_written()) {
+      if (const unsigned mask = inst->flags_written(devinfo)) {
          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
             if (mask & (1 << i))
                stall_on_dependency(st, flag_dependency_id(i));
@@ -1425,7 +1425,7 @@ namespace {
                                    reg_dependency_id(devinfo, brw_acc_reg(8), j));
       }
 
-      if (const unsigned mask = inst->flags_written()) {
+      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               mark_write_dependency(st, perf, flag_dependency_id(i));
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index e19a9e5416d..7c6d459e00d 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -1262,7 +1262,7 @@ fs_instruction_scheduler::calculate_deps()
          }
       }
 
-      if (const unsigned mask = inst->flags_written()) {
+      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
 
         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
@@ -1384,7 +1384,7 @@ fs_instruction_scheduler::calculate_deps()
          }
       }
 
-      if (const unsigned mask = inst->flags_written()) {
+      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
 
         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
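The scheduler and the performance model consume the return value the same way: each set bit of the mask is one byte of the architectural flag registers that the instruction may update, and a dependency gets recorded per byte. A minimal sketch of that consumption pattern, mirroring the loops above (note_flag_byte_write() is a placeholder, not a real helper; CHAR_BIT comes from <limits.h>):

   const unsigned mask = inst->flags_written(devinfo);
   for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
      if (mask & (1u << i))
         note_flag_byte_write(i);   /* record a dependency on flag byte i */
   }
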
diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp
index ff4f9785fbb..e157a92b3c3 100644
--- a/src/intel/compiler/test_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/test_fs_cmod_propagation.cpp
@@ -2482,3 +2482,134 @@ TEST_F(cmod_propagation_test, cmp_to_add_float_le)
    EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
    EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod);
 }
+
+TEST_F(cmod_propagation_test, prop_across_sel_gfx7)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest1 = v->vgrf(glsl_type::float_type);
+   fs_reg dest2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::float_type);
+   fs_reg src3 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest1, src0, src1);
+   bld.emit_minmax(dest2, src2, src3, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), dest1, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest1  src0   src1
+    * 1: sel.ge(8)     dest2  src2   src3
+    * 2: cmp.ge.f0(8)  null   dest1  0.0f
+    *
+    * = After =
+    * 0: add.ge.f0(8)  dest1  src0   src1
+    * 1: sel.ge(8)     dest2  src2   src3
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, prop_across_sel_gfx5)
+{
+   devinfo->ver = 5;
+   devinfo->verx10 = devinfo->ver * 10;
+
+   const fs_builder &bld = v->bld;
+   fs_reg dest1 = v->vgrf(glsl_type::float_type);
+   fs_reg dest2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::float_type);
+   fs_reg src3 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest1, src0, src1);
+   bld.emit_minmax(dest2, src2, src3, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), dest1, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest1  src0   src1
+    * 1: sel.ge(8)     dest2  src2   src3
+    * 2: cmp.ge.f0(8)  null   dest1  0.0f
+    *
+    * = After =
+    * (no changes)
+    *
+    * On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
+    * using separate cmpn and sel instructions. This lowering occurs in
+    * fs_visitor::lower_minmax which is called a long time after the first
+    * calls to cmod_propagation.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, prop_into_sel_gfx5)
+{
+   devinfo->ver = 5;
+   devinfo->verx10 = devinfo->ver * 10;
+
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.emit_minmax(dest, src0, src1, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: sel.ge(8)     dest  src0  src1
+    * 1: cmp.ge.f0(8)  null  dest  0.0f
+    *
+    * = After =
+    * (no changes)
+    *
+    * Do not cmod propagate into a sel.cond instruction. While it does modify
+    * the flags, the flags are not based on the result compared with zero (as
+    * with most other instructions). The flag result is based on the sources
+    * compared with each other (like cmp.cond).
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
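A worked example of why the flag value left by sel.ge cannot stand in for the cmp in the last test (not from the patch, just arithmetic): with src0 = -1.0 and src1 = -2.0,

   sel.ge(8)     dest  src0  src1    dest = -1.0; Gfx5 flag holds (-1.0 >= -2.0) -> true
   cmp.ge.f0(8)  null  dest  0.0f    flag holds   (-1.0 >=  0.0)                 -> false

so removing the cmp and reusing the sel's flag result would change the shader's behaviour, which is exactly what the new SEL guard in opt_cmod_propagation_local prevents.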