intel/brw: Always have a (non-DO) block after a DO in the CFG

Make the "block after DO" more stable so that adding instructions after a DO doesn't require repairing the CFG. Use a new SHADER_OPCODE_FLOW instruction that is a placeholder representing "go to the next block" and disappears at code generation. For some context, there are a few facts about how CFG currently works - Blocks are assumed to not be empty; - DO is always by itself in a block, i.e. starts and ends a block; - There are no empty blocks; - Predicated WHILE and CONTINUE will link to the "block after DO"; - When nesting loops, it is possible that the "block after DO" is another "DO". Reasons and further explanations for those are in the brw_cfg.c comments. What makes this new change useful is that a pass might want to add instructions between two DO instructions. When that happens, a new block must be created and any predicated WHILE and CONTINUE must be repaired. So, instead of requiring a repair (which has proven to be tricky in the past), this change adds a block that can be "virtually" empty but allow instructions to be added without further changes. One alternative design would be allowing empty blocks, that would be a deeper change since the blocks are currently assumed to be not empty in various places. We'll save that for when other changes are made to the CFG. The problem described happens in brw_opt_combine_constants, and a different patch will clean that up. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33536>
2025-12-21 07:10:09 +01:00 · 2025-02-13 18:38:00 -08:00 · 2025-02-13 18:38:00 -08:00 · d2c39b1779
commit d2c39b1779
parent d32a5ab0e4
11 changed files with 68 additions and 8 deletions
--- a/src/intel/compiler/brw_analysis_performance.cpp
+++ b/src/intel/compiler/brw_analysis_performance.cpp
@ -713,6 +713,7 @@ namespace {
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
+      case SHADER_OPCODE_FLOW:
         return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

--- a/src/intel/compiler/brw_builder.h
+++ b/src/intel/compiler/brw_builder.h
@ -847,12 +847,22 @@ public:
   brw_shader *shader;

   brw_inst *BREAK()    const { return emit(BRW_OPCODE_BREAK); }
-   brw_inst *DO()       const { return emit(BRW_OPCODE_DO); }
   brw_inst *ENDIF()    const { return emit(BRW_OPCODE_ENDIF); }
   brw_inst *NOP()      const { return emit(BRW_OPCODE_NOP); }
   brw_inst *WHILE()    const { return emit(BRW_OPCODE_WHILE); }
   brw_inst *CONTINUE() const { return emit(BRW_OPCODE_CONTINUE); }

+   void DO() const {
+      emit(BRW_OPCODE_DO);
+      /* Ensure that there'll always be a block after DO to add
+       * instructions and serve as sucessor for predicated WHILE
+       * and CONTINUE.
+       *
+       * See more details in brw_cfg::validate().
+       */
+      emit(SHADER_OPCODE_FLOW);
+   }
+
   bool has_writemask_all() const {
      return force_writemask_all;
   }
--- a/src/intel/compiler/brw_cfg.cpp
+++ b/src/intel/compiler/brw_cfg.cpp
@ -215,6 +215,13 @@ cfg_t::cfg_t(const brw_shader *s, exec_list *instructions) :
      inst->exec_node::remove();

      switch (inst->opcode) {
+      case SHADER_OPCODE_FLOW:
+         cur->instructions.push_tail(inst);
+         next = new_block();
+         cur->add_successor(mem_ctx, next, bblock_link_logical);
+         set_next_block(&cur, next, ip);
+         break;
+
      case BRW_OPCODE_IF:
         cur->instructions.push_tail(inst);

@ -736,6 +743,8 @@ cfg_t::validate(const char *stage_abbrev)
         }
      }

+      cfgv_assert(!block->instructions.is_empty());
+
      brw_inst *first_inst = block->start();
      if (first_inst->opcode == BRW_OPCODE_DO) {
         /* DO instructions both begin and end a block, so the DO instruction
@ -763,6 +772,24 @@ cfg_t::validate(const char *stage_abbrev)

         cfgv_assert(logical_block != nullptr);
         cfgv_assert(physical_block != nullptr);
+
+         /* A flow block (block ending with SHADER_OPCODE_FLOW) is
+          * used to ensure that the block right after DO is always
+          * present even if it doesn't have actual instructions.
+          *
+          * This way predicated WHILE and CONTINUE don't need to be
+          * repaired when adding instructions right after the DO.
+          * They will point to the flow block whether is empty or not.
+          */
+         cfgv_assert(logical_block->end()->opcode == SHADER_OPCODE_FLOW);
+      }
+
+      brw_inst *last_inst = block->end();
+      if (last_inst->opcode == SHADER_OPCODE_FLOW) {
+         /* A flow block only has one successor -- the instruction disappears
+          * when generating code.
+          */
+         cfgv_assert(block->children.length() == 1);
      }
   }
 }
--- a/src/intel/compiler/brw_cfg.h
+++ b/src/intel/compiler/brw_cfg.h
@ -214,7 +214,8 @@ bblock_ends_with_control_flow(const struct bblock_t *block)
          op == BRW_OPCODE_ELSE ||
          op == BRW_OPCODE_WHILE ||
          op == BRW_OPCODE_BREAK ||
-          op == BRW_OPCODE_CONTINUE;
+          op == BRW_OPCODE_CONTINUE ||
+          op == SHADER_OPCODE_FLOW;
 }

 static inline brw_inst *
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@ -553,6 +553,9 @@ enum opcode {
   SHADER_OPCODE_MEMORY_LOAD_LOGICAL,
   SHADER_OPCODE_MEMORY_STORE_LOGICAL,
   SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL,
+
+   /* Ends a block moving to the next one.  See brw_cfg for details. */
+   SHADER_OPCODE_FLOW,
 };

 enum fb_write_logical_srcs {
--- a/src/intel/compiler/brw_from_nir.cpp
+++ b/src/intel/compiler/brw_from_nir.cpp
@ -471,6 +471,8 @@ brw_from_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
   assert(!nir_loop_has_continue_construct(loop));
   bld.DO();

+   bld.emit(SHADER_OPCODE_FLOW);
+
   brw_from_nir_emit_cf_list(ntb, &loop->body);

   brw_inst *peep_while = bld.emit(BRW_OPCODE_WHILE);
--- a/src/intel/compiler/brw_generator.cpp
+++ b/src/intel/compiler/brw_generator.cpp
@ -1083,6 +1083,10 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 	 brw_DO(p, brw_get_default_exec_size(p));
 	 break;

+      case SHADER_OPCODE_FLOW:
+         /* Do nothing. */
+         break;
+
      case BRW_OPCODE_BREAK:
 	 brw_BREAK(p);
 	 break;
--- a/src/intel/compiler/brw_inst.cpp
+++ b/src/intel/compiler/brw_inst.cpp
@ -808,6 +808,7 @@ brw_inst::is_control_flow_end() const
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_ENDIF:
+   case SHADER_OPCODE_FLOW:
      return true;
   default:
      return false;
@ -825,6 +826,7 @@ brw_inst::is_control_flow() const
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
+   case SHADER_OPCODE_FLOW:
      return true;
   default:
      return false;
--- a/src/intel/compiler/brw_lower_scoreboard.cpp
+++ b/src/intel/compiler/brw_lower_scoreboard.cpp
@ -185,6 +185,7 @@ namespace {
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
+      case SHADER_OPCODE_FLOW:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
--- a/src/intel/compiler/brw_print.cpp
+++ b/src/intel/compiler/brw_print.cpp
@ -294,6 +294,9 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
      return "read_from_live_channel";
   case SHADER_OPCODE_READ_FROM_CHANNEL:
      return "read_from_channel";
+
+   case SHADER_OPCODE_FLOW:
+      return "flow";
   }

   unreachable("not reached");
--- a/src/intel/compiler/test_lower_scoreboard.cpp
+++ b/src/intel/compiler/test_lower_scoreboard.cpp
@ -515,12 +515,14 @@ TEST_F(scoreboard_test, loop1)
   brw_calculate_cfg(*v);
   lower_scoreboard(v);

-   bblock_t *body = v->cfg->blocks[2];
+   const int num_blocks = v->cfg->num_blocks;
+
+   bblock_t *body = v->cfg->blocks[num_blocks - 2];
   brw_inst *add = instruction(body, 0);
   EXPECT_EQ(add->opcode, BRW_OPCODE_ADD);
   EXPECT_EQ(add->sched, regdist(TGL_PIPE_FLOAT, 1));

-   bblock_t *last_block = v->cfg->blocks[3];
+   bblock_t *last_block = v->cfg->blocks[num_blocks - 1];
   brw_inst *mul = instruction(last_block, 0);
   EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL);
   EXPECT_EQ(mul->sched, regdist(TGL_PIPE_FLOAT, 1));
@ -550,12 +552,14 @@ TEST_F(scoreboard_test, loop2)

   /* Now the write in ADD has the tightest RegDist for both ADD and MUL. */

-   bblock_t *body = v->cfg->blocks[2];
+   const int num_blocks = v->cfg->num_blocks;
+
+   bblock_t *body = v->cfg->blocks[num_blocks - 2];
   brw_inst *add = instruction(body, 0);
   EXPECT_EQ(add->opcode, BRW_OPCODE_ADD);
   EXPECT_EQ(add->sched, regdist(TGL_PIPE_FLOAT, 2));

-   bblock_t *last_block = v->cfg->blocks[3];
+   bblock_t *last_block = v->cfg->blocks[num_blocks - 1];
   brw_inst *mul = instruction(last_block, 0);
   EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL);
   EXPECT_EQ(mul->sched, regdist(TGL_PIPE_FLOAT, 2));
@ -586,12 +590,14 @@ TEST_F(scoreboard_test, loop3)
   brw_calculate_cfg(*v);
   lower_scoreboard(v);

-   bblock_t *body = v->cfg->blocks[2];
+   const int num_blocks = v->cfg->num_blocks;
+
+   bblock_t *body = v->cfg->blocks[num_blocks - 2];
   brw_inst *add = instruction(body, 4);
   EXPECT_EQ(add->opcode, BRW_OPCODE_ADD);
   EXPECT_EQ(add->sched, regdist(TGL_PIPE_FLOAT, 5));

-   bblock_t *last_block = v->cfg->blocks[3];
+   bblock_t *last_block = v->cfg->blocks[num_blocks - 1];
   brw_inst *mul = instruction(last_block, 0);
   EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL);
   EXPECT_EQ(mul->sched, regdist(TGL_PIPE_FLOAT, 1));