From d2c39b1779330439e5b3bb5539c0e7655d3bee8c Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Thu, 13 Feb 2025 18:38:00 -0800
Subject: [PATCH] intel/brw: Always have a (non-DO) block after a DO in the CFG

Make the "block after DO" more stable so that adding instructions after
a DO doesn't require repairing the CFG.  Use a new SHADER_OPCODE_FLOW
instruction that is a placeholder representing "go to the next block"
and disappears at code generation.

For some context, there are a few facts about how CFG currently works

- Blocks are assumed to not be empty;
- DO is always by itself in a block, i.e. starts and ends a block;
- There are no empty blocks;
- Predicated WHILE and CONTINUE will link to the "block after DO";
- When nesting loops, it is possible that the "block after DO" is
  another "DO".

Reasons and further explanations for those are in the brw_cfg.c comments.

What makes this new change useful is that a pass might want to add
instructions between two DO instructions.  When that happens, a new
block must be created and any predicated WHILE and CONTINUE must be
repaired.

So, instead of requiring a repair (which has proven to be tricky in
the past), this change adds a block that can be "virtually" empty but
allow instructions to be added without further changes.

One alternative design would be allowing empty blocks, that would be
a deeper change since the blocks are currently assumed to be not empty
in various places.  We'll save that for when other changes are made to
the CFG.

The problem described happens in brw_opt_combine_constants, and a
different patch will clean that up.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33536>
---
 .../compiler/brw_analysis_performance.cpp     |  1 +
 src/intel/compiler/brw_builder.h              | 12 ++++++++-
 src/intel/compiler/brw_cfg.cpp                | 27 +++++++++++++++++++
 src/intel/compiler/brw_cfg.h                  |  3 ++-
 src/intel/compiler/brw_eu_defines.h           |  3 +++
 src/intel/compiler/brw_from_nir.cpp           |  2 ++
 src/intel/compiler/brw_generator.cpp          |  4 +++
 src/intel/compiler/brw_inst.cpp               |  2 ++
 src/intel/compiler/brw_lower_scoreboard.cpp   |  1 +
 src/intel/compiler/brw_print.cpp              |  3 +++
 src/intel/compiler/test_lower_scoreboard.cpp  | 18 ++++++++-----
 11 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_analysis_performance.cpp b/src/intel/compiler/brw_analysis_performance.cpp
index 0c3f993004c..d5b73b73519 100644
--- a/src/intel/compiler/brw_analysis_performance.cpp
+++ b/src/intel/compiler/brw_analysis_performance.cpp
@@ -713,6 +713,7 @@ namespace {
       case SHADER_OPCODE_UNDEF:
       case SHADER_OPCODE_HALT_TARGET:
       case FS_OPCODE_SCHEDULING_FENCE:
+      case SHADER_OPCODE_FLOW:
          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0);
 
diff --git a/src/intel/compiler/brw_builder.h b/src/intel/compiler/brw_builder.h
index 5417cfdad8e..37faa045743 100644
--- a/src/intel/compiler/brw_builder.h
+++ b/src/intel/compiler/brw_builder.h
@@ -847,12 +847,22 @@ public:
    brw_shader *shader;
 
    brw_inst *BREAK()    const { return emit(BRW_OPCODE_BREAK); }
-   brw_inst *DO()       const { return emit(BRW_OPCODE_DO); }
    brw_inst *ENDIF()    const { return emit(BRW_OPCODE_ENDIF); }
    brw_inst *NOP()      const { return emit(BRW_OPCODE_NOP); }
    brw_inst *WHILE()    const { return emit(BRW_OPCODE_WHILE); }
    brw_inst *CONTINUE() const { return emit(BRW_OPCODE_CONTINUE); }
 
+   void DO() const {
+      emit(BRW_OPCODE_DO);
+      /* Ensure that there'll always be a block after DO to add
+       * instructions and serve as sucessor for predicated WHILE
+       * and CONTINUE.
+       *
+       * See more details in brw_cfg::validate().
+       */
+      emit(SHADER_OPCODE_FLOW);
+   }
+
    bool has_writemask_all() const {
       return force_writemask_all;
    }
diff --git a/src/intel/compiler/brw_cfg.cpp b/src/intel/compiler/brw_cfg.cpp
index e88895edf48..e153e978096 100644
--- a/src/intel/compiler/brw_cfg.cpp
+++ b/src/intel/compiler/brw_cfg.cpp
@@ -215,6 +215,13 @@ cfg_t::cfg_t(const brw_shader *s, exec_list *instructions) :
       inst->exec_node::remove();
 
       switch (inst->opcode) {
+      case SHADER_OPCODE_FLOW:
+         cur->instructions.push_tail(inst);
+         next = new_block();
+         cur->add_successor(mem_ctx, next, bblock_link_logical);
+         set_next_block(&cur, next, ip);
+         break;
+
       case BRW_OPCODE_IF:
          cur->instructions.push_tail(inst);
 
@@ -736,6 +743,8 @@ cfg_t::validate(const char *stage_abbrev)
          }
       }
 
+      cfgv_assert(!block->instructions.is_empty());
+
       brw_inst *first_inst = block->start();
       if (first_inst->opcode == BRW_OPCODE_DO) {
          /* DO instructions both begin and end a block, so the DO instruction
@@ -763,6 +772,24 @@ cfg_t::validate(const char *stage_abbrev)
 
          cfgv_assert(logical_block != nullptr);
          cfgv_assert(physical_block != nullptr);
+
+         /* A flow block (block ending with SHADER_OPCODE_FLOW) is
+          * used to ensure that the block right after DO is always
+          * present even if it doesn't have actual instructions.
+          *
+          * This way predicated WHILE and CONTINUE don't need to be
+          * repaired when adding instructions right after the DO.
+          * They will point to the flow block whether is empty or not.
+          */
+         cfgv_assert(logical_block->end()->opcode == SHADER_OPCODE_FLOW);
+      }
+
+      brw_inst *last_inst = block->end();
+      if (last_inst->opcode == SHADER_OPCODE_FLOW) {
+         /* A flow block only has one successor -- the instruction disappears
+          * when generating code.
+          */
+         cfgv_assert(block->children.length() == 1);
       }
    }
 }
diff --git a/src/intel/compiler/brw_cfg.h b/src/intel/compiler/brw_cfg.h
index efed9f11564..2f1d9d7c015 100644
--- a/src/intel/compiler/brw_cfg.h
+++ b/src/intel/compiler/brw_cfg.h
@@ -214,7 +214,8 @@ bblock_ends_with_control_flow(const struct bblock_t *block)
           op == BRW_OPCODE_ELSE ||
           op == BRW_OPCODE_WHILE ||
           op == BRW_OPCODE_BREAK ||
-          op == BRW_OPCODE_CONTINUE;
+          op == BRW_OPCODE_CONTINUE ||
+          op == SHADER_OPCODE_FLOW;
 }
 
 static inline brw_inst *
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 9177cc98a94..63bf0bc4181 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -553,6 +553,9 @@ enum opcode {
    SHADER_OPCODE_MEMORY_LOAD_LOGICAL,
    SHADER_OPCODE_MEMORY_STORE_LOGICAL,
    SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL,
+
+   /* Ends a block moving to the next one.  See brw_cfg for details. */
+   SHADER_OPCODE_FLOW,
 };
 
 enum fb_write_logical_srcs {
diff --git a/src/intel/compiler/brw_from_nir.cpp b/src/intel/compiler/brw_from_nir.cpp
index 2a62b9034bc..5fbd0b49106 100644
--- a/src/intel/compiler/brw_from_nir.cpp
+++ b/src/intel/compiler/brw_from_nir.cpp
@@ -471,6 +471,8 @@ brw_from_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
    assert(!nir_loop_has_continue_construct(loop));
    bld.DO();
 
+   bld.emit(SHADER_OPCODE_FLOW);
+
    brw_from_nir_emit_cf_list(ntb, &loop->body);
 
    brw_inst *peep_while = bld.emit(BRW_OPCODE_WHILE);
diff --git a/src/intel/compiler/brw_generator.cpp b/src/intel/compiler/brw_generator.cpp
index 115da7e5b28..27fcb03a07b 100644
--- a/src/intel/compiler/brw_generator.cpp
+++ b/src/intel/compiler/brw_generator.cpp
@@ -1083,6 +1083,10 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 	 brw_DO(p, brw_get_default_exec_size(p));
 	 break;
 
+      case SHADER_OPCODE_FLOW:
+         /* Do nothing. */
+         break;
+
       case BRW_OPCODE_BREAK:
 	 brw_BREAK(p);
 	 break;
diff --git a/src/intel/compiler/brw_inst.cpp b/src/intel/compiler/brw_inst.cpp
index e4ca30e9a78..b5cb9bb7337 100644
--- a/src/intel/compiler/brw_inst.cpp
+++ b/src/intel/compiler/brw_inst.cpp
@@ -808,6 +808,7 @@ brw_inst::is_control_flow_end() const
    case BRW_OPCODE_ELSE:
    case BRW_OPCODE_WHILE:
    case BRW_OPCODE_ENDIF:
+   case SHADER_OPCODE_FLOW:
       return true;
    default:
       return false;
@@ -825,6 +826,7 @@ brw_inst::is_control_flow() const
    case BRW_OPCODE_ENDIF:
    case BRW_OPCODE_BREAK:
    case BRW_OPCODE_CONTINUE:
+   case SHADER_OPCODE_FLOW:
       return true;
    default:
       return false;
diff --git a/src/intel/compiler/brw_lower_scoreboard.cpp b/src/intel/compiler/brw_lower_scoreboard.cpp
index 941583a4740..04edd552906 100644
--- a/src/intel/compiler/brw_lower_scoreboard.cpp
+++ b/src/intel/compiler/brw_lower_scoreboard.cpp
@@ -185,6 +185,7 @@ namespace {
       case SHADER_OPCODE_UNDEF:
       case SHADER_OPCODE_HALT_TARGET:
       case FS_OPCODE_SCHEDULING_FENCE:
+      case SHADER_OPCODE_FLOW:
          return 0;
       default:
          /* Note that the following is inaccurate for virtual instructions
diff --git a/src/intel/compiler/brw_print.cpp b/src/intel/compiler/brw_print.cpp
index f0bf6491875..d19b4fe479e 100644
--- a/src/intel/compiler/brw_print.cpp
+++ b/src/intel/compiler/brw_print.cpp
@@ -294,6 +294,9 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
       return "read_from_live_channel";
    case SHADER_OPCODE_READ_FROM_CHANNEL:
       return "read_from_channel";
+
+   case SHADER_OPCODE_FLOW:
+      return "flow";
    }
 
    unreachable("not reached");
diff --git a/src/intel/compiler/test_lower_scoreboard.cpp b/src/intel/compiler/test_lower_scoreboard.cpp
index e60b8650fbb..288e7d27ef6 100644
--- a/src/intel/compiler/test_lower_scoreboard.cpp
+++ b/src/intel/compiler/test_lower_scoreboard.cpp
@@ -515,12 +515,14 @@ TEST_F(scoreboard_test, loop1)
    brw_calculate_cfg(*v);
    lower_scoreboard(v);
 
-   bblock_t *body = v->cfg->blocks[2];
+   const int num_blocks = v->cfg->num_blocks;
+
+   bblock_t *body = v->cfg->blocks[num_blocks - 2];
    brw_inst *add = instruction(body, 0);
    EXPECT_EQ(add->opcode, BRW_OPCODE_ADD);
    EXPECT_EQ(add->sched, regdist(TGL_PIPE_FLOAT, 1));
 
-   bblock_t *last_block = v->cfg->blocks[3];
+   bblock_t *last_block = v->cfg->blocks[num_blocks - 1];
    brw_inst *mul = instruction(last_block, 0);
    EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL);
    EXPECT_EQ(mul->sched, regdist(TGL_PIPE_FLOAT, 1));
@@ -550,12 +552,14 @@ TEST_F(scoreboard_test, loop2)
 
    /* Now the write in ADD has the tightest RegDist for both ADD and MUL. */
 
-   bblock_t *body = v->cfg->blocks[2];
+   const int num_blocks = v->cfg->num_blocks;
+
+   bblock_t *body = v->cfg->blocks[num_blocks - 2];
    brw_inst *add = instruction(body, 0);
    EXPECT_EQ(add->opcode, BRW_OPCODE_ADD);
    EXPECT_EQ(add->sched, regdist(TGL_PIPE_FLOAT, 2));
 
-   bblock_t *last_block = v->cfg->blocks[3];
+   bblock_t *last_block = v->cfg->blocks[num_blocks - 1];
    brw_inst *mul = instruction(last_block, 0);
    EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL);
    EXPECT_EQ(mul->sched, regdist(TGL_PIPE_FLOAT, 2));
@@ -586,12 +590,14 @@ TEST_F(scoreboard_test, loop3)
    brw_calculate_cfg(*v);
    lower_scoreboard(v);
 
-   bblock_t *body = v->cfg->blocks[2];
+   const int num_blocks = v->cfg->num_blocks;
+
+   bblock_t *body = v->cfg->blocks[num_blocks - 2];
    brw_inst *add = instruction(body, 4);
    EXPECT_EQ(add->opcode, BRW_OPCODE_ADD);
    EXPECT_EQ(add->sched, regdist(TGL_PIPE_FLOAT, 5));
 
-   bblock_t *last_block = v->cfg->blocks[3];
+   bblock_t *last_block = v->cfg->blocks[num_blocks - 1];
    brw_inst *mul = instruction(last_block, 0);
    EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL);
    EXPECT_EQ(mul->sched, regdist(TGL_PIPE_FLOAT, 1));