brw: add scheduler support for address registers

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
Author: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date:   2024-03-14 19:29:36 +02:00 (committed by Marge Bot)
parent 0a5bdf1199
commit aac906c16c

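At the heart of the change, the scheduler remembers which virtual value currently occupies each 2-byte slot of the address register and passes over any candidate instruction whose address reads or writes would clash with a value still live in a slot. A minimal standalone sketch of that idea, assuming hypothetical names (AddrUse, AddrSlotTracker) rather than the Mesa types; the real check is instruction_scheduler::address_register_interfere() over current.address_register in the diff below:

#include <array>
#include <cstdint>

struct AddrUse {
   uint32_t vreg;       /* virtual register id of the address value */
   unsigned first_slot; /* first 2-byte sub-register slot touched */
   unsigned num_slots;  /* number of consecutive slots touched */
};

struct AddrSlotTracker {
   std::array<uint32_t, 16> slots{}; /* 0 == slot free */

   /* A write clashes when a slot still holds a different live value. */
   bool write_interferes(const AddrUse &u) const {
      for (unsigned s = u.first_slot; s < u.first_slot + u.num_slots; s++)
         if (slots[s] != 0 && slots[s] != u.vreg)
            return true;
      return false;
   }

   /* A read clashes unless every slot holds exactly the value it needs. */
   bool read_interferes(const AddrUse &u) const {
      for (unsigned s = u.first_slot; s < u.first_slot + u.num_slots; s++)
         if (slots[s] != u.vreg)
            return true;
      return false;
   }
};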

@@ -95,6 +95,14 @@ public:
*/
int issue_time;
/**
* Nodes writing the address register values read by this instruction
* (tracked to speed up dependency and interference checks).
*/
schedule_node **address_read;
int address_read_count;
int address_read_cap;
/* Temporary data used during the scheduling process. */
struct {
int parent_count;
@@ -579,6 +587,7 @@ public:
void add_cross_lane_deps(schedule_node *n);
void add_dep(schedule_node *before, schedule_node *after, int latency);
void add_dep(schedule_node *before, schedule_node *after);
void add_address_dep(schedule_node *before, schedule_node *after);
void set_current_block(bblock_t *block);
void compute_delays();
@@ -590,6 +599,7 @@ public:
void calculate_deps();
bool is_compressed(const fs_inst *inst);
bool register_needs_barrier(const brw_reg &reg);
bool address_register_interfere(const schedule_node *n);
schedule_node *choose_instruction_to_schedule();
int calculate_issue_time(const fs_inst *inst);
@@ -626,6 +636,9 @@ public:
unsigned cand_generation;
int time;
exec_list available;
/* Virtual register currently live in each address sub-register slot. */
uint32_t address_register[16];
} current;
bool post_reg_alloc;
@@ -946,8 +959,19 @@ instruction_scheduler::compute_delays()
n->delay = n->issue_time;
} else {
for (int i = 0; i < n->children_count; i++) {
assert(n->children[i].n->delay);
n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
if (n->children[i].n->delay == 0) {
/* This is a special case for the address register, where a child
* could be a prior instruction.
*
* This ensures that an address register write instruction will
* always unblock the readers of the address register. Otherwise
* we could end up with scheduling deadlocks.
*/
assert(n->children[i].n->inst->dst.is_address());
n->delay = MAX2(n->delay, 1);
} else {
n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
}
}
}
}
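The special case above amounts to a floor on the recurrence delay(n) = max over children c of latency(n) + delay(c): a zero-delay child can only be a back edge to an address register write, and it contributes at least 1 so that write still gets unblocked. A minimal sketch, assuming a simplified Node type in place of schedule_node:

#include <algorithm>
#include <cassert>
#include <vector>

struct Node {
   int latency = 0;
   int issue_time = 0;
   int delay = 0;
   bool writes_address = false; /* stand-in for inst->dst.is_address() */
   std::vector<Node *> children;
};

static void compute_delay(Node *n)
{
   if (n->children.empty()) {
      n->delay = n->issue_time;
      return;
   }
   for (Node *c : n->children) {
      if (c->delay == 0) {
         /* Back edge to an earlier address register write: count it as
          * 1 so the write still unblocks its readers. */
         assert(c->writes_address);
         n->delay = std::max(n->delay, 1);
      } else {
         n->delay = std::max(n->delay, n->latency + c->delay);
      }
   }
}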
@@ -1023,6 +1047,10 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
child->effective_latency = latency;
before->children_count++;
after->initial_parent_count++;
/* Propagate the dependency to the writers of the address register values read by this instruction. */
for (int i = 0; i < after->address_read_count; i++)
add_dep(before, after->address_read[i]);
}
void
@@ -1034,6 +1062,24 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
add_dep(before, after, before->latency);
}
void
instruction_scheduler::add_address_dep(schedule_node *before, schedule_node *after)
{
assert(before && after);
add_dep(before, after, before->latency);
if (after->address_read_cap <= after->address_read_count) {
after->address_read_cap = MAX2(2 * after->address_read_cap, 1);
after->address_read = reralloc(mem_ctx, after->address_read,
schedule_node *,
after->address_read_cap);
}
after->address_read[after->address_read_count++] = before;
}
static bool
is_scheduling_barrier(const fs_inst *inst)
{
@@ -1199,8 +1245,76 @@ instruction_scheduler::calculate_deps()
* granular level.
*/
schedule_node *last_fixed_grf_write = NULL;
schedule_node *last_address_write[16] = {};
/* top-to-bottom dependencies: RAW and WAW. */
if (!post_reg_alloc) {
/* Address registers carry a virtual identifier, allowing us to identify
* which instructions need the values written to the register. The
* address register is written/read in pairs of instructions (enforced
* by brw_fs_validate.cpp).
*
* To allow scheduling SEND messages out of order without the address
* register tracking generating a serialized dependency between all the
* messages, we first track all the dependencies of the address
* register. Those dependencies are added to the instructions consuming
* the address register value. Then, when doing the normal dependency
* tracking, any node adding a dependency to an instruction consuming
* the address register is also added as a dependency to the instruction
* writing the value to the address register.
*
* This scheme allows the scheduling done by
* choose_instruction_to_schedule() to ensure that once an instruction
* writing the address register is scheduled, we can always schedule all
* instructions making use of the address register value. Otherwise we
* could run into scheduling deadlocks.
*
* Here is a deadlock example:
*
*    mov a0, 0x42
*    send grf1, ..., a0
*    mov a0, 0x43
*    send grf2, grf1, a0
*
* Say choose_instruction_to_schedule() chooses the second mov
* instruction first (mov a0, 0x43). Then it cannot schedule the second
* send instruction, because the first send instruction populating grf1
* has not been scheduled, and it cannot schedule the first mov either,
* because the address register is already in use for another message.
*
* In post-register-allocation mode this scheme cannot work, as all GRFs
* can get reused, and we have to serialize all address register usages
* (like the accumulator, flag, etc...).
*/
for (schedule_node *n = current.start; n < current.end; n++) {
fs_inst *inst = (fs_inst *)n->inst;
/* Pre-pass going over instructions using the address register as a
* source.
*/
for (int i = 0; i < inst->sources; i++) {
if (!inst->src[i].is_address())
continue;
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write));
schedule_node *write_addr_node =
last_address_write[inst->src[i].address_slot(byte)];
assert(write_addr_node->inst->dst.nr == inst->src[i].nr);
add_address_dep(write_addr_node, n);
}
}
if (inst->dst.is_address()) {
for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
last_address_write[inst->dst.address_slot(byte)] = n;
}
}
}
}
for (schedule_node *n = current.start; n < current.end; n++) {
fs_inst *inst = (fs_inst *)n->inst;
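A minimal sketch of the dependency mirroring described in the comment above, assuming simplified Node/add_dep names in place of the schedule_node machinery in this file: every dependency added to a reader of the address register is mirrored onto the node that wrote the value, so a writer only becomes ready once everything its readers wait on is ready, and scheduling the writer can never strand its readers.

#include <vector>

struct Node {
   std::vector<Node *> parents;         /* must schedule before this node */
   std::vector<Node *> address_writers; /* writers of the a0 values we read */
};

static void add_dep(Node *before, Node *after)
{
   if (!before || !after || before == after)
      return;
   after->parents.push_back(before);
   /* Mirror the dependency onto every address register writer feeding
    * `after`; the writers themselves read no a0 value, so the recursion
    * stops after one level. */
   for (Node *w : after->address_writers)
      add_dep(before, w);
}

/* Record that `reader` consumes an a0 value produced by `writer`. */
static void add_address_dep(Node *writer, Node *reader)
{
   add_dep(writer, reader);
   reader->address_writers.push_back(writer);
}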
@@ -1225,12 +1339,16 @@
}
} else if (inst->src[i].is_accumulator()) {
add_dep(last_accumulator_write, n);
} else if (inst->src[i].is_address()) {
if (post_reg_alloc) {
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2)
add_dep(last_address_write[inst->src[i].address_slot(byte)], n);
}
} else if (register_needs_barrier(inst->src[i])) {
add_barrier_deps(n);
}
}
if (const unsigned mask = inst->flags_read(s->devinfo)) {
assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
@@ -1264,6 +1382,13 @@
} else if (inst->dst.is_accumulator()) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
} else if (inst->dst.is_address()) {
if (post_reg_alloc) {
for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
add_dep(last_address_write[inst->dst.address_slot(byte)], n);
last_address_write[inst->dst.address_slot(byte)] = n;
}
}
} else if (register_needs_barrier(inst->dst)) {
add_barrier_deps(n);
}
@@ -1284,6 +1409,13 @@
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
}
if (post_reg_alloc && inst->uses_address_register_implicitly()) {
for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++) {
add_dep(last_address_write[i], n);
last_address_write[i] = n;
}
}
}
clear_last_grf_write();
@@ -1292,6 +1424,7 @@
memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
last_accumulator_write = NULL;
last_fixed_grf_write = NULL;
memset(last_address_write, 0, sizeof(last_address_write));
for (schedule_node *n = current.end - 1; n >= current.start; n--) {
fs_inst *inst = (fs_inst *)n->inst;
@@ -1310,6 +1443,12 @@
}
} else if (inst->src[i].is_accumulator()) {
add_dep(n, last_accumulator_write, 0);
} else if (inst->src[i].is_address()) {
if (post_reg_alloc) {
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0);
}
}
} else if (register_needs_barrier(inst->src[i])) {
add_barrier_deps(n);
}
@@ -1328,6 +1467,11 @@
add_dep(n, last_accumulator_write);
}
if (post_reg_alloc && inst->uses_address_register_implicitly()) {
for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++)
last_address_write[i] = n;
}
/* Update the things this instruction wrote, so earlier reads
* can mark this as a WAR dependency.
*/
@@ -1343,6 +1487,11 @@
}
} else if (inst->dst.is_accumulator()) {
last_accumulator_write = n;
} else if (inst->dst.is_address()) {
if (post_reg_alloc) {
for (unsigned byte = 0; byte < inst->size_written; byte += 2)
last_address_write[inst->dst.address_slot(byte)] = n;
}
} else if (register_needs_barrier(inst->dst)) {
add_barrier_deps(n);
}
@@ -1364,6 +1513,39 @@
clear_last_grf_write();
}
bool
instruction_scheduler::address_register_interfere(const schedule_node *n)
{
if (n->inst->uses_address_register_implicitly()) {
for (unsigned i = 0; i < ARRAY_SIZE(current.address_register); i++)
if (current.address_register[i] != 0)
return true;
return false;
}
if (n->inst->dst.is_address()) {
for (unsigned byte = 0; byte < n->inst->size_written; byte += 2) {
if (current.address_register[n->inst->dst.address_slot(byte)] != 0 &&
current.address_register[n->inst->dst.address_slot(byte)] != n->inst->dst.nr)
return true;
}
}
if (n->address_read_count > 0) {
for (unsigned i = 0; i < n->inst->sources; i++) {
if (!n->inst->src[i].is_address())
continue;
for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) {
if (current.address_register[n->inst->src[i].address_slot(byte)] !=
n->inst->src[i].nr)
return true;
}
}
}
return false;
}
schedule_node *
instruction_scheduler::choose_instruction_to_schedule()
{
@@ -1377,6 +1559,9 @@ instruction_scheduler::choose_instruction_to_schedule()
* otherwise the oldest one.
*/
foreach_in_list(schedule_node, n, &current.available) {
if (!post_reg_alloc && address_register_interfere(n))
continue;
if (!chosen ||
exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen) ||
(exit_tmp_unblocked_time(n) == exit_tmp_unblocked_time(chosen) &&
@@ -1395,6 +1580,9 @@
* latency.
*/
foreach_in_list(schedule_node, n, &current.available) {
if (!post_reg_alloc && address_register_interfere(n))
continue;
if (!chosen) {
chosen = n;
chosen_register_pressure_benefit =
@@ -1512,6 +1700,29 @@ instruction_scheduler::schedule(schedule_node *chosen)
void
instruction_scheduler::update_children(schedule_node *chosen)
{
if (chosen->address_read_count > 0) {
for (unsigned i = 0; i < chosen->inst->sources; i++) {
if (!chosen->inst->src[i].is_address())
continue;
for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) {
assert(chosen->inst->src[i].address_slot(byte) <
ARRAY_SIZE(current.address_register));
current.address_register[chosen->inst->src[i].address_slot(byte)] = 0;
}
}
}
if (chosen->inst->dst.is_address()) {
for (unsigned byte = 0; byte < chosen->inst->size_written; byte += 2) {
assert(chosen->inst->dst.address_slot(byte) <
ARRAY_SIZE(current.address_register));
current.address_register[
chosen->inst->dst.address_slot(byte)] = chosen->inst->dst.nr;
}
} else if (chosen->inst->uses_address_register_implicitly()) {
memset(current.address_register, 0, sizeof(current.address_register));
}
/* Now that we've scheduled a new instruction, some of its
* children can be promoted to the list of instructions ready to
* be scheduled. Update the children's unblocked time for this
@@ -1557,6 +1768,8 @@ instruction_scheduler::schedule_instructions()
current.block->instructions.make_empty();
memset(current.address_register, 0, sizeof(current.address_register));
while (!current.available.is_empty()) {
schedule_node *chosen = choose_instruction_to_schedule();
schedule(chosen);