From 4f246cf4e74f3d5a3a0a478aba82cc2a0f826515 Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Sun, 15 Oct 2023 23:38:56 -0700
Subject: [PATCH] intel/compiler: Merge child/latency arrays in schedule_node

Values are used together, saves one pointer in schedule_node,
reduces amount of reallocations when children count grows.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25841>
---
 .../compiler/brw_schedule_instructions.cpp    | 84 ++++++++++---------
 1 file changed, 45 insertions(+), 39 deletions(-)

diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index 0117dc0e0b6..b4ea150f0b7 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -59,6 +59,7 @@ using namespace brw;
 static bool debug = false;
 
 class instruction_scheduler;
+struct schedule_node_child;
 
 class schedule_node : public exec_node
 {
@@ -67,11 +68,10 @@ public:
    void set_latency_gfx7(const struct brw_isa_info *isa);
 
    backend_instruction *inst;
-   schedule_node **children;
-   int *child_latency;
-   int child_count;
+   schedule_node_child *children;
+   int children_count;
+   int children_cap;
    int parent_count;
-   int child_array_size;
    int unblocked_time;
    int latency;
 
@@ -105,6 +105,11 @@ public:
    int issue_time;
 };
 
+struct schedule_node_child {
+   schedule_node *n;
+   int effective_latency;
+};
+
 /**
  * Lower bound of the scheduling time after which one of the instructions
  * blocked by this node may lead to program termination.
@@ -1017,12 +1022,12 @@ void
 instruction_scheduler::compute_delays()
 {
    for (schedule_node *n = current.end - 1; n >= current.start; n--) {
-      if (!n->child_count) {
+      if (!n->children_count) {
          n->delay = n->issue_time;
       } else {
-         for (int i = 0; i < n->child_count; i++) {
-            assert(n->children[i]->delay);
-            n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
+         for (int i = 0; i < n->children_count; i++) {
+            assert(n->children[i].n->delay);
+            n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
          }
       }
    }
@@ -1036,10 +1041,11 @@ instruction_scheduler::compute_exits()
     * from the top instead of from the bottom of the block.
     */
    for (schedule_node *n = current.start; n < current.end; n++) {
-      for (int i = 0; i < n->child_count; i++) {
-         n->children[i]->unblocked_time =
-            MAX2(n->children[i]->unblocked_time,
-                 n->unblocked_time + n->issue_time + n->child_latency[i]);
+      for (int i = 0; i < n->children_count; i++) {
+         schedule_node_child *child = &n->children[i];
+         child->n->unblocked_time =
+            MAX2(child->n->unblocked_time,
+                 n->unblocked_time + n->issue_time + child->effective_latency);
       }
    }
 
@@ -1051,9 +1057,9 @@ instruction_scheduler::compute_exits()
    for (schedule_node *n = current.end - 1; n >= current.start; n--) {
       n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);
 
-      for (int i = 0; i < n->child_count; i++) {
-         if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
-            n->exit = n->children[i]->exit;
+      for (int i = 0; i < n->children_count; i++) {
+         if (exit_unblocked_time(n->children[i].n) < exit_unblocked_time(n))
+            n->exit = n->children[i].n->exit;
       }
    }
 }
@@ -1073,29 +1079,29 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
 
    assert(before != after);
 
-   for (int i = 0; i < before->child_count; i++) {
-      if (before->children[i] == after) {
-         before->child_latency[i] = MAX2(before->child_latency[i], latency);
+   for (int i = 0; i < before->children_count; i++) {
+      schedule_node_child *child = &before->children[i];
+      if (child->n == after) {
+         child->effective_latency = MAX2(child->effective_latency, latency);
          return;
       }
    }
 
-   if (before->child_array_size <= before->child_count) {
-      if (before->child_array_size < 16)
-         before->child_array_size = 16;
+   if (before->children_cap <= before->children_count) {
+      if (before->children_cap < 16)
+         before->children_cap = 16;
       else
-         before->child_array_size *= 2;
+         before->children_cap *= 2;
 
       before->children = reralloc(mem_ctx, before->children,
-                                  schedule_node *,
-                                  before->child_array_size);
-      before->child_latency = reralloc(mem_ctx, before->child_latency,
-                                       int, before->child_array_size);
+                                  schedule_node_child,
+                                  before->children_cap);
    }
 
-   before->children[before->child_count] = after;
-   before->child_latency[before->child_count] = latency;
-   before->child_count++;
+   schedule_node_child *child = &before->children[before->children_count];
+   child->n = after;
+   child->effective_latency = latency;
+   before->children_count++;
    after->parent_count++;
 }
 
@@ -1859,24 +1865,24 @@ instruction_scheduler::update_children(schedule_node *chosen)
     * be scheduled.  Update the children's unblocked time for this
     * DAG edge as we do so.
     */
-   for (int i = chosen->child_count - 1; i >= 0; i--) {
-      schedule_node *child = chosen->children[i];
+   for (int i = chosen->children_count - 1; i >= 0; i--) {
+      schedule_node_child *child = &chosen->children[i];
 
-      child->unblocked_time = MAX2(child->unblocked_time,
-                                   current.time + chosen->child_latency[i]);
+      child->n->unblocked_time = MAX2(child->n->unblocked_time,
+                                      current.time + child->effective_latency);
 
       if (debug) {
-         fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
-         bs->dump_instruction(child->inst);
+         fprintf(stderr, "\tchild %d, %d parents: ", i, child->n->parent_count);
+         bs->dump_instruction(child->n->inst);
       }
 
-      child->cand_generation = current.cand_generation;
-      child->parent_count--;
-      if (child->parent_count == 0) {
+      child->n->cand_generation = current.cand_generation;
+      child->n->parent_count--;
+      if (child->n->parent_count == 0) {
          if (debug) {
             fprintf(stderr, "\t\tnow available\n");
          }
-         current.available.push_head(child);
+         current.available.push_head(child->n);
       }
    }
    current.cand_generation++;