From 98866159588b489edbf651a32bbe49177bb63ec6 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 24 May 2022 02:44:53 -0700
Subject: [PATCH] intel/compiler: Move spill/fill tracking to the register
 allocator

Originally, we had virtual opcodes for scratch access, and let the
generator count spills/fills separately from other sends.  Later, we
started using the generic SHADER_OPCODE_SEND for spills/fills on some
generations of hardware, and simply detected stateless messages there.

But then we started using stateless messages for other things:
- anv uses stateless messages for the buffer device address feature.
- nir_opt_large_constants generates stateless messages.
- XeHP curbe setup can generate stateless messages.

So counting stateless messages is not accurate.  Instead, we move the
spill/fill accounting to the register allocator, as it generates such
things, as well as the load/store_scratch intrinsic handling, as those
are basically spill/fills, just at a higher level.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16691>
---
 src/intel/compiler/brw_fs.h                |  2 +
 src/intel/compiler/brw_fs_generator.cpp    | 44 +++++++++++-----------
 src/intel/compiler/brw_fs_nir.cpp          |  3 ++
 src/intel/compiler/brw_fs_reg_allocate.cpp | 30 +++++++++------
 src/intel/compiler/brw_fs_visitor.cpp      |  2 +
 5 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 4afb77146bc..5990cb422c8 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -81,6 +81,8 @@ offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
 struct shader_stats {
    const char *scheduler_mode;
    unsigned promoted_constants;
+   unsigned spill_count;
+   unsigned fill_count;
 };
 
 /**
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 4ca3e4cc9e9..54db3b6721b 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1826,13 +1826,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 
    int start_offset = p->next_insn_offset;
 
-   /* `send_count` explicitly does not include spills or fills, as we'd
-    * like to use it as a metric for intentional memory access or other
-    * shared function use.  Otherwise, subtle changes to scheduling or
-    * register allocation could cause it to fluctuate wildly - and that
-    * effect is already counted in spill/fill counts.
-    */
-   int spill_count = 0, fill_count = 0;
    int loop_count = 0, send_count = 0, nop_count = 0;
    bool is_accum_used = false;
 
@@ -2265,15 +2258,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
       case SHADER_OPCODE_SEND:
          generate_send(inst, dst, src[0], src[1], src[2],
                        inst->ex_mlen > 0 ? src[3] : brw_null_reg());
-         if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
-             (inst->desc & 0xff) == GFX8_BTI_STATELESS_NON_COHERENT) {
-            if (inst->size_written)
-               fill_count++;
-            else
-               spill_count++;
-         } else {
-            send_count++;
-         }
+         send_count++;
          break;
 
       case SHADER_OPCODE_GET_BUFFER_SIZE:
@@ -2306,17 +2291,17 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 
       case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
 	 generate_scratch_write(inst, src[0]);
-         spill_count++;
+         send_count++;
 	 break;
 
       case SHADER_OPCODE_GFX4_SCRATCH_READ:
 	 generate_scratch_read(inst, dst);
-         fill_count++;
+         send_count++;
 	 break;
 
       case SHADER_OPCODE_GFX7_SCRATCH_READ:
 	 generate_scratch_read_gfx7(inst, dst);
-         fill_count++;
+         send_count++;
 	 break;
 
       case SHADER_OPCODE_SCRATCH_HEADER:
@@ -2630,6 +2615,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
    /* end of program sentinel */
    disasm_new_inst_group(disasm_info, p->next_insn_offset);
 
+   /* `send_count` explicitly does not include spills or fills, as we'd
+    * like to use it as a metric for intentional memory access or other
+    * shared function use.  Otherwise, subtle changes to scheduling or
+    * register allocation could cause it to fluctuate wildly - and that
+    * effect is already counted in spill/fill counts.
+    */
+   send_count -= shader_stats.spill_count;
+   send_count -= shader_stats.fill_count;
+
 #ifndef NDEBUG
    bool validated =
 #else
@@ -2661,7 +2655,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
               shader_name, sha1buf,
               dispatch_width, before_size / 16,
               loop_count, perf.latency,
-              spill_count, fill_count, send_count,
+              shader_stats.spill_count,
+              shader_stats.fill_count,
+              send_count,
               shader_stats.scheduler_mode,
               shader_stats.promoted_constants,
               before_size, after_size,
@@ -2693,7 +2689,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                         _mesa_shader_stage_to_abbrev(stage),
                         dispatch_width, before_size / 16 - nop_count,
                         loop_count, perf.latency,
-                        spill_count, fill_count, send_count,
+                        shader_stats.spill_count,
+                        shader_stats.fill_count,
+                        send_count,
                         shader_stats.scheduler_mode,
                         shader_stats.promoted_constants,
                         before_size, after_size);
@@ -2703,8 +2701,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
       stats->sends = send_count;
       stats->loops = loop_count;
       stats->cycles = perf.latency;
-      stats->spills = spill_count;
-      stats->fills = fill_count;
+      stats->spills = shader_stats.spill_count;
+      stats->fills = shader_stats.fill_count;
    }
 
    return start_offset;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 954abde45a3..b51435f47c7 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5177,6 +5177,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
          bld.MOV(dest, read_result);
       }
+
+      shader_stats.fill_count += DIV_ROUND_UP(dispatch_width, 16);
       break;
    }
 
@@ -5250,6 +5252,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
       }
+      shader_stats.spill_count += DIV_ROUND_UP(dispatch_width, 16);
       break;
    }
 
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 42077fb6621..9045e35fb7e 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -348,10 +348,10 @@ private:
    void build_interference_graph(bool allow_spilling);
    void discard_interference_graph();
 
-   void emit_unspill(const fs_builder &bld, fs_reg dst,
-                     uint32_t spill_offset, unsigned count);
-   void emit_spill(const fs_builder &bld, fs_reg src,
-                   uint32_t spill_offset, unsigned count);
+   void emit_unspill(const fs_builder &bld, struct shader_stats *stats,
+                     fs_reg dst, uint32_t spill_offset, unsigned count);
+   void emit_spill(const fs_builder &bld, struct shader_stats *stats,
+                   fs_reg src, uint32_t spill_offset, unsigned count);
 
    void set_spill_costs();
    int choose_spill_reg();
@@ -738,7 +738,9 @@ fs_reg_alloc::discard_interference_graph()
 }
 
 void
-fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst,
+fs_reg_alloc::emit_unspill(const fs_builder &bld,
+                           struct shader_stats *stats,
+                           fs_reg dst,
                            uint32_t spill_offset, unsigned count)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
@@ -747,6 +749,8 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst,
    assert(count % reg_size == 0);
 
    for (unsigned i = 0; i < count / reg_size; i++) {
+      ++stats->fill_count;
+
       fs_inst *unspill_inst;
       if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
@@ -803,7 +807,9 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst,
 }
 
 void
-fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src,
+fs_reg_alloc::emit_spill(const fs_builder &bld,
+                         struct shader_stats *stats,
+                         fs_reg src,
                          uint32_t spill_offset, unsigned count)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
@@ -812,6 +818,8 @@ fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src,
    assert(count % reg_size == 0);
 
    for (unsigned i = 0; i < count / reg_size; i++) {
+      ++stats->spill_count;
+
       fs_inst *spill_inst;
       if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
@@ -1098,8 +1106,8 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
              * 32 bit channels.  It shouldn't hurt in any case because the
              * unspill destination is a block-local temporary.
              */
-            emit_unspill(ibld.exec_all().group(width, 0), unspill_dst,
-                         subset_spill_offset, count);
+            emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
+                         unspill_dst, subset_spill_offset, count);
 	 }
       }
 
@@ -1153,10 +1161,10 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
 	  */
          if (inst->is_partial_write() ||
              (!inst->force_writemask_all && !per_channel))
-            emit_unspill(ubld, spill_src, subset_spill_offset,
-                         regs_written(inst));
+            emit_unspill(ubld, &fs->shader_stats, spill_src,
+                         subset_spill_offset, regs_written(inst));
 
-         emit_spill(ubld.at(block, inst->next), spill_src,
+         emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
                     subset_spill_offset, regs_written(inst));
       }
 
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 5da019bf938..760f2135139 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -1172,6 +1172,8 @@ fs_visitor::init()
 
    this->shader_stats.scheduler_mode = NULL;
    this->shader_stats.promoted_constants = 0,
+   this->shader_stats.spill_count = 0,
+   this->shader_stats.fill_count = 0,
 
    this->grf_used = 0;
    this->spilled_any_registers = false;