brw: Implement Wa_22012725308 for flags via SWSB too

At this point, using the per-register granularity will only help in conjunction with fragment shader discard (which is implemented using f1). v2: Loop restructuring and code cleanups. Suggested by Curro. v3: Only apply Wa on Gfx12.5+. Suggested by Curro. v4: Also apply to implicit flag reads. Suggested by Curro. This version affects a *lot* more shaders (10,936 on Meteor Lake shader-db versus 4,482 before). The results are still very much in the 🤷 territory. v5: Add missing dependency. I thought I got them all the previous time. :( Noticed by Curro. shader-db: Lunar Lake total cycles in shared programs: 886315282 -> 886391040 (<.01%) cycles in affected programs: 204907250 -> 204983008 (0.04%) helped: 1 / HURT: 6716 LOST: 0 GAINED: 1 Meteor Lake and DG2 had similar results. (Meteor Lake shown) total cycles in shared programs: 883774789 -> 883921507 (0.02%) cycles in affected programs: 481836784 -> 481983502 (0.03%) helped: 4 / HURT: 10936 LOST: 3 GAINED: 7 fossil-db: Lunar Lake Totals: Cycle count: 32600441334 -> 32601862658 (+0.00%); split: -0.00%, +0.00% Totals from 90283 (11.44% of 789260) affected shaders: Cycle count: 17265933202 -> 17267354526 (+0.01%); split: -0.00%, +0.01% Meteor Lake and DG2 had similar results. (Meteor Lake shown) Totals: Cycle count: 26477292677 -> 26480321805 (+0.01%); split: -0.00%, +0.01% Max dispatch width: 8010440 -> 8010984 (+0.01%) Totals from 132952 (14.71% of 903925) affected shaders: Cycle count: 15349555348 -> 15352584476 (+0.02%); split: -0.00%, +0.02% Max dispatch width: 1085416 -> 1085960 (+0.05%) Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35415>
2025-12-24 08:50:13 +01:00 · 2025-06-04 12:20:50 -07:00 · 2025-06-04 12:20:50 -07:00 · 1279f12c84
commit 1279f12c84
parent 1fdcc9039b
1 changed files with 87 additions and 1 deletions
--- a/src/intel/compiler/brw_lower_scoreboard.cpp
+++ b/src/intel/compiler/brw_lower_scoreboard.cpp
@ -49,7 +49,6 @@
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
- *  - f0-1 flag registers
 *  - n0 notification register
 *  - tdr0 thread dependency register
 */
@ -699,6 +698,10 @@ namespace {

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.flag_deps); i++)
+            sb.flag_deps[i] = merge(eq, sb0.flag_deps[i], sb1.flag_deps[i]);
+
         sb.scalar_dep = merge(eq, sb0.scalar_dep, sb1.scalar_dep);

         return sb;
@ -718,6 +721,10 @@ namespace {

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.flag_deps); i++)
+            sb.flag_deps[i] = shadow(sb0.flag_deps[i], sb1.flag_deps[i]);
+
         sb.scalar_dep = shadow(sb0.scalar_dep, sb1.scalar_dep);

         return sb;
@ -737,6 +744,10 @@ namespace {

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.flag_deps); i++)
+            sb.flag_deps[i] = transport(sb0.flag_deps[i], delta);
+
         sb.scalar_dep = transport(sb0.scalar_dep, delta);

         return sb;
@ -756,6 +767,11 @@ namespace {
         if (sb0.accum_dep != sb1.accum_dep)
            return false;

+         for (unsigned i = 0; i < ARRAY_SIZE(sb0.flag_deps); i++) {
+            if (sb0.flag_deps[i] != sb1.flag_deps[i])
+               return false;
+         }
+
         if (sb0.scalar_dep != sb1.scalar_dep)
            return false;

@ -772,6 +788,7 @@ namespace {
      dependency grf_deps[XE3_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;
+      dependency flag_deps[4];
      dependency scalar_dep;

      dependency *
@ -783,6 +800,7 @@ namespace {
         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 brw_reg_is_arf(r, BRW_ARF_ADDRESS) ? &addr_dep :
                 brw_reg_is_arf(r, BRW_ARF_ACCUMULATOR) ? &accum_dep :
+                 brw_reg_is_arf(r, BRW_ARF_FLAG) ? &flag_deps[r.nr & 0x0f] :
                 brw_reg_is_arf(r, BRW_ARF_SCALAR) ? &scalar_dep :
                 NULL);
      }
@ -1073,6 +1091,25 @@ namespace {
      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

+      /* flags_read (and flags_written) returns a bit set per byte of the
+       * flags register file that is read (or written).
+       *
+       * Gfx12 does not need this particular workaround because earlier
+       * hardware doesn't have multiple asynchronous FPU pipelines, and
+       * therefore can't be affected by this bug.
+       */
+      if (devinfo->verx10 >= 125 && inst->predicate != BRW_PREDICATE_NONE) {
+         const dependency rd_dep = dependency(TGL_REGDIST_SRC, jp, exec_all);
+         unsigned flags = inst->flags_read(devinfo);
+
+         for (unsigned i = 0; flags != 0; i++) {
+            if ((flags & 0x0f) != 0)
+               sb.set(brw_flag_reg(i, 0), rd_dep);
+
+            flags >>= 4;
+         }
+      }
+
      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
@ -1082,6 +1119,19 @@ namespace {
      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

+      /* See comment above for explanation of flags_written parsing and the
+       * Gfx12.5 restriction.
+       */
+      if (devinfo->verx10 >= 125) {
+         unsigned flags = inst->flags_written(devinfo);
+         for (unsigned i = 0; flags != 0; i++) {
+            if ((flags & 0x0f) != 0)
+               sb.set(brw_flag_reg(i, 0), wr_dep);
+
+            flags >>= 4;
+         }
+      }
+
      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
@ -1196,6 +1246,28 @@ namespace {
               add_dependency(ids, deps[ip], dep);
         }

+         /* flags_read (and flags_written) returns a bit set per byte of the
+          * flags register file that is read (or written).
+          *
+          * Gfx12 does not need this particular workaround because earlier
+          * hardware doesn't have multiple asynchronous FPU pipelines, and
+          * therefore can't be affected by this bug.
+          */
+         if (devinfo->verx10 >= 125 && inst->predicate != BRW_PREDICATE_NONE) {
+            unsigned flags = inst->flags_read(devinfo);
+
+            for (unsigned i = 0; flags != 0; i++) {
+               if ((flags & 0x0f) != 0) {
+                  const dependency dep = sb.get(brw_flag_reg(i, 0));
+
+                  if (dep.ordered && !is_single_pipe(dep.jp, p))
+                     add_dependency(ids, deps[ip], dep);
+               }
+
+               flags >>= 4;
+            }
+         }
+
         if (is_unordered(devinfo, inst) && !inst->eot)
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));
@ -1221,6 +1293,20 @@ namespace {
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }
+
+            /* flags_written returns a bit set per byte of the flags register
+             * file that is written.
+             */
+            unsigned flags = inst->flags_written(devinfo);
+            for (unsigned i = 0; flags != 0; i++) {
+               if ((flags & 0x0f) != 0) {
+                  const dependency dep = sb.get(brw_flag_reg(i, 0));
+                  if (dep.ordered && !is_single_pipe(dep.jp, p))
+                     add_dependency(ids, deps[ip], dep);
+               }
+
+               flags >>= 4;
+            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);