brw: Implement Wa_22012725308 for flags via SWSB too

At this point, using the per-register granularity will only help in
conjunction with fragment shader discard (which is implemented using f1).

v2: Loop restructuring and code cleanups. Suggested by Curro.

v3: Only apply Wa on Gfx12.5+. Suggested by Curro.

v4: Also apply to implicit flag reads. Suggested by Curro. This version
affects a *lot* more shaders (10,936 on Meteor Lake shader-db versus
4,482 before). The results are still very much in the 🤷 territory.

v5: Add missing dependency. I thought I got them all the previous
time. :( Noticed by Curro.

shader-db:

Lunar Lake
total cycles in shared programs: 886315282 -> 886391040 (<.01%)
cycles in affected programs: 204907250 -> 204983008 (0.04%)
helped: 1 / HURT: 6716

LOST:   0
GAINED: 1

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
total cycles in shared programs: 883774789 -> 883921507 (0.02%)
cycles in affected programs: 481836784 -> 481983502 (0.03%)
helped: 4 / HURT: 10936

LOST:   3
GAINED: 7

fossil-db:

Lunar Lake
Totals:
Cycle count: 32600441334 -> 32601862658 (+0.00%); split: -0.00%, +0.00%

Totals from 90283 (11.44% of 789260) affected shaders:
Cycle count: 17265933202 -> 17267354526 (+0.01%); split: -0.00%, +0.01%

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Cycle count: 26477292677 -> 26480321805 (+0.01%); split: -0.00%, +0.01%
Max dispatch width: 8010440 -> 8010984 (+0.01%)

Totals from 132952 (14.71% of 903925) affected shaders:
Cycle count: 15349555348 -> 15352584476 (+0.02%); split: -0.00%, +0.02%
Max dispatch width: 1085416 -> 1085960 (+0.05%)

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35415>
This commit is contained in:
Ian Romanick 2025-06-04 12:20:50 -07:00 committed by Marge Bot
parent 1fdcc9039b
commit 1279f12c84

View file

@ -49,7 +49,6 @@
* The following ARF registers don't need to be tracked here because data
* coherency is still provided transparently by the hardware:
*
* - f0-1 flag registers
* - n0 notification register
* - tdr0 thread dependency register
*/
@ -699,6 +698,10 @@ namespace {
sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);
for (unsigned i = 0; i < ARRAY_SIZE(sb.flag_deps); i++)
sb.flag_deps[i] = merge(eq, sb0.flag_deps[i], sb1.flag_deps[i]);
sb.scalar_dep = merge(eq, sb0.scalar_dep, sb1.scalar_dep);
return sb;
@ -718,6 +721,10 @@ namespace {
sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);
for (unsigned i = 0; i < ARRAY_SIZE(sb.flag_deps); i++)
sb.flag_deps[i] = shadow(sb0.flag_deps[i], sb1.flag_deps[i]);
sb.scalar_dep = shadow(sb0.scalar_dep, sb1.scalar_dep);
return sb;
@ -737,6 +744,10 @@ namespace {
sb.addr_dep = transport(sb0.addr_dep, delta);
sb.accum_dep = transport(sb0.accum_dep, delta);
for (unsigned i = 0; i < ARRAY_SIZE(sb.flag_deps); i++)
sb.flag_deps[i] = transport(sb0.flag_deps[i], delta);
sb.scalar_dep = transport(sb0.scalar_dep, delta);
return sb;
@ -756,6 +767,11 @@ namespace {
if (sb0.accum_dep != sb1.accum_dep)
return false;
for (unsigned i = 0; i < ARRAY_SIZE(sb0.flag_deps); i++) {
if (sb0.flag_deps[i] != sb1.flag_deps[i])
return false;
}
if (sb0.scalar_dep != sb1.scalar_dep)
return false;
@ -772,6 +788,7 @@ namespace {
dependency grf_deps[XE3_MAX_GRF];
dependency addr_dep;
dependency accum_dep;
dependency flag_deps[4];
dependency scalar_dep;
dependency *
@ -783,6 +800,7 @@ namespace {
return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
brw_reg_is_arf(r, BRW_ARF_ADDRESS) ? &addr_dep :
brw_reg_is_arf(r, BRW_ARF_ACCUMULATOR) ? &accum_dep :
brw_reg_is_arf(r, BRW_ARF_FLAG) ? &flag_deps[r.nr & 0x0f] :
brw_reg_is_arf(r, BRW_ARF_SCALAR) ? &scalar_dep :
NULL);
}
@ -1073,6 +1091,25 @@ namespace {
if (inst->reads_accumulator_implicitly())
sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));
/* flags_read (and flags_written) returns a bitmask with one bit set per
* byte of the flags register file that is read (or written).
*
* Gfx12 does not need this particular workaround because earlier
* hardware doesn't have multiple asynchronous FPU pipelines, and
* therefore can't be affected by this bug.
*/
if (devinfo->verx10 >= 125 && inst->predicate != BRW_PREDICATE_NONE) {
const dependency rd_dep = dependency(TGL_REGDIST_SRC, jp, exec_all);
unsigned flags = inst->flags_read(devinfo);
for (unsigned i = 0; flags != 0; i++) {
if ((flags & 0x0f) != 0)
sb.set(brw_flag_reg(i, 0), rd_dep);
flags >>= 4;
}
}
/* Track any destination registers of this instruction. */
const dependency wr_dep =
is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
@ -1082,6 +1119,19 @@ namespace {
if (inst->writes_accumulator_implicitly(devinfo))
sb.set(brw_acc_reg(8), wr_dep);
/* See comment above for explanation of flags_written parsing and the
* Gfx12.5 restriction.
*/
if (devinfo->verx10 >= 125) {
unsigned flags = inst->flags_written(devinfo);
for (unsigned i = 0; flags != 0; i++) {
if ((flags & 0x0f) != 0)
sb.set(brw_flag_reg(i, 0), wr_dep);
flags >>= 4;
}
}
if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
!inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++)
@ -1196,6 +1246,28 @@ namespace {
add_dependency(ids, deps[ip], dep);
}
/* flags_read (and flags_written) returns a bitmask with one bit set per
* byte of the flags register file that is read (or written).
*
* Gfx12 does not need this particular workaround because earlier
* hardware doesn't have multiple asynchronous FPU pipelines, and
* therefore can't be affected by this bug.
*/
if (devinfo->verx10 >= 125 && inst->predicate != BRW_PREDICATE_NONE) {
unsigned flags = inst->flags_read(devinfo);
for (unsigned i = 0; flags != 0; i++) {
if ((flags & 0x0f) != 0) {
const dependency dep = sb.get(brw_flag_reg(i, 0));
if (dep.ordered && !is_single_pipe(dep.jp, p))
add_dependency(ids, deps[ip], dep);
}
flags >>= 4;
}
}
if (is_unordered(devinfo, inst) && !inst->eot)
add_dependency(ids, deps[ip],
dependency(TGL_SBID_SET, ip, exec_all));
@ -1221,6 +1293,20 @@ namespace {
if (dep.ordered && !is_single_pipe(dep.jp, p))
add_dependency(ids, deps[ip], dep);
}
/* flags_written returns a bit set per byte of the flags register
* file that is written.
*/
unsigned flags = inst->flags_written(devinfo);
for (unsigned i = 0; flags != 0; i++) {
if ((flags & 0x0f) != 0) {
const dependency dep = sb.get(brw_flag_reg(i, 0));
if (dep.ordered && !is_single_pipe(dep.jp, p))
add_dependency(ids, deps[ip], dep);
}
flags >>= 4;
}
}
update_inst_scoreboard(shader, jps, inst, ip, sb);