jay: allow npot operands in RA

This is fine as long as we round the /alignments/ up to a power of two in RA,
and pad to a power of two when calculating partitions (the latter is trivially
true now, but it informs future work).

SIMD16:
   Totals from 1001 (37.82% of 2647) affected shaders:
   Instrs: 1897734 -> 1896157 (-0.08%); split: -0.25%, +0.16%
   CodeSize: 28330256 -> 28315472 (-0.05%); split: -0.30%, +0.25%
   Number of spill instructions: 1003 -> 999 (-0.40%)
   Number of fill instructions: 990 -> 986 (-0.40%)

SIMD32:
   Totals from 1230 (46.47% of 2647) affected shaders:
   Instrs: 3284649 -> 3277437 (-0.22%); split: -1.18%, +0.96%
   CodeSize: 48977696 -> 48907376 (-0.14%); split: -1.10%, +0.96%
   Number of spill instructions: 41004 -> 40582 (-1.03%); split: -1.05%, +0.02%
   Number of fill instructions: 39298 -> 38572 (-1.85%); split: -1.91%, +0.06%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41808>
This commit is contained in:
Alyssa Rosenzweig 2026-05-20 08:42:06 -04:00 committed by Marge Bot
parent 30c392fc55
commit feb9ac168b
5 changed files with 14 additions and 64 deletions

View file

@ -6,7 +6,6 @@
#include "util/bitset.h"
#include "util/macros.h"
#include "util/sparse_bitset.h"
#include "util/u_math.h"
#include "util/u_worklist.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
@ -183,7 +182,7 @@ jay_calculate_register_demands(jay_function *func)
jay_foreach_inst_in_block(block, I) {
/* Make destinations live */
jay_foreach_dst(I, d) {
demands[d.file] += util_next_power_of_two(jay_num_values(d));
demands[d.file] += jay_num_values(d);
}
/* Update maximum demands */
@ -201,11 +200,6 @@ jay_calculate_register_demands(jay_function *func)
}
}
jay_foreach_dst(I, d) {
unsigned n = jay_num_values(d);
demands[d.file] -= util_next_power_of_two(n) - n;
}
/* Late-kill sources. Duplicated sources are only marked killed once,
* so we do not need to filter out duplicates.
*/

View file

@ -3,7 +3,6 @@
* SPDX-License-Identifier: MIT
*/
#include "util/bitscan.h"
#include "util/hash_table.h"
#include "util/lut.h"
#include "util/macros.h"
@ -14,37 +13,12 @@
#include "jay_opcodes.h"
#include "jay_private.h"
/*
 * Register allocation only handles vectors whose length is a power of two.
 * A vector of any other length is rebuilt here at the next power-of-two
 * length: the original channels come first and the extra trailing slots are
 * left as zero (null) indices. Vectors that already have a power-of-two (or
 * zero) length are returned unchanged.
 */
static jay_def
lower_npot_vector(jay_builder *b, jay_def x)
{
   const unsigned count = jay_num_values(x);

   if (!util_is_power_of_two_or_zero(count)) {
      /* Zero-initialised so every slot past `count` stays 0. */
      uint32_t channels[JAY_MAX_DEF_LENGTH] = { 0 };

      for (unsigned c = 0; c < count; ++c) {
         channels[c] = jay_channel(x, c);
      }

      /* Re-collect at the padded length, in the same register file. */
      x = jay_collect(b, x.file, channels, util_next_power_of_two(count));
   }

   /* Whichever path ran, the result must have a power-of-two length. */
   assert(util_is_power_of_two_or_zero(jay_num_values(x)) && "post-cond");
   return x;
}
/**
* Vectors need to be allocated to contiguous registers. Furthermore, we
* require power-of-two sizes in certain cases, that's handled here too.
*
* This means that a value cannot appear in multiple channels of an
* instruction, as register allocation would need to assign the same value to
* locations <X+i> and <X+j>. Scalars don't have this restriction, except for
* SENDs because the hardware bans repeated sources.
* Vectors need to be allocated to contiguous registers. This means that a value
* cannot appear in multiple channels of an instruction, as register allocation
* would need to assign the same value to locations <X+i> and <X+j>. Scalars
* don't have this restriction, except for SENDs because the hardware bans
* repeated sources.
*
* If a value appears in multiple positions, we emit copies so that each
* can be register allocated in the correct position.
@ -75,7 +49,7 @@ lower_contiguous_sources(jay_builder *b, jay_inst *I)
}
}
jay_replace_src(&I->src[s], lower_npot_vector(b, I->src[s]));
jay_replace_src(&I->src[s], I->src[s]);
}
}
}

View file

@ -1042,13 +1042,8 @@ assign_regs_for_inst(jay_ra_state *ra, jay_inst *I)
bool killed = false;
jay_def var = *(vars[i]);
unsigned size = jay_num_values(var);
if (is_src) {
assert(util_is_power_of_two_nonzero(size) && "NPOT sources lowered");
} else {
size = util_next_power_of_two(size);
}
unsigned alignment = I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : size;
unsigned alignment =
I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : util_next_power_of_two(size);
enum jay_file file = var.file;
enum jay_stride min_stride = JAY_STRIDE_2, max_stride = JAY_STRIDE_8;
@ -1169,9 +1164,8 @@ assign_regs_for_inst(jay_ra_state *ra, jay_inst *I)
/* Reset data structures */
for (unsigned i = 0; i < nr_vars; ++i) {
jay_def var = *(vars[i]);
unsigned n = util_next_power_of_two(jay_num_values(var));
BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, n);
BITSET_CLEAR_COUNT(ra->killed[var.file], var.reg, n);
BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, jay_num_values(var));
BITSET_CLEAR_COUNT(ra->killed[var.file], var.reg, jay_num_values(var));
}
/* Sources selected for early-kill have had their last_use fields cleared.

View file

@ -138,7 +138,7 @@ calculate_pressure_delta_before(struct sched_ctx *ctx, jay_inst *I)
/* Make destinations live */
jay_foreach_dst(I, dst) {
delta += util_next_power_of_two(jay_num_values(dst)) * scale(ctx, dst);
delta += jay_num_values(dst) * scale(ctx, dst);
}
return delta;
@ -157,11 +157,6 @@ calculate_pressure_delta_after(struct sched_ctx *ctx, jay_inst *I)
delta -= !u_sparse_bitset_test(&ctx->live, index) * scale(ctx, I->dst);
}
jay_foreach_dst(I, d) {
unsigned n = jay_num_values(d);
delta -= (util_next_power_of_two(n) - n) * scale(ctx, I->dst);
}
/* Late-kill sources. We precomputed the deduplication info and stashed it in
* the I->last_use bitfield for convenience.
*/

View file

@ -10,7 +10,6 @@
#include "util/ralloc.h"
#include "util/sparse_bitset.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "util/u_qsort.h"
#include "util/u_worklist.h"
#include "jay_builder.h"
@ -488,14 +487,8 @@ min_algorithm(struct spill_ctx *ctx,
}
}
/* Limit W to make space for the operands.
*
* We need to round up to power-of-two destination sizes to match the
* rounding in demand calculation.
*/
bool has_dst = I->dst.file == GPR;
unsigned dst_size = util_next_power_of_two(jay_num_values(I->dst));
limit(ctx, I, ctx->k - (has_dst ? dst_size : 0));
/* Limit W to make space for the operands. */
limit(ctx, I, ctx->k - (I->dst.file == GPR ? jay_num_values(I->dst) : 0));
/* Add destinations to the register file */
if (I->dst.file == GPR) {