From feb9ac168b4fa5411aefea06e207c76b3bf862f5 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 20 May 2026 08:42:06 -0400 Subject: [PATCH] jay: allow npot operands in RA As long as we round up the /alignments/ in RA, and pad to power-of-two when calculating partitions (trivially true now, this informs future work though), this is fine. SIMD16: Totals from 1001 (37.82% of 2647) affected shaders: Instrs: 1897734 -> 1896157 (-0.08%); split: -0.25%, +0.16% CodeSize: 28330256 -> 28315472 (-0.05%); split: -0.30%, +0.25% Number of spill instructions: 1003 -> 999 (-0.40%) Number of fill instructions: 990 -> 986 (-0.40%) SIMD32: Totals from 1230 (46.47% of 2647) affected shaders: Instrs: 3284649 -> 3277437 (-0.22%); split: -1.18%, +0.96% CodeSize: 48977696 -> 48907376 (-0.14%); split: -1.10%, +0.96% Number of spill instructions: 41004 -> 40582 (-1.03%); split: -1.05%, +0.02% Number of fill instructions: 39298 -> 38572 (-1.85%); split: -1.91%, +0.06% Signed-off-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/jay/jay_liveness.c | 8 +--- src/intel/compiler/jay/jay_lower_pre_ra.c | 38 +++---------------- .../compiler/jay/jay_register_allocate.c | 14 ++----- src/intel/compiler/jay/jay_schedule.c | 7 +--- src/intel/compiler/jay/jay_spill.c | 11 +----- 5 files changed, 14 insertions(+), 64 deletions(-) diff --git a/src/intel/compiler/jay/jay_liveness.c b/src/intel/compiler/jay/jay_liveness.c index afd6f1206df..547734ada0c 100644 --- a/src/intel/compiler/jay/jay_liveness.c +++ b/src/intel/compiler/jay/jay_liveness.c @@ -6,7 +6,6 @@ #include "util/bitset.h" #include "util/macros.h" #include "util/sparse_bitset.h" -#include "util/u_math.h" #include "util/u_worklist.h" #include "jay_ir.h" #include "jay_opcodes.h" @@ -183,7 +182,7 @@ jay_calculate_register_demands(jay_function *func) jay_foreach_inst_in_block(block, I) { /* Make destinations live */ jay_foreach_dst(I, d) { - demands[d.file] += util_next_power_of_two(jay_num_values(d)); + demands[d.file] += jay_num_values(d); } /* Update maximum demands */ @@ -201,11 +200,6 @@ jay_calculate_register_demands(jay_function *func) } } - jay_foreach_dst(I, d) { - unsigned n = jay_num_values(d); - demands[d.file] -= util_next_power_of_two(n) - n; - } - /* Late-kill sources. Duplicated sources are only marked killed once, * so we do not need to filter out duplicates. */ diff --git a/src/intel/compiler/jay/jay_lower_pre_ra.c b/src/intel/compiler/jay/jay_lower_pre_ra.c index 0e20c1d5847..bcbd8e51f14 100644 --- a/src/intel/compiler/jay/jay_lower_pre_ra.c +++ b/src/intel/compiler/jay/jay_lower_pre_ra.c @@ -3,7 +3,6 @@ * SPDX-License-Identifier: MIT */ -#include "util/bitscan.h" #include "util/hash_table.h" #include "util/lut.h" #include "util/macros.h" @@ -14,37 +13,12 @@ #include "jay_opcodes.h" #include "jay_private.h" -/* - * Register allocation operates only on power-of-two vectors. Pad out - * non-power-of-two vectors with null values to simplify RA. - */ -static jay_def -lower_npot_vector(jay_builder *b, jay_def x) -{ - unsigned n = jay_num_values(x); - - if (!util_is_power_of_two_or_zero(n)) { - uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 }; - - for (unsigned i = 0; i < n; ++i) { - indices[i] = jay_channel(x, i); - } - - x = jay_collect(b, x.file, indices, util_next_power_of_two(n)); - } - - assert(util_is_power_of_two_or_zero(jay_num_values(x)) && "post-cond"); - return x; -} - /** - * Vectors need to be allocated to contiguous registers. Furthermore, we - * require power-of-two sizes in certain cases, that's handled here too. - * - * This means that a value cannot appear in multiple channels of an - * instruction, as register allocation would need to assign the same value to - * locations and . Scalars don't have this restriction, except for - * SENDs because the hardware bans repeated sources. + * Vectors need to be allocated to contiguous registers. This means that a value + * cannot appear in multiple channels of an instruction, as register allocation + * would need to assign the same value to locations and . Scalars + * don't have this restriction, except for SENDs because the hardware bans + * repeated sources. * * If a value appears in multiple positions, we emit copies so that each * can be register allocated in the correct position. @@ -75,7 +49,7 @@ lower_contiguous_sources(jay_builder *b, jay_inst *I) } } - jay_replace_src(&I->src[s], lower_npot_vector(b, I->src[s])); + jay_replace_src(&I->src[s], I->src[s]); } } } diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c index 0b240f84bcc..99c21f54b9e 100644 --- a/src/intel/compiler/jay/jay_register_allocate.c +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -1042,13 +1042,8 @@ assign_regs_for_inst(jay_ra_state *ra, jay_inst *I) bool killed = false; jay_def var = *(vars[i]); unsigned size = jay_num_values(var); - if (is_src) { - assert(util_is_power_of_two_nonzero(size) && "NPOT sources lowered"); - } else { - size = util_next_power_of_two(size); - } - - unsigned alignment = I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : size; + unsigned alignment = + I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : util_next_power_of_two(size); enum jay_file file = var.file; enum jay_stride min_stride = JAY_STRIDE_2, max_stride = JAY_STRIDE_8; @@ -1169,9 +1164,8 @@ assign_regs_for_inst(jay_ra_state *ra, jay_inst *I) /* Reset data structures */ for (unsigned i = 0; i < nr_vars; ++i) { jay_def var = *(vars[i]); - unsigned n = util_next_power_of_two(jay_num_values(var)); - BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, n); - BITSET_CLEAR_COUNT(ra->killed[var.file], var.reg, n); + BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, jay_num_values(var)); + BITSET_CLEAR_COUNT(ra->killed[var.file], var.reg, jay_num_values(var)); } /* Sources selected for early-kill have had their last_use fields cleared. diff --git a/src/intel/compiler/jay/jay_schedule.c b/src/intel/compiler/jay/jay_schedule.c index b9243e77dc5..4834f50e186 100644 --- a/src/intel/compiler/jay/jay_schedule.c +++ b/src/intel/compiler/jay/jay_schedule.c @@ -138,7 +138,7 @@ calculate_pressure_delta_before(struct sched_ctx *ctx, jay_inst *I) /* Make destinations live */ jay_foreach_dst(I, dst) { - delta += util_next_power_of_two(jay_num_values(dst)) * scale(ctx, dst); + delta += jay_num_values(dst) * scale(ctx, dst); } return delta; @@ -157,11 +157,6 @@ calculate_pressure_delta_after(struct sched_ctx *ctx, jay_inst *I) delta -= !u_sparse_bitset_test(&ctx->live, index) * scale(ctx, I->dst); } - jay_foreach_dst(I, d) { - unsigned n = jay_num_values(d); - delta -= (util_next_power_of_two(n) - n) * scale(ctx, I->dst); - } - /* Late-kill sources. We precomputed the deduplication info and stashed it in * the I->last_use bitfield for convenience. */ diff --git a/src/intel/compiler/jay/jay_spill.c b/src/intel/compiler/jay/jay_spill.c index 0b00240c2c1..763fe8025fa 100644 --- a/src/intel/compiler/jay/jay_spill.c +++ b/src/intel/compiler/jay/jay_spill.c @@ -10,7 +10,6 @@ #include "util/ralloc.h" #include "util/sparse_bitset.h" #include "util/u_dynarray.h" -#include "util/u_math.h" #include "util/u_qsort.h" #include "util/u_worklist.h" #include "jay_builder.h" @@ -488,14 +487,8 @@ min_algorithm(struct spill_ctx *ctx, } } - /* Limit W to make space for the operands. - * - * We need to round up to power-of-two destination sizes to match the - * rounding in demand calculation. - */ - bool has_dst = I->dst.file == GPR; - unsigned dst_size = util_next_power_of_two(jay_num_values(I->dst)); - limit(ctx, I, ctx->k - (has_dst ? dst_size : 0)); + /* Limit W to make space for the operands. */ + limit(ctx, I, ctx->k - (I->dst.file == GPR ? jay_num_values(I->dst) : 0)); /* Add destinations to the register file */ if (I->dst.file == GPR) {