mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-04 10:58:15 +02:00
jay: allow npot operands in RA
As long as we round up the /alignments/ in RA, and pad to power-of-two when calculating partitions (trivially true now, though this informs future work), this is fine.

SIMD16:
Totals from 1001 (37.82% of 2647) affected shaders:
Instrs: 1897734 -> 1896157 (-0.08%); split: -0.25%, +0.16%
CodeSize: 28330256 -> 28315472 (-0.05%); split: -0.30%, +0.25%
Number of spill instructions: 1003 -> 999 (-0.40%)
Number of fill instructions: 990 -> 986 (-0.40%)

SIMD32:
Totals from 1230 (46.47% of 2647) affected shaders:
Instrs: 3284649 -> 3277437 (-0.22%); split: -1.18%, +0.96%
CodeSize: 48977696 -> 48907376 (-0.14%); split: -1.10%, +0.96%
Number of spill instructions: 41004 -> 40582 (-1.03%); split: -1.05%, +0.02%
Number of fill instructions: 39298 -> 38572 (-1.85%); split: -1.91%, +0.06%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41808>
This commit is contained in:
parent
30c392fc55
commit
feb9ac168b
5 changed files with 14 additions and 64 deletions
|
|
@ -6,7 +6,6 @@
|
|||
#include "util/bitset.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/sparse_bitset.h"
|
||||
#include "util/u_math.h"
|
||||
#include "util/u_worklist.h"
|
||||
#include "jay_ir.h"
|
||||
#include "jay_opcodes.h"
|
||||
|
|
@ -183,7 +182,7 @@ jay_calculate_register_demands(jay_function *func)
|
|||
jay_foreach_inst_in_block(block, I) {
|
||||
/* Make destinations live */
|
||||
jay_foreach_dst(I, d) {
|
||||
demands[d.file] += util_next_power_of_two(jay_num_values(d));
|
||||
demands[d.file] += jay_num_values(d);
|
||||
}
|
||||
|
||||
/* Update maximum demands */
|
||||
|
|
@ -201,11 +200,6 @@ jay_calculate_register_demands(jay_function *func)
|
|||
}
|
||||
}
|
||||
|
||||
jay_foreach_dst(I, d) {
|
||||
unsigned n = jay_num_values(d);
|
||||
demands[d.file] -= util_next_power_of_two(n) - n;
|
||||
}
|
||||
|
||||
/* Late-kill sources. Duplicated sources are only marked killed once,
|
||||
* so we do not need to filter out duplicates.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "util/bitscan.h"
|
||||
#include "util/hash_table.h"
|
||||
#include "util/lut.h"
|
||||
#include "util/macros.h"
|
||||
|
|
@ -14,37 +13,12 @@
|
|||
#include "jay_opcodes.h"
|
||||
#include "jay_private.h"
|
||||
|
||||
/*
|
||||
* Register allocation operates only on power-of-two vectors. Pad out
|
||||
* non-power-of-two vectors with null values to simplify RA.
|
||||
*/
|
||||
/* Returns x unchanged when jay_num_values(x) is zero or a power of two.
 * Otherwise returns the result of jay_collect() over x's channels, with the
 * channel count rounded up to the next power of two.
 */
static jay_def
|
||||
lower_npot_vector(jay_builder *b, jay_def x)
|
||||
{
|
||||
unsigned n = jay_num_values(x);
|
||||
|
||||
if (!util_is_power_of_two_or_zero(n)) {
|
||||
/* Zero-initialized so the padding slots in [n, next_pot(n)) stay 0
 * (presumably index 0 denotes the null value -- confirm).
 * NOTE(review): assumes util_next_power_of_two(n) <= JAY_MAX_DEF_LENGTH.
 */
uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
|
||||
|
||||
/* Copy the n existing channels of x into the leading slots */
for (unsigned i = 0; i < n; ++i) {
|
||||
indices[i] = jay_channel(x, i);
|
||||
}
|
||||
|
||||
/* Rebuild x in the same file with the padded, power-of-two length */
x = jay_collect(b, x.file, indices, util_next_power_of_two(n));
|
||||
}
|
||||
|
||||
/* Either branch must leave x with a power-of-two (or zero) value count */
assert(util_is_power_of_two_or_zero(jay_num_values(x)) && "post-cond");
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Vectors need to be allocated to contiguous registers. Furthermore, we
|
||||
* require power-of-two sizes in certain cases, that's handled here too.
|
||||
*
|
||||
* This means that a value cannot appear in multiple channels of an
|
||||
* instruction, as register allocation would need to assign the same value to
|
||||
* locations <X+i> and <X+j>. Scalars don't have this restriction, except for
|
||||
* SENDs because the hardware bans repeated sources.
|
||||
* Vectors need to be allocated to contiguous registers. This means that a value
|
||||
* cannot appear in multiple channels of an instruction, as register allocation
|
||||
* would need to assign the same value to locations <X+i> and <X+j>. Scalars
|
||||
* don't have this restriction, except for SENDs because the hardware bans
|
||||
* repeated sources.
|
||||
*
|
||||
* If a value appears in multiple positions, we emit copies so that each
|
||||
* can be register allocated in the correct position.
|
||||
|
|
@ -75,7 +49,7 @@ lower_contiguous_sources(jay_builder *b, jay_inst *I)
|
|||
}
|
||||
}
|
||||
|
||||
jay_replace_src(&I->src[s], lower_npot_vector(b, I->src[s]));
|
||||
jay_replace_src(&I->src[s], I->src[s]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1042,13 +1042,8 @@ assign_regs_for_inst(jay_ra_state *ra, jay_inst *I)
|
|||
bool killed = false;
|
||||
jay_def var = *(vars[i]);
|
||||
unsigned size = jay_num_values(var);
|
||||
if (is_src) {
|
||||
assert(util_is_power_of_two_nonzero(size) && "NPOT sources lowered");
|
||||
} else {
|
||||
size = util_next_power_of_two(size);
|
||||
}
|
||||
|
||||
unsigned alignment = I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : size;
|
||||
unsigned alignment =
|
||||
I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : util_next_power_of_two(size);
|
||||
enum jay_file file = var.file;
|
||||
enum jay_stride min_stride = JAY_STRIDE_2, max_stride = JAY_STRIDE_8;
|
||||
|
||||
|
|
@ -1169,9 +1164,8 @@ assign_regs_for_inst(jay_ra_state *ra, jay_inst *I)
|
|||
/* Reset data structures */
|
||||
for (unsigned i = 0; i < nr_vars; ++i) {
|
||||
jay_def var = *(vars[i]);
|
||||
unsigned n = util_next_power_of_two(jay_num_values(var));
|
||||
BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, n);
|
||||
BITSET_CLEAR_COUNT(ra->killed[var.file], var.reg, n);
|
||||
BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, jay_num_values(var));
|
||||
BITSET_CLEAR_COUNT(ra->killed[var.file], var.reg, jay_num_values(var));
|
||||
}
|
||||
|
||||
/* Sources selected for early-kill have had their last_use fields cleared.
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ calculate_pressure_delta_before(struct sched_ctx *ctx, jay_inst *I)
|
|||
|
||||
/* Make destinations live */
|
||||
jay_foreach_dst(I, dst) {
|
||||
delta += util_next_power_of_two(jay_num_values(dst)) * scale(ctx, dst);
|
||||
delta += jay_num_values(dst) * scale(ctx, dst);
|
||||
}
|
||||
|
||||
return delta;
|
||||
|
|
@ -157,11 +157,6 @@ calculate_pressure_delta_after(struct sched_ctx *ctx, jay_inst *I)
|
|||
delta -= !u_sparse_bitset_test(&ctx->live, index) * scale(ctx, I->dst);
|
||||
}
|
||||
|
||||
jay_foreach_dst(I, d) {
|
||||
unsigned n = jay_num_values(d);
|
||||
delta -= (util_next_power_of_two(n) - n) * scale(ctx, I->dst);
|
||||
}
|
||||
|
||||
/* Late-kill sources. We precomputed the deduplication info and stashed it in
|
||||
* the I->last_use bitfield for convenience.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@
|
|||
#include "util/ralloc.h"
|
||||
#include "util/sparse_bitset.h"
|
||||
#include "util/u_dynarray.h"
|
||||
#include "util/u_math.h"
|
||||
#include "util/u_qsort.h"
|
||||
#include "util/u_worklist.h"
|
||||
#include "jay_builder.h"
|
||||
|
|
@ -488,14 +487,8 @@ min_algorithm(struct spill_ctx *ctx,
|
|||
}
|
||||
}
|
||||
|
||||
/* Limit W to make space for the operands.
|
||||
*
|
||||
* We need to round up to power-of-two destination sizes to match the
|
||||
* rounding in demand calculation.
|
||||
*/
|
||||
bool has_dst = I->dst.file == GPR;
|
||||
unsigned dst_size = util_next_power_of_two(jay_num_values(I->dst));
|
||||
limit(ctx, I, ctx->k - (has_dst ? dst_size : 0));
|
||||
/* Limit W to make space for the operands. */
|
||||
limit(ctx, I, ctx->k - (I->dst.file == GPR ? jay_num_values(I->dst) : 0));
|
||||
|
||||
/* Add destinations to the register file */
|
||||
if (I->dst.file == GPR) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue