pan: Centralize preload registers

Rather than having preload registers hardcoded over multiple files,
gather them in one place with an enum abstraction.

This should simplify updates to the preload registers.

Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40643>
Author: Lars-Ivar Hesselberg Simonsen, 2026-03-25 16:04:01 +01:00 (committed by Marge Bot)
parent 1e052f0bb5
commit 1f0370616a
4 changed files with 228 additions and 61 deletions

View file

@ -321,8 +321,9 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
uint64_t preload_live, unsigned node_count, bool is_blend,
bool split_file, bool aligned_sr)
bool split_file, unsigned arch)
{
bool aligned_sr = arch >= 9;
bi_foreach_instr_in_block_rev(block, ins) {
/* Mark all registers live after the instruction as
* interfering with the destination */
@ -383,8 +384,10 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
}
if (!is_blend && ins->op == BI_OPCODE_BLEND) {
/* Blend shaders might clobber r0-r15, r48. */
uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);
/* Blend shaders might clobber r0-r15, blend link reg. */
uint64_t clobber =
BITFIELD64_MASK(16) |
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_BLEND_LINK, arch));
for (unsigned i = 0; i < node_count; ++i) {
if (live[i])
@ -410,7 +413,7 @@ bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc);
bi_mark_interference(blk, l, live, blk->reg_live_out, ctx->ssa_alloc,
ctx->inputs->is_blend, !full_regs, ctx->arch >= 9);
ctx->inputs->is_blend, !full_regs, ctx->arch);
free(live);
}
@ -438,36 +441,43 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
bi_foreach_dest(ins, d)
l->affinity[ins->dest[d].value] = default_affinity;
/* Blend shaders expect the src colour to be in r0-r3 */
/* Blend shaders expect the src colour to be in blend_src0_c0
* through c3 */
if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) {
assert(bi_is_ssa(ins->src[0]));
l->solutions[ins->src[0].value] = 0;
l->solutions[ins->src[0].value] =
bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, ctx->arch);
/* Dual source blend input in r4-r7 */
/* Dual source blend input in blend_src1_c0 through c3 */
if (bi_is_ssa(ins->src[4]))
l->solutions[ins->src[4].value] = 4;
l->solutions[ins->src[4].value] =
bi_preload_reg(BI_PRELOAD_BLEND_SRC1_C0, ctx->arch);
/* Writes to R48 */
/* Writes to blend link */
if (!bi_is_null(ins->dest[0]))
l->solutions[ins->dest[0].value] = 48;
l->solutions[ins->dest[0].value] =
bi_preload_reg(BI_PRELOAD_BLEND_LINK, ctx->arch);
}
/* Coverage mask writes stay in R60 */
/* Coverage mask writes stay in the cumulative coverage reg */
if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) &&
!bi_is_null(ins->dest[0])) {
l->solutions[ins->dest[0].value] = 60;
l->solutions[ins->dest[0].value] =
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch);
}
/* Experimentally, it seems coverage masks inputs to ATEST must
* be in R60. Otherwise coverage mask writes do not work with
* early-ZS with pixel-frequency-shading (this combination of
* settings is legal if depth/stencil writes are disabled).
* Allowing a FAU index also seems to work on Valhall, at least.
* be in the cumulative coverage reg. Otherwise coverage mask
* writes do not work with early-ZS with pixel-frequency-shading
* (this combination of settings is legal if depth/stencil
* writes are disabled). Allowing a FAU index also seems to
* work on Valhall, at least.
*/
if (ins->op == BI_OPCODE_ATEST) {
assert(bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU);
if (bi_is_ssa(ins->src[0]))
l->solutions[ins->src[0].value] = 60;
l->solutions[ins->src[0].value] =
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch);
}
}
@ -492,8 +502,10 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
if (ctx->inputs->is_blend) {
/* We're allowed to coalesce the moves to these */
affinity |= BITFIELD64_BIT(48);
affinity |= BITFIELD64_BIT(60);
affinity |=
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_BLEND_LINK, ctx->arch));
affinity |= BITFIELD64_BIT(
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch));
}
/* Try to coalesce */
@ -595,14 +607,15 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
bi_foreach_instr_global(ctx, ins) {
bi_foreach_dest(ins, d) {
/* Don't allow spilling coverage mask writes because the
* register preload logic assumes it will stay in R60.
* This could be optimized.
* register preload logic assumes it will stay in the
* cumulative coverage reg. This could be optimized.
*/
if (ins->no_spill || ins->op == BI_OPCODE_ATEST ||
ins->op == BI_OPCODE_ZS_EMIT ||
(ins->op == BI_OPCODE_MOV_I32 &&
ins->src[0].type == BI_INDEX_REGISTER &&
ins->src[0].value == 60)) {
ins->src[0].value ==
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch))) {
BITSET_SET(no_spill, ins->dest[d].value);
}
}

View file

@ -32,8 +32,9 @@ static void pan_stats_verbose(FILE *f, const char *prefix, bi_context *ctx,
static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
static bi_index
bi_preload(bi_builder *b, unsigned reg)
bi_preload(bi_builder *b, enum bi_preload val)
{
unsigned reg = bi_preload_reg(val, b->shader->arch);
if (bi_is_null(b->shader->preloaded[reg])) {
/* Insert at the beginning of the shader */
bi_builder b_ = *b;
@ -50,7 +51,7 @@ static bi_index
bi_coverage(bi_builder *b)
{
if (bi_is_null(b->shader->coverage))
b->shader->coverage = bi_preload(b, 60);
b->shader->coverage = bi_preload(b, BI_PRELOAD_CUMULATIVE_COVERAGE);
return b->shader->coverage;
}
@ -63,20 +64,20 @@ bi_coverage(bi_builder *b)
static inline bi_index
bi_vertex_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
return bi_preload(b, BI_PRELOAD_VERTEX_ID);
}
static inline bi_index
bi_instance_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
return bi_preload(b, BI_PRELOAD_INSTANCE_ID);
}
static inline bi_index
bi_draw_id(bi_builder *b)
{
assert(b->shader->arch >= 9);
return bi_preload(b, 62);
return bi_preload(b, BI_PRELOAD_DRAW_ID);
}
static void
@ -258,8 +259,9 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_centroid:
return bi_preload(b, BI_PRELOAD_CENTROID_ID);
case nir_intrinsic_load_barycentric_sample:
return bi_preload(b, 61);
return bi_preload(b, BI_PRELOAD_SAMPLE_ID);
/* Need to put the sample ID in the top 16-bits */
case nir_intrinsic_load_barycentric_at_sample:
@ -328,7 +330,8 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
case nir_intrinsic_load_barycentric_pixel:
default:
return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
return b->shader->arch >= 9 ? bi_preload(b, BI_PRELOAD_CENTROID_ID)
: bi_dontcare(b);
}
}
@ -550,7 +553,8 @@ bi_emit_lea_attr(bi_builder *b, nir_intrinsic_instr *intr)
unsigned snap4 = 0x5E;
uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
bi_collect_v3i32_to(b, bi_def_index(&intr->def),
bi_preload(b, 58), bi_preload(b, 59),
bi_preload(b, BI_PRELOAD_POS_RESULT_PTR_LO),
bi_preload(b, BI_PRELOAD_POS_RESULT_PTR_HI),
bi_imm_u32(format));
return;
}
@ -838,8 +842,8 @@ bi_load_sample_id_to(bi_builder *b, bi_index dst)
* seem to read garbage (despite being architecturally defined
* as zero), so use a 5-bit mask instead of 8-bits */
bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
bi_imm_u8(16), false);
bi_rshift_and_i32_to(b, dst, bi_preload(b, BI_PRELOAD_SAMPLE_ID),
bi_imm_u32(0x1f), bi_imm_u8(16), false);
}
static bi_index
@ -872,12 +876,24 @@ static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned base = sem.dual_source_blend_index * 4;
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
assert(size == 16 || size == 32);
bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
bi_preload(b, base + 2), bi_preload(b, base + 3)};
bi_index srcs[4];
switch (sem.dual_source_blend_index) {
case 0:
srcs[0] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C0);
srcs[1] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C1);
srcs[2] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C2);
srcs[3] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C3);
break;
case 1:
srcs[0] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C0);
srcs[1] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C1);
srcs[2] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C2);
srcs[3] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C3);
break;
}
bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
}
@ -1759,7 +1775,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_cumulative_coverage_pan:
bi_mov_i32_to(b, dst, bi_preload(b, 60));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_CUMULATIVE_COVERAGE));
break;
case nir_intrinsic_load_blend_descriptor_pan: {
@ -1841,16 +1857,15 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
}
case nir_intrinsic_blend_return_pan:
/* Jump back to the fragment shader, return address is stored
* in r48 (see above). On Valhall, only jump if the address is
* nonzero. The check is free there and it implements the "jump
* to 0 terminates the blend shader" that's automatic on
* Bifrost.
/* Jump back to the fragment shader. On Valhall, only jump if the address
* is nonzero. The check is free there and it implements the "jump to 0
* terminates the blend shader" that's automatic on Bifrost.
*/
if (b->shader->arch >= 9)
bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
bi_branchzi(b, bi_preload(b, BI_PRELOAD_BLEND_LINK),
bi_preload(b, BI_PRELOAD_BLEND_LINK), BI_CMPF_NE);
else
bi_jump(b, bi_preload(b, 48));
bi_jump(b, bi_preload(b, BI_PRELOAD_BLEND_LINK));
break;
case nir_intrinsic_load_ubo:
@ -2008,7 +2023,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
case nir_intrinsic_load_pixel_coord:
/* Vectorized load of the preloaded i16vec2 */
bi_mov_i32_to(b, dst, bi_preload(b, 59));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_POSITION_XY));
break;
case nir_intrinsic_load_texel_buf_conv_pan:
@ -2033,7 +2048,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_idvs_output_buf_index_pan:
bi_mov_i32_to(b, dst, bi_preload(b, 59));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_INTERNAL_ID));
break;
case nir_intrinsic_lea_attr_pan:
@ -2067,8 +2082,9 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_sample_mask_in:
/* r61[0:15] contains the coverage bitmap */
bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
/* [0:15] contains the coverage bitmap */
bi_u16_to_u32_to(
b, dst, bi_half(bi_preload(b, BI_PRELOAD_RASTERIZER_COVERAGE), false));
break;
case nir_intrinsic_load_sample_mask:
@ -2080,12 +2096,12 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_primitive_id:
bi_mov_i32_to(b, dst, bi_preload(b, 57));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_PRIMITIVE_ID));
break;
case nir_intrinsic_load_front_face: {
/* (r58 & 1) == 0 means primitive is front facing */
bi_index primitive_facing = bi_preload(b, 58);
/* (primitive_flags & 1) == 0 means primitive is front facing */
bi_index primitive_facing = bi_preload(b, BI_PRELOAD_PRIMITIVE_FLAGS);
/* Starting with v11, there is more fields defined in the primitive flags */
if (b->shader->arch >= 11)
@ -2150,20 +2166,23 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
}
case nir_intrinsic_load_local_invocation_id:
bi_collect_v3i32_to(b, dst,
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
bi_collect_v3i32_to(
b, dst,
bi_u16_to_u32(b, bi_half(bi_preload(b, BI_PRELOAD_LOCAL_ID_0), 0)),
bi_u16_to_u32(b, bi_half(bi_preload(b, BI_PRELOAD_LOCAL_ID_1), 1)),
bi_u16_to_u32(b, bi_half(bi_preload(b, BI_PRELOAD_LOCAL_ID_2), 0)));
break;
case nir_intrinsic_load_workgroup_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
bi_preload(b, 59));
bi_collect_v3i32_to(b, dst, bi_preload(b, BI_PRELOAD_WORKGROUP_ID_0),
bi_preload(b, BI_PRELOAD_WORKGROUP_ID_1),
bi_preload(b, BI_PRELOAD_WORKGROUP_ID_2));
break;
case nir_intrinsic_load_global_invocation_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
bi_preload(b, 62));
bi_collect_v3i32_to(b, dst, bi_preload(b, BI_PRELOAD_GLOBAL_ID_0),
bi_preload(b, BI_PRELOAD_GLOBAL_ID_1),
bi_preload(b, BI_PRELOAD_GLOBAL_ID_2));
break;
case nir_intrinsic_shader_clock:
@ -2190,7 +2209,9 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
case nir_intrinsic_load_view_index:
case nir_intrinsic_load_layer_id:
assert(b->shader->arch >= 9);
bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
bi_mov_i32_to(
b, dst,
bi_u8_to_u32(b, bi_byte(bi_preload(b, BI_PRELOAD_FRAME_ARG), 0)));
break;
case nir_intrinsic_load_ssbo_address:

View file

@ -1116,6 +1116,137 @@ enum bi_idvs_mode {
#define BI_MAX_REGS 64
/* Abstract identifiers for the fixed registers the hardware preloads at
 * shader entry. Concrete register numbers differ between GPU generations,
 * so call sites use these symbolic names and resolve them through
 * bi_preload_reg() instead of hardcoding register indices.
 *
 * Entries are grouped by the shader stage that consumes them. Some
 * identifiers are only valid on a subset of architectures (enforced by
 * asserts in bi_preload_reg()), and several share a register, living in
 * different bit ranges of it.
 */
enum bi_preload {
/* Compute */
BI_PRELOAD_LOCAL_ID_0,
BI_PRELOAD_LOCAL_ID_1,
BI_PRELOAD_LOCAL_ID_2,
BI_PRELOAD_WORKGROUP_ID_0,
BI_PRELOAD_WORKGROUP_ID_1,
BI_PRELOAD_WORKGROUP_ID_2,
BI_PRELOAD_GLOBAL_ID_0,
BI_PRELOAD_GLOBAL_ID_1,
BI_PRELOAD_GLOBAL_ID_2,
/* Vertex */
BI_PRELOAD_POS_RESULT_PTR_LO,
BI_PRELOAD_POS_RESULT_PTR_HI,
BI_PRELOAD_INTERNAL_ID,
BI_PRELOAD_VERTEX_ID,
BI_PRELOAD_INSTANCE_ID,
BI_PRELOAD_DRAW_ID,
BI_PRELOAD_VIEW_ID,
/* Fragment */
BI_PRELOAD_PRIMITIVE_ID,
BI_PRELOAD_PRIMITIVE_FLAGS,
BI_PRELOAD_POSITION_XY,
BI_PRELOAD_CUMULATIVE_COVERAGE,
BI_PRELOAD_RASTERIZER_COVERAGE,
BI_PRELOAD_SAMPLE_ID,
BI_PRELOAD_CENTROID_ID,
BI_PRELOAD_FRAME_ARG,
/* Blend */
BI_PRELOAD_BLEND_SRC0_C0,
BI_PRELOAD_BLEND_SRC0_C1,
BI_PRELOAD_BLEND_SRC0_C2,
BI_PRELOAD_BLEND_SRC0_C3,
BI_PRELOAD_BLEND_SRC1_C0,
BI_PRELOAD_BLEND_SRC1_C1,
BI_PRELOAD_BLEND_SRC1_C2,
BI_PRELOAD_BLEND_SRC1_C3,
BI_PRELOAD_BLEND_LINK,
};
/* Map an abstract preload identifier to the concrete hardware register
 * number for the given architecture.
 *
 * @param val  Which preloaded value to resolve.
 * @param arch Major GPU architecture revision; per the call sites in this
 *             change, arch >= 9 corresponds to Valhall.
 * @return The register index (< BI_MAX_REGS) holding the preloaded value.
 *
 * Some mappings are arch-dependent (vertex/instance IDs shift on v9+),
 * and some identifiers exist only on certain generations — the asserts
 * enforce that callers never request them elsewhere. Where several
 * preloads share one register, the "Bits [hi;lo]" comments record the
 * sub-register bit range holding each value; callers are responsible for
 * extracting the right bits.
 */
static inline unsigned
bi_preload_reg(enum bi_preload val, unsigned arch)
{
switch (val) {
/* Compute */
case BI_PRELOAD_LOCAL_ID_0:
/* Bits [15;0] */
return 55;
case BI_PRELOAD_LOCAL_ID_1:
/* Bits [31;16] */
return 55;
case BI_PRELOAD_LOCAL_ID_2:
/* Bits [15;0] */
return 56;
case BI_PRELOAD_WORKGROUP_ID_0:
return 57;
case BI_PRELOAD_WORKGROUP_ID_1:
return 58;
case BI_PRELOAD_WORKGROUP_ID_2:
return 59;
case BI_PRELOAD_GLOBAL_ID_0:
return 60;
case BI_PRELOAD_GLOBAL_ID_1:
return 61;
case BI_PRELOAD_GLOBAL_ID_2:
return 62;
/* Vertex */
case BI_PRELOAD_POS_RESULT_PTR_LO:
/* Bifrost only: position result pointer is preloaded pre-v9 */
assert(arch < 9);
return 58;
case BI_PRELOAD_POS_RESULT_PTR_HI:
assert(arch < 9);
return 59;
case BI_PRELOAD_INTERNAL_ID:
/* Valhall only */
assert(arch >= 9);
return 59;
case BI_PRELOAD_VERTEX_ID:
return (arch >= 9) ? 60 : 61;
case BI_PRELOAD_INSTANCE_ID:
return (arch >= 9) ? 61 : 62;
case BI_PRELOAD_DRAW_ID:
assert(arch >= 9);
return 62;
case BI_PRELOAD_VIEW_ID:
assert(arch >= 9);
return 63;
/* Fragment */
case BI_PRELOAD_PRIMITIVE_ID:
return 57;
case BI_PRELOAD_PRIMITIVE_FLAGS:
return 58;
case BI_PRELOAD_POSITION_XY:
/* Packed i16vec2 pixel coordinate */
return 59;
case BI_PRELOAD_CUMULATIVE_COVERAGE:
/* Bits [15;0] */
return 60;
case BI_PRELOAD_RASTERIZER_COVERAGE:
/* Bits [15;0] */
return 61;
case BI_PRELOAD_SAMPLE_ID:
/* Bits [23;16] */
return 61;
case BI_PRELOAD_CENTROID_ID:
/* Bits [31;24] */
return 61;
case BI_PRELOAD_FRAME_ARG:
/* Double reg */
return 62;
/* Blend: source colours occupy r0-r7 by ABI, link register is r48 */
case BI_PRELOAD_BLEND_SRC0_C0:
return 0;
case BI_PRELOAD_BLEND_SRC0_C1:
return 1;
case BI_PRELOAD_BLEND_SRC0_C2:
return 2;
case BI_PRELOAD_BLEND_SRC0_C3:
return 3;
case BI_PRELOAD_BLEND_SRC1_C0:
return 4;
case BI_PRELOAD_BLEND_SRC1_C1:
return 5;
case BI_PRELOAD_BLEND_SRC1_C2:
return 6;
case BI_PRELOAD_BLEND_SRC1_C3:
return 7;
case BI_PRELOAD_BLEND_LINK:
return 48;
}
/* No default case above so the compiler flags unhandled enum values */
UNREACHABLE("Non-handled BI_PRELOAD");
}
typedef struct {
const struct pan_compile_inputs *inputs;
nir_shader *nir;

View file

@ -1155,8 +1155,10 @@ va_lower_blend(bi_context *ctx)
unsigned prolog_length = 2 * 8;
/* By ABI, r48 is the link register shared with blend shaders */
assert(bi_is_equiv(I->dest[0], bi_register(48)));
/* By ABI, the preload blend link register is shared with blend
* shaders */
assert(bi_is_equiv(I->dest[0], bi_register(bi_preload_reg(
BI_PRELOAD_BLEND_LINK, ctx->arch))));
if (I->flow == VA_FLOW_END)
bi_iadd_imm_i32_to(&b, I->dest[0], va_zero_lut(), 0);