pan/bi: Simplify register precolouring in the IR

In the current IR, any register may be preloaded by reading it anywhere, and any
register may be precoloured by writing it anywhere. This is convenient for
instruction selection, but requires the register allocator to do considerable
gymnastics to ensure it doesn't clobber precoloured registers. It also breaks
the purity of our SSA representation, which complicates optimization passes
(e.g. copyprop).

Let's trade some instruction selection complexity for simplifying register
allocation by constraining how register precolouring works. Under the new model:

* Registers may only be preloaded at the start of the program.
* Precoloured destinations are handled explicitly by RA.

Internally, a stronger invariant is imposed on preloading: registers may only be
preloaded by MOV.i32 instructions at the beginning of the first block, and there
must be at most one such move per register. These invariants ensure RA can
trivially coalesce the moves.
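
To make the coalescing step concrete, here is a sketch of the idea, mirroring
(slightly simplified, omitting the blend-shader ABI affinity handling) the loop
added to bi_allocate_registers() in the diff below; lcra_test_linear and
l->affinity come from that diff rather than being new names. A preload is just
a MOV.i32 whose source is a physical register, so RA tries pinning the
destination node to that register whenever the affinity allows it:

    bi_foreach_instr_global(ctx, I) {
       if (I->op != BI_OPCODE_MOV_I32) continue;
       if (I->src[0].type != BI_INDEX_REGISTER) continue;

       unsigned reg = I->src[0].value;
       unsigned node = bi_get_node(I->dest[0]);

       /* Leave nodes that are already pinned alone */
       if (l->solutions[node] != ~0) continue;

       /* Pin to the preloaded register if the affinity allows it... */
       if (l->affinity[node] & BITFIELD64_BIT(reg)) {
          l->solutions[node] = reg;

          /* ...backing out if the pinned solution no longer validates */
          if (!lcra_test_linear(l, l->solutions, node))
             l->solutions[node] = ~0;
       }
    }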

A bi_preload helper is added as a safe replacement for bi_register that respects
these invariants, allowing a smooth transition for instruction selection.
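
As a usage sketch (the variable name is illustrative; the r58 front-face read is
one of the call sites converted in the diff below), instruction selection simply
swaps a raw register read for the helper. The first call for a given register
lazily emits the unique MOV.i32 at the start of the first block and caches its
SSA value in b->shader->preloaded[], so later calls reuse it:

    /* Before: reads the precoloured register directly, anywhere */
    bi_index face = bi_register(58);

    /* After: returns the SSA destination of the unique preload move */
    bi_index face = bi_preload(b, 58);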

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16585>
Alyssa Rosenzweig 2022-05-11 15:39:56 -04:00 committed by Marge Bot
parent dab5b62ecf
commit 3df5446cbd
6 changed files with 163 additions and 66 deletions


@@ -53,7 +53,10 @@ bi_opt_dead_code_eliminate(bi_context *ctx)
             if (ins->op == BI_OPCODE_AXCHG_I32 ||
                 ins->op == BI_OPCODE_ACMPXCHG_I32 ||
                 ins->op == BI_OPCODE_ATOM_RETURN_I32 ||
-                ins->op == BI_OPCODE_ATOM1_RETURN_I32)
+                ins->op == BI_OPCODE_ATOM1_RETURN_I32 ||
+                ins->op == BI_OPCODE_BLEND ||
+                ins->op == BI_OPCODE_ATEST ||
+                ins->op == BI_OPCODE_ZS_EMIT)
                continue;

             if (index < temp_count && !(live[index] & bi_writemask(ins, d)))


@@ -124,13 +124,23 @@ bi_opt_message_preload(bi_context *ctx)
       /* Report the preloading */
       ctx->info.bifrost->messages[nr_preload] = msg;

-      /* Replace with moves at the start. Ideally, they will be
-       * coalesced out or copy propagated.
+      /* Replace with a collect of preloaded registers. The collect
+       * kills the moves, so the collect is free (it is coalesced).
        */
       b.cursor = bi_before_instr(I);
       bi_instr *collect = bi_collect_i32_to(&b, I->dest[0]);
       collect->nr_srcs = bi_count_write_registers(I, 0);
+
+      /* The registers themselves must be preloaded at the start of
+       * the program. Preloaded registers are coalesced, so these
+       * moves are free.
+       */
+      b.cursor = bi_before_block(block);
+
       for (unsigned i = 0; i < collect->nr_srcs; ++i) {
-         collect->src[i] = bi_register((nr_preload * 4) + i);
+         unsigned reg = (nr_preload * 4) + i;
+
+         collect->src[i] = bi_mov_i32(&b, bi_register(reg));
       }
+
       bi_remove_instruction(I);


@@ -352,11 +352,57 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
          node = bi_get_node(ins->src[4]);
          if (node < node_count)
             l->solutions[node] = 4;
+
+         /* Writes to R48 */
+         node = bi_get_node(ins->dest[0]);
+
+         if (!bi_is_null(ins->dest[0])) {
+            assert(node < node_count);
+            l->solutions[node] = 48;
+         }
+      }
+
+      /* Coverage mask writes stay in R60 */
+      if ((ins->op == BI_OPCODE_ATEST ||
+           ins->op == BI_OPCODE_ZS_EMIT) &&
+          !bi_is_null(ins->dest[0])) {
+         unsigned node = bi_get_node(ins->dest[0]);
+         assert(node < node_count);
+         l->solutions[node] = 60;
       }
    }

    bi_compute_interference(ctx, l, full_regs);

+   /* Coalesce register moves if we're allowed. We need to be careful due
+    * to the restricted affinity induced by the blend shader ABI.
+    */
+   bi_foreach_instr_global(ctx, I) {
+      if (I->op != BI_OPCODE_MOV_I32) continue;
+      if (I->src[0].type != BI_INDEX_REGISTER) continue;
+
+      unsigned reg = I->src[0].value;
+      unsigned node = bi_get_node(I->dest[0]);
+      assert(node < node_count);
+
+      if (l->solutions[node] != ~0) continue;
+
+      uint64_t affinity = l->affinity[node];
+
+      if (ctx->inputs->is_blend) {
+         /* We're allowed to coalesce the moves to these */
+         affinity |= BITFIELD64_BIT(48);
+         affinity |= BITFIELD64_BIT(60);
+      }
+
+      /* Try to coalesce */
+      if (affinity & BITFIELD64_BIT(reg)) {
+         l->solutions[node] = reg;
+
+         if (!lcra_test_linear(l, l->solutions, node))
+            l->solutions[node] = ~0;
+      }
+   }
+
    *success = lcra_solve(l);

    return l;


@@ -72,6 +72,47 @@ int bifrost_debug = 0;

 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);

+static bi_index
+bi_preload(bi_builder *b, unsigned reg)
+{
+   if (bi_is_null(b->shader->preloaded[reg])) {
+      /* Insert at the beginning of the shader */
+      bi_builder b_ = *b;
+      b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
+
+      /* Cache the result */
+      b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
+   }
+
+   return b->shader->preloaded[reg];
+}
+
+static bi_index
+bi_coverage(bi_builder *b)
+{
+   if (bi_is_null(b->shader->coverage))
+      b->shader->coverage = bi_preload(b, 60);
+
+   return b->shader->coverage;
+}
+
+/*
+ * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
+ * changed from Bifrost to Valhall. Provide helpers that smooth over the
+ * architectural difference.
+ */
+static inline bi_index
+bi_vertex_id(bi_builder *b)
+{
+   return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
+}
+
+static inline bi_index
+bi_instance_id(bi_builder *b)
+{
+   return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
+}
+
 static void
 bi_block_add_successor(bi_block *block, bi_block *successor)
 {
@@ -269,7 +310,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
    switch (intr->intrinsic) {
    case nir_intrinsic_load_barycentric_centroid:
    case nir_intrinsic_load_barycentric_sample:
-      return bi_register(61);
+      return bi_preload(b, 61);

    /* Need to put the sample ID in the top 16-bits */
    case nir_intrinsic_load_barycentric_at_sample:

@@ -314,7 +355,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)

    case nir_intrinsic_load_barycentric_pixel:
    default:
-      return b->shader->arch >= 9 ? bi_register(61) : bi_dontcare(b);
+      return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
    }
 }

@@ -503,7 +544,7 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
        * logically unused for flat varyings
        */
       if (b->shader->arch >= 9)
-         src0 = bi_register(61);
+         src0 = bi_preload(b, 61);
    }

    nir_src *offset = nir_get_io_offset_src(instr);

@@ -676,7 +717,7 @@ bi_load_sample_id_to(bi_builder *b, bi_index dst)
     * seem to read garbage (despite being architecturally defined
     * as zero), so use a 5-bit mask instead of 8-bits */

-   bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),
+   bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
                         bi_imm_u8(16), false);
 }

@@ -710,25 +751,23 @@ bi_pixel_indices(bi_builder *b, unsigned rt)
    return indices;
 }

+/* Source color is passed through r0-r3, or r4-r7 for the second source when
+ * dual-source blending. Preload the corresponding vector.
+ */
 static void
 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
 {
-   ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
+   unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
+   unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
+   assert(size == 16 || size == 32);

-   /* Source color is passed through r0-r3, or r4-r7 for the second
-    * source when dual-source blending. TODO: Precolour instead */
    bi_index srcs[] = {
-      bi_register(0), bi_register(1), bi_register(2), bi_register(3)
-   };
-   bi_index srcs2[] = {
-      bi_register(4), bi_register(5), bi_register(6), bi_register(7)
+      bi_preload(b, base + 0), bi_preload(b, base + 1),
+      bi_preload(b, base + 2), bi_preload(b, base + 3)
    };

-   bool second_source = (sem.location == VARYING_SLOT_VAR0);
-
-   bi_make_vec_to(b, bi_dest_index(&instr->dest),
-                  second_source ? srcs2 : srcs,
-                  NULL, 4, 32);
+   bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2);
 }

 static void
@@ -755,7 +794,7 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
    if (inputs->is_blend && inputs->blend.nr_samples > 1) {
       /* Conversion descriptor comes from the compile inputs, pixel
       * indices derived at run time based on sample ID */
-      bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60),
+      bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
                 bi_imm_u32(blend_desc >> 32),
                 regfmt, BI_VECSIZE_V4);
    } else if (b->shader->inputs->is_blend) {

@@ -764,8 +803,8 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
       /* Blend descriptor comes from the compile inputs */
       /* Put the result in r0 */

-      bi_blend_to(b, bifrost ? bi_register(0) : bi_null(), rgba,
-                  bi_register(60),
+      bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba,
+                  bi_coverage(b),
                   bi_imm_u32(blend_desc),
                   bi_imm_u32(blend_desc >> 32),
                   bi_null(), regfmt, sr_count, 0);

@@ -774,8 +813,8 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
       * return address on Bifrost is stored in r48 and will be used
       * by the blend shader to jump back to the fragment shader */

-      bi_blend_to(b, bifrost ? bi_register(48) : bi_null(), rgba,
-                  bi_register(60),
+      bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba,
+                  bi_coverage(b),
                   bi_fau(BIR_FAU_BLEND_0 + rt, false),
                   bi_fau(BIR_FAU_BLEND_0 + rt, true),
                   rgba2, regfmt, sr_count, sr_count_2);
@@ -809,9 +848,9 @@ bi_emit_atest(bi_builder *b, bi_index alpha)
       I->flow = 0x8; /* .wait0126 */
    }

-   bi_index coverage = bi_register(60);
-   bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);
+   bi_instr *atest = bi_atest_to(b, bi_temp(b->shader), bi_coverage(b), alpha);
    b->shader->emitted_atest = true;
+   b->shader->coverage = atest->dest[0];

    /* Pseudo-source to encode in the tuple */
    atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);

@@ -845,10 +884,12 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)

    /* By ISA convention, the coverage mask is stored in R60. The store
     * itself will be handled by a subsequent ATEST instruction */
    if (loc == FRAG_RESULT_SAMPLE_MASK) {
-      bi_index orig = bi_register(60);
+      bi_index orig = bi_coverage(b);
       bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
-      bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));
-      bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);
+      bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0));
+
+      b->shader->coverage =
+         bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO);
       return;
    }
@@ -882,9 +923,9 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
       if (writeout & PAN_WRITEOUT_S)
          s = bi_src_index(&instr->src[3]);

-      bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),
+      b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b),
                     writeout & PAN_WRITEOUT_S,
                     writeout & PAN_WRITEOUT_Z);
    }

    if (emit_blend) {

@@ -923,9 +964,9 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
        * Bifrost.
        */
       if (b->shader->arch >= 8)
-         bi_branchzi(b, bi_register(48), bi_register(48), BI_CMPF_NE);
+         bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
       else
-         bi_jump(b, bi_register(48));
+         bi_jump(b, bi_preload(b, 48));
    }
 }

@@ -1032,10 +1073,10 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
       unsigned snap4 = 0x5E;
       uint32_t format = identity | (snap4 << 12) | (regfmt << 24);

-      bi_st_cvt(b, data, bi_register(58), bi_register(59),
+      bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
                 bi_imm_u32(format), regfmt, nr - 1);
    } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
-      bi_index index = bi_register(59);
+      bi_index index = bi_preload(b, 59);

       if (psiz) {
          assert(T_size == 16 && "should've been lowered");
@@ -1487,7 +1528,7 @@ bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)

    for (unsigned i = 0; i < 2; ++i) {
       src[i] = bi_fadd_f32(b,
-                           bi_u16_to_f32(b, bi_half(bi_register(59), i)),
+                           bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)),
                            bi_imm_f32(0.5f));
    }

@@ -1534,7 +1575,7 @@ bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
       I->flow = 0x9; /* .wait */
    }

-   bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_register(60), desc,
+   bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc,
                  regfmt, nr - 1);
    bi_emit_cached_split(b, dest, size * nr);
 }

@@ -1799,7 +1840,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)

    case nir_intrinsic_load_sample_mask_in:
       /* r61[0:15] contains the coverage bitmap */
-      bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
+      bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
       break;

    case nir_intrinsic_load_sample_id:

@@ -1808,7 +1849,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)

    case nir_intrinsic_load_front_face:
       /* r58 == 0 means primitive is front facing */
-      bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
+      bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
                      BI_RESULT_TYPE_M1);
       break;

@@ -1848,20 +1889,20 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)

    case nir_intrinsic_load_local_invocation_id:
       bi_collect_v3i32_to(b, dst,
-                          bi_u16_to_u32(b, bi_half(bi_register(55), 0)),
-                          bi_u16_to_u32(b, bi_half(bi_register(55), 1)),
-                          bi_u16_to_u32(b, bi_half(bi_register(56), 0)));
+                          bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
+                          bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
+                          bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
       break;

    case nir_intrinsic_load_workgroup_id:
-      bi_collect_v3i32_to(b, dst, bi_register(57), bi_register(58),
-                          bi_register(59));
+      bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
+                          bi_preload(b, 59));
       break;

    case nir_intrinsic_load_global_invocation_id:
    case nir_intrinsic_load_global_invocation_id_zero_base:
-      bi_collect_v3i32_to(b, dst, bi_register(60), bi_register(61),
-                          bi_register(62));
+      bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
+                          bi_preload(b, 62));
       break;

    case nir_intrinsic_shader_clock:


@@ -748,6 +748,17 @@ typedef struct {
    bi_block *continue_block;
    bool emitted_atest;

+   /* During NIR->BIR, the coverage bitmap. If this is NULL, the default
+    * coverage bitmap should be source from preloaded register r60. This is
+    * written by ATEST and ZS_EMIT
+    */
+   bi_index coverage;
+
+   /* During NIR->BIR, table of preloaded registers, or NULL if never
+    * preloaded.
+    */
+   bi_index preloaded[64];
+
    /* For creating temporaries */
    unsigned ssa_alloc;
    unsigned reg_alloc;

@@ -1329,23 +1340,6 @@ bi_dontcare(bi_builder *b)
    return bi_passthrough(BIFROST_SRC_FAU_HI);
 }

-/*
- * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
- * changed from Bifrost to Valhall. Provide helpers that smooth over the
- * architectural difference.
- */
-static inline bi_index
-bi_vertex_id(bi_builder *b)
-{
-   return bi_register((b->shader->arch >= 9) ? 60 : 61);
-}
-
-static inline bi_index
-bi_instance_id(bi_builder *b)
-{
-   return bi_register((b->shader->arch >= 9) ? 61 : 62);
-}
-
 #define bi_worklist_init(ctx, w) u_worklist_init(w, ctx->num_blocks, ctx)
 #define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index)
 #define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index)


@@ -83,8 +83,11 @@ protected:
       bi_instr *I = bi_collect_i32_to(b, dest);
       I->nr_srcs = count;

+      b->cursor = bi_before_block(bi_start_block(&b->shader->blocks));
+
       for (int i = 0; i < count; ++i)
-         I->src[i] = bi_register(idx*4 + i);
+         I->src[i] = bi_mov_i32(b, bi_register(idx*4 + i));
+
+      b->cursor = bi_after_instr(I);
    }
 };