mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 04:38:03 +02:00
pan/bi: Implement basic scoreboarding pass
Extend our existing bi_scoreboard infrastructure with a simple data flow analysis pass that calculates which dependency slots need waiting. We still lack a heuristic for selecting dependency slots. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14298>
This commit is contained in:
parent
8f25d88d90
commit
c81c022e66
3 changed files with 244 additions and 29 deletions
|
|
@ -36,6 +36,12 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
|
|||
unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
|
||||
dependency_wait |= next_2 ? next_2->dependencies : 0;
|
||||
|
||||
/* Signal barriers (slot #7) immediately. This is not optimal but good
|
||||
* enough. Doing better requires extending the IR and scheduler.
|
||||
*/
|
||||
if (clause->message_type == BIFROST_MESSAGE_BARRIER)
|
||||
dependency_wait |= BITFIELD_BIT(7);
|
||||
|
||||
bool staging_barrier = next_1 ? next_1->staging_barrier : false;
|
||||
staging_barrier |= next_2 ? next_2->staging_barrier : 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -1860,9 +1860,6 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint
|
|||
clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
|
||||
clause->block = block;
|
||||
|
||||
/* TODO: scoreboard assignment post-sched */
|
||||
clause->dependencies |= (1 << 0);
|
||||
|
||||
/* We emit in reverse and emitted to the back of the tuples array, so
|
||||
* move it up front for easy indexing */
|
||||
memmove(clause->tuples,
|
||||
|
|
|
|||
|
|
@ -54,55 +54,267 @@
|
|||
*/
|
||||
|
||||
#define BI_NUM_GENERAL_SLOTS 6
|
||||
#define BI_NUM_SLOTS 8
|
||||
#define BI_NUM_REGISTERS 64
|
||||
#define BI_SLOT_SERIAL 0 /* arbitrary */
|
||||
|
||||
/* A model for the state of the scoreboard */
|
||||
/*
|
||||
* Due to the crude scoreboarding we do, we need to serialize varying loads and
|
||||
* memory access. Identify these instructions here.
|
||||
*/
|
||||
static bool
|
||||
bi_should_serialize(bi_instr *I)
|
||||
{
|
||||
/* Although nominally on the attribute unit, image loads have the same
|
||||
* coherency requirements as general memory loads. Serialize them for
|
||||
* now until we can do something more clever.
|
||||
*/
|
||||
if (I->op == BI_OPCODE_LD_ATTR_TEX)
|
||||
return true;
|
||||
|
||||
struct bi_scoreboard_state {
|
||||
/* TODO: what do we track here for a heuristic? */
|
||||
};
|
||||
switch (bi_opcode_props[I->op].message) {
|
||||
case BIFROST_MESSAGE_VARYING:
|
||||
case BIFROST_MESSAGE_LOAD:
|
||||
case BIFROST_MESSAGE_STORE:
|
||||
case BIFROST_MESSAGE_ATOMIC:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Given a scoreboard model, choose a slot for a clause wrapping a given
|
||||
* message passing instruction. No side effects. */
|
||||
|
||||
static unsigned
|
||||
bi_choose_scoreboard_slot(struct bi_scoreboard_state *st, bi_instr *message)
|
||||
bi_choose_scoreboard_slot(bi_instr *message)
|
||||
{
|
||||
/* A clause that does not produce a message must use slot #0 */
|
||||
if (!message)
|
||||
return 0;
|
||||
|
||||
switch (message->op) {
|
||||
/* ATEST, ZS_EMIT must be issued with slot #0 */
|
||||
case BI_OPCODE_ATEST:
|
||||
case BI_OPCODE_ZS_EMIT:
|
||||
if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
|
||||
return 0;
|
||||
|
||||
/* BARRIER must be issued with slot #7 */
|
||||
case BI_OPCODE_BARRIER:
|
||||
if (message->op == BI_OPCODE_BARRIER)
|
||||
return 7;
|
||||
|
||||
default:
|
||||
break;
|
||||
/* For now, make serialization is easy */
|
||||
if (bi_should_serialize(message))
|
||||
return BI_SLOT_SERIAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
bi_read_mask(bi_instr *I, bool staging_only)
|
||||
{
|
||||
uint64_t mask = 0;
|
||||
|
||||
if (staging_only && !bi_opcode_props[I->op].sr_read)
|
||||
return mask;
|
||||
|
||||
bi_foreach_src(I, s) {
|
||||
if (I->src[s].type == BI_INDEX_REGISTER) {
|
||||
unsigned reg = I->src[s].value;
|
||||
unsigned count = bi_count_read_registers(I, s);
|
||||
|
||||
mask |= (BITFIELD64_MASK(count) << reg);
|
||||
}
|
||||
|
||||
if (staging_only)
|
||||
break;
|
||||
}
|
||||
|
||||
/* TODO: Use a heuristic */
|
||||
return 0;
|
||||
return mask;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
bi_write_mask(bi_instr *I)
|
||||
{
|
||||
uint64_t mask = 0;
|
||||
|
||||
bi_foreach_dest(I, d) {
|
||||
if (bi_is_null(I->dest[d])) continue;
|
||||
|
||||
assert(I->dest[d].type == BI_INDEX_REGISTER);
|
||||
|
||||
unsigned reg = I->dest[d].value;
|
||||
unsigned count = bi_count_write_registers(I, d);
|
||||
|
||||
mask |= (BITFIELD64_MASK(count) << reg);
|
||||
}
|
||||
|
||||
/* Instructions like AXCHG.i32 unconditionally both read and write
|
||||
* staging registers. Even if we discard the result, the write still
|
||||
* happens logically and needs to be included in our calculations.
|
||||
* Obscurely, ATOM_CX is sr_write but can ignore the staging register in
|
||||
* certain circumstances; this does not require consideration.
|
||||
*/
|
||||
if (bi_opcode_props[I->op].sr_write && bi_is_null(I->dest[0]) &&
|
||||
!bi_is_null(I->src[0])) {
|
||||
|
||||
unsigned reg = I->src[0].value;
|
||||
unsigned count = bi_count_write_registers(I, 0);
|
||||
|
||||
mask |= (BITFIELD64_MASK(count) << reg);
|
||||
}
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
/* Update the scoreboard model to assign an instruction to a given slot */
|
||||
|
||||
static void
|
||||
bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
|
||||
{
|
||||
bi_instr *I = clause->message;
|
||||
unsigned slot = clause->scoreboard_id;
|
||||
|
||||
if (!I)
|
||||
return;
|
||||
|
||||
st->read[slot] |= bi_read_mask(I, true);
|
||||
|
||||
if (bi_opcode_props[I->op].sr_write)
|
||||
st->write[slot] |= bi_write_mask(I);
|
||||
}
|
||||
|
||||
/* Adds a dependency on each slot writing any specified register */
|
||||
|
||||
static void
|
||||
bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
|
||||
{
|
||||
for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
|
||||
if (!(st->write[slot] & regmask))
|
||||
continue;
|
||||
|
||||
st->write[slot] = 0;
|
||||
st->read[slot] = 0;
|
||||
|
||||
clause->dependencies |= BITFIELD_BIT(slot);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
|
||||
{
|
||||
for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
|
||||
if (!(st->read[slot] & regmask))
|
||||
continue;
|
||||
|
||||
st->read[slot] = 0;
|
||||
clause->staging_barrier = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sets the dependencies for a given clause, updating the model */
|
||||
|
||||
static void
|
||||
bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
|
||||
{
|
||||
bi_foreach_instr_in_clause(block, clause, I) {
|
||||
uint64_t read = bi_read_mask(I, false);
|
||||
uint64_t written = bi_write_mask(I);
|
||||
|
||||
/* Read-after-write; write-after-write */
|
||||
bi_depend_on_writers(clause, st, read | written);
|
||||
|
||||
/* Write-after-read */
|
||||
bi_set_staging_barrier(clause, st, written);
|
||||
}
|
||||
|
||||
/* LD_VAR instructions must be serialized per-quad. Just always depend
|
||||
* on any LD_VAR instructions. This isn't optimal, but doing better
|
||||
* requires divergence-aware data flow analysis.
|
||||
*
|
||||
* Similarly, memory loads/stores need to be synchronized. For now,
|
||||
* force them to be serialized. This is not optimal.
|
||||
*/
|
||||
if (clause->message && bi_should_serialize(clause->message))
|
||||
clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
|
||||
|
||||
/* Barriers must wait on all slots to flush existing work. It might be
|
||||
* possible to skip this with more information about the barrier. For
|
||||
* now, be conservative.
|
||||
*/
|
||||
if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
|
||||
clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
|
||||
}
|
||||
|
||||
static bool
|
||||
scoreboard_block_update(bi_block *blk)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
/* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
|
||||
bi_foreach_predecessor(blk, pred) {
|
||||
for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
|
||||
blk->scoreboard_in.read[i] |= pred->scoreboard_out.read[i];
|
||||
blk->scoreboard_in.write[i] |= pred->scoreboard_out.write[i];
|
||||
}
|
||||
}
|
||||
|
||||
struct bi_scoreboard_state state = blk->scoreboard_in;
|
||||
|
||||
/* Assign locally */
|
||||
|
||||
bi_foreach_clause_in_block(blk, clause) {
|
||||
bi_set_dependencies(blk, clause, &state);
|
||||
bi_push_clause(&state, clause);
|
||||
}
|
||||
|
||||
/* To figure out progress, diff scoreboard_out */
|
||||
|
||||
for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
|
||||
progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
|
||||
|
||||
blk->scoreboard_out = state;
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
void
|
||||
bi_assign_scoreboard(bi_context *ctx)
|
||||
{
|
||||
struct bi_scoreboard_state st = {};
|
||||
|
||||
/* Assign slots */
|
||||
/* First, assign slots. */
|
||||
bi_foreach_block(ctx, block) {
|
||||
bi_foreach_clause_in_block(block, clause) {
|
||||
unsigned slot = bi_choose_scoreboard_slot(&st, clause->message);
|
||||
clause->scoreboard_id = slot;
|
||||
|
||||
bi_clause *next = bi_next_clause(ctx, block, clause);
|
||||
if (next)
|
||||
next->dependencies |= (1 << slot);
|
||||
if (clause->message) {
|
||||
unsigned slot = bi_choose_scoreboard_slot(clause->message);
|
||||
clause->scoreboard_id = slot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Next, perform forward data flow analysis to calculate dependencies */
|
||||
/* Set of bi_block */
|
||||
struct set *work_list = _mesa_set_create(NULL,
|
||||
_mesa_hash_pointer,
|
||||
_mesa_key_pointer_equal);
|
||||
|
||||
struct set *visited = _mesa_set_create(NULL,
|
||||
_mesa_hash_pointer,
|
||||
_mesa_key_pointer_equal);
|
||||
|
||||
/* Initialize the work list with the first block */
|
||||
struct set_entry *cur;
|
||||
|
||||
cur = _mesa_set_add(work_list, bi_start_block(&ctx->blocks));
|
||||
|
||||
/* Iterate the work list */
|
||||
do {
|
||||
bi_block *blk = (struct bi_block *) cur->key;
|
||||
_mesa_set_remove(work_list, cur);
|
||||
|
||||
bool progress = scoreboard_block_update(blk);
|
||||
|
||||
if (progress || !_mesa_set_search(visited, blk)) {
|
||||
bi_foreach_successor(blk, pred)
|
||||
_mesa_set_add(work_list, pred);
|
||||
}
|
||||
|
||||
_mesa_set_add(visited, blk);
|
||||
} while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
|
||||
|
||||
_mesa_set_destroy(visited, NULL);
|
||||
_mesa_set_destroy(work_list, NULL);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue