mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 04:38:03 +02:00
pan/va: Do scoreboard analysis
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16804>
This commit is contained in:
parent
7e3b9cf754
commit
41b39d6d5d
1 changed files with 260 additions and 3 deletions
|
|
@ -32,8 +32,31 @@
|
|||
* followed by a cleanup pass to merge flow control modifiers on adjacent
|
||||
* instructions, eliminating the NOPs. This decouples optimization from
|
||||
* correctness, simplifying both passes.
|
||||
*
|
||||
* This pass is responsible for calculating dependencies, according to the
|
||||
* rules:
|
||||
*
|
||||
* 1. An instruction that depends on the results of a previous asynchronous
|
||||
* instruction must first wait for that instruction's slot, unless all
|
||||
* reaching code paths already depended on it.
|
||||
* 2. More generally, any dependencies must be encoded. This includes
|
||||
* Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
|
||||
* 3. The shader must wait on slot #6 before running BLEND, ATEST
|
||||
* 4. The shader must wait on slot #7 before running BLEND, ST_TILE
|
||||
* 5. BARRIER must wait on every active slot.
|
||||
*
|
||||
* Unlike Bifrost, it is not necessary to worry about outbound staging
|
||||
* registers, as the hardware stalls reading staging registers when issuing
|
||||
* asynchronous instructions. So we don't track reads in our model of the
|
||||
* hardware scoreboard. This makes things a bit simpler.
|
||||
*
|
||||
* We may reuse slots for multiple asynchronous instructions, though there may
|
||||
* be a performance penalty.
|
||||
*/
|
||||
|
||||
#define BI_NUM_GENERAL_SLOTS 3
|
||||
#define BI_NUM_REGISTERS 64
|
||||
|
||||
/*
|
||||
* Insert a NOP instruction with given flow control.
|
||||
*/
|
||||
|
|
@ -45,6 +68,225 @@ bi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow)
|
|||
bi_nop(&b)->flow = flow;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
bi_read_mask(bi_instr *I)
|
||||
{
|
||||
uint64_t mask = 0;
|
||||
|
||||
bi_foreach_src(I, s) {
|
||||
if (I->src[s].type == BI_INDEX_REGISTER) {
|
||||
unsigned reg = I->src[s].value;
|
||||
unsigned count = bi_count_read_registers(I, s);
|
||||
|
||||
mask |= (BITFIELD64_MASK(count) << reg);
|
||||
}
|
||||
}
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
bi_write_mask(bi_instr *I)
|
||||
{
|
||||
uint64_t mask = 0;
|
||||
|
||||
bi_foreach_dest(I, d) {
|
||||
if (bi_is_null(I->dest[d])) continue;
|
||||
|
||||
assert(I->dest[d].type == BI_INDEX_REGISTER);
|
||||
|
||||
unsigned reg = I->dest[d].value;
|
||||
unsigned count = bi_count_write_registers(I, d);
|
||||
|
||||
mask |= (BITFIELD64_MASK(count) << reg);
|
||||
}
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
static bool
|
||||
bi_ld_vary_writes_hidden_register(const bi_instr *I)
|
||||
{
|
||||
/* Only varying loads can write the hidden register */
|
||||
if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING)
|
||||
return false;
|
||||
|
||||
/* They only write in some update modes */
|
||||
return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER);
|
||||
}
|
||||
|
||||
static bool
|
||||
bi_is_memory_access(const bi_instr *I)
|
||||
{
|
||||
/* On the attribute unit but functionally a general memory load */
|
||||
if (I->op == BI_OPCODE_LD_ATTR_TEX)
|
||||
return true;
|
||||
|
||||
/* UBOs are read-only so there are no ordering constriants */
|
||||
if (I->seg == BI_SEG_UBO)
|
||||
return false;
|
||||
|
||||
switch (bi_opcode_props[I->op].message) {
|
||||
case BIFROST_MESSAGE_LOAD:
|
||||
case BIFROST_MESSAGE_STORE:
|
||||
case BIFROST_MESSAGE_ATOMIC:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Update the scoreboard model to assign an instruction to a given slot */
|
||||
|
||||
static void
|
||||
bi_push_instr(struct bi_scoreboard_state *st, bi_instr *I)
|
||||
{
|
||||
if (bi_opcode_props[I->op].sr_write)
|
||||
st->write[I->slot] |= bi_write_mask(I);
|
||||
|
||||
if (bi_is_memory_access(I))
|
||||
st->memory |= BITFIELD_BIT(I->slot);
|
||||
|
||||
if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING)
|
||||
st->varying |= BITFIELD_BIT(I->slot);
|
||||
}
|
||||
|
||||
static uint8_t MUST_CHECK
|
||||
bi_pop_slot(struct bi_scoreboard_state *st, unsigned slot)
|
||||
{
|
||||
st->write[slot] = 0;
|
||||
st->varying &= ~BITFIELD_BIT(slot);
|
||||
st->memory &= ~BITFIELD_BIT(slot);
|
||||
|
||||
return BITFIELD_BIT(slot);
|
||||
}
|
||||
|
||||
/* Adds a dependency on each slot writing any specified register */
|
||||
|
||||
static uint8_t MUST_CHECK
|
||||
bi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask)
|
||||
{
|
||||
uint8_t slots = 0;
|
||||
|
||||
for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
|
||||
if (st->write[slot] & regmask)
|
||||
slots |= bi_pop_slot(st, slot);
|
||||
}
|
||||
|
||||
return slots;
|
||||
}
|
||||
|
||||
/* Sets the dependencies for a given clause, updating the model */
|
||||
|
||||
static void
|
||||
bi_set_dependencies(bi_block *block, bi_instr *I, struct bi_scoreboard_state *st)
|
||||
{
|
||||
/* Depend on writers to handle read-after-write and write-after-write
|
||||
* dependencies. Write-after-read dependencies are handled in the hardware
|
||||
* where necessary, so we don't worry about them.
|
||||
*/
|
||||
I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I));
|
||||
|
||||
/* Handle write-after-write and write-after-read dependencies for the varying
|
||||
* hidden registers. Read-after-write dependencies handled in hardware.
|
||||
*/
|
||||
if (bi_ld_vary_writes_hidden_register(I)) {
|
||||
u_foreach_bit(slot, st->varying)
|
||||
I->flow |= bi_pop_slot(st, slot);
|
||||
}
|
||||
|
||||
/* For now, serialize all memory access */
|
||||
if (bi_is_memory_access(I)) {
|
||||
u_foreach_bit(slot, st->memory)
|
||||
I->flow |= bi_pop_slot(st, slot);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
scoreboard_block_update(bi_context *ctx, bi_block *blk)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
/* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
|
||||
bi_foreach_predecessor(blk, pred) {
|
||||
for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
|
||||
blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
|
||||
blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
|
||||
blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying;
|
||||
blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory;
|
||||
}
|
||||
}
|
||||
|
||||
struct bi_scoreboard_state state = blk->scoreboard_in;
|
||||
|
||||
/* Assign locally */
|
||||
|
||||
bi_foreach_instr_in_block(blk, I) {
|
||||
bi_set_dependencies(blk, I, &state);
|
||||
bi_push_instr(&state, I);
|
||||
}
|
||||
|
||||
/* Insert a wait for varyings at the end of the block.
|
||||
*
|
||||
* A varying load with .store has to wait for all other varying loads
|
||||
* in the quad to complete. The bad case looks like:
|
||||
*
|
||||
* if (dynamic) {
|
||||
* x = ld_var()
|
||||
* } else {
|
||||
* x = ld_var()
|
||||
* }
|
||||
*
|
||||
* Logically, a given thread executes only a single ld_var instruction. But
|
||||
* if the quad diverges, the second ld_var has to wait for the first ld_var.
|
||||
* For correct handling, we need to maintain a physical control flow graph
|
||||
* and do the dataflow analysis on that instead of the logical control flow
|
||||
* graph. However, this probably doesn't matter much in practice. This seems
|
||||
* like a decent compromise for now.
|
||||
*
|
||||
* TODO: Consider optimizing this case.
|
||||
*/
|
||||
if (state.varying) {
|
||||
uint8_t flow = 0;
|
||||
|
||||
u_foreach_bit(slot, state.varying)
|
||||
flow |= bi_pop_slot(&state, slot);
|
||||
|
||||
bi_flow(ctx, bi_after_block(blk), flow);
|
||||
}
|
||||
|
||||
/* To figure out progress, diff scoreboard_out */
|
||||
progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
|
||||
|
||||
blk->scoreboard_out = state;
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
static void
|
||||
va_assign_scoreboard(bi_context *ctx)
|
||||
{
|
||||
u_worklist worklist;
|
||||
bi_worklist_init(ctx, &worklist);
|
||||
|
||||
bi_foreach_block(ctx, block) {
|
||||
bi_worklist_push_tail(&worklist, block);
|
||||
}
|
||||
|
||||
/* Perform forward data flow analysis to calculate dependencies */
|
||||
while (!u_worklist_is_empty(&worklist)) {
|
||||
/* Pop from the front for forward analysis */
|
||||
bi_block *blk = bi_worklist_pop_head(&worklist);
|
||||
|
||||
if (scoreboard_block_update(ctx, blk)) {
|
||||
bi_foreach_successor(blk, succ)
|
||||
bi_worklist_push_tail(&worklist, succ);
|
||||
}
|
||||
}
|
||||
|
||||
u_worklist_fini(&worklist);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if execution should terminate after a given block. Execution cannot
|
||||
* terminate within a basic block.
|
||||
|
|
@ -70,6 +312,11 @@ va_should_end(bi_block *block)
|
|||
void
|
||||
va_insert_flow_control_nops(bi_context *ctx)
|
||||
{
|
||||
/* First do dataflow analysis for the scoreboard. This populates I->flow with
|
||||
* a bitmap of slots to wait on.
|
||||
*/
|
||||
va_assign_scoreboard(ctx);
|
||||
|
||||
bi_foreach_block(ctx, block) {
|
||||
bi_foreach_instr_in_block_safe(block, I) {
|
||||
switch (I->op) {
|
||||
|
|
@ -94,12 +341,22 @@ va_insert_flow_control_nops(bi_context *ctx)
|
|||
bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
|
||||
break;
|
||||
|
||||
/* TODO: Optimize waits for asynchronous instructions */
|
||||
default:
|
||||
if (bi_opcode_props[I->op].message)
|
||||
bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
|
||||
break;
|
||||
}
|
||||
|
||||
if (I->flow && I->op != BI_OPCODE_NOP) {
|
||||
/* Wait on the results of asynchronous instructions
|
||||
*
|
||||
* Bitmap of general slots lines up with the encoding of va_flow for
|
||||
* waits on general slots. The dataflow analysis should be ignoring
|
||||
* the special slots #6 and #7, which are handled separately.
|
||||
*/
|
||||
assert((I->flow & ~BITFIELD_MASK(BI_NUM_GENERAL_SLOTS)) == 0);
|
||||
|
||||
bi_flow(ctx, bi_before_instr(I), I->flow);
|
||||
I->flow = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* End execution at the end of the block if needed, or reconverge if we
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue