ir3: Introduce systall metric and new helper functions

Add new centralized functions which will replace the various places we
hardcode 10 for the number of (ss) nops, add numbers for soft (sy) nops
based on similar computerator experiments with ldc, sam, and ldib (the
most common (sy) producers), and add a "systall" metric which is
analogous to sstall. This also fixes some cases where we'd erroneously
count ldl* as (sy) producers instead of (ss) producers when calculating
sstall.

This only switches over the metric reporting to the new functions, so
there is no behavior change. The following commit will switch over
the rest of the compiler.

While we're at it, remove max_sun as it's never set.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14246>
This commit is contained in:
Connor Abbott 2021-12-17 17:40:02 +01:00 committed by Marge Bot
parent 603791bdeb
commit 7e60978d30
7 changed files with 126 additions and 15 deletions

View file

@ -249,7 +249,7 @@ ir3_collect_info(struct ir3_shader_variant *v)
info->sizedwords = info->size / 4;
foreach_block (block, &shader->block_list) {
int sfu_delay = 0;
int sfu_delay = 0, mem_delay = 0;
foreach_instr (instr, &block->instr_list) {
@ -307,15 +307,25 @@ ir3_collect_info(struct ir3_shader_variant *v)
sfu_delay = 0;
}
if (instr->flags & IR3_INSTR_SY)
if (instr->flags & IR3_INSTR_SY) {
info->sy++;
info->systall += mem_delay;
mem_delay = 0;
}
if (is_sfu(instr)) {
sfu_delay = 10;
if (is_ss_producer(instr)) {
sfu_delay = soft_ss_delay(instr);
} else {
int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
sfu_delay -= n;
}
if (is_sy_producer(instr)) {
mem_delay = soft_sy_delay(instr, shader);
} else {
int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
mem_delay -= n;
}
}
}

View file

@ -78,6 +78,8 @@ struct ir3_info {
/* estimate of number of cycles stalled on (ss) */
uint16_t sstall;
/* estimate of number of cycles stalled on (sy) */
uint16_t systall;
uint16_t last_baryf; /* instruction # of last varying fetch */
@ -1655,6 +1657,102 @@ unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
unsigned ir3_delay_calc(struct ir3_block *block,
struct ir3_instruction *instr, bool mergedregs);
/* estimated (ss)/(sy) delay calculation */
/* Returns true when the instruction's opcode is one of the local-memory
 * load opcodes: ldl, ldlv or ldlw.
 */
static inline bool
is_local_mem_load(struct ir3_instruction *instr)
{
   if (instr->opc == OPC_LDL)
      return true;
   if (instr->opc == OPC_LDLV)
      return true;
   return instr->opc == OPC_LDLW;
}
/* Is this an instruction whose result must be waited on with (ss)?
 *
 * That is the case for SFU instructions and for local-memory loads.
 */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   if (is_sfu(instr))
      return true;

   return is_local_mem_load(instr);
}
/* Soft delay used to approximate the cost of an (ss) sync.
 *
 * The instruction argument is currently not consulted; the same estimate is
 * returned for every (ss) producer.
 */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, the number of delay slots needed to get an SFU result back
    * (ie. when using nop's instead of (ss)) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share
    * an SFU unit). But 10 seems like a reasonable # to choose:
    */
   const unsigned sfu_result_delay = 10;

   return sfu_result_delay;
}
/* Is this an instruction treated as a (sy) producer?
 *
 * True for tex/prefetch instructions, for loads other than local-memory
 * loads (those are (ss) producers instead), and for atomics.
 */
static inline bool
is_sy_producer(struct ir3_instruction *instr)
{
   if (is_tex_or_prefetch(instr))
      return true;

   if (is_load(instr) && !is_local_mem_load(instr))
      return true;

   return is_atomic(instr->opc);
}
/* Soft delay used to approximate the cost of an (sy) sync for the result of
 * the given instruction.
 *
 * The estimate depends on the opcode, on the number of elements in the first
 * destination register, and on whether the shader is a fragment or compute
 * shader (treated here as running at double wavesize).
 */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize = false;
   if (shader->type == MESA_SHADER_FRAGMENT ||
       shader->type == MESA_SHADER_COMPUTE)
      double_wavesize = true;

   unsigned components = reg_elems(instr->dsts[0]);

   /* The numbers below were obtained by counting how many delay slots are
    * needed to get cat5/cat6 results back when using nops instead of (sy).
    * They were measured with the result already in cache (it was loaded
    * earlier in the same shader) - uncached results take much longer.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore the latency is divided by 2 in the
    * double-wavesize case.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC)
      return double_wavesize ? (21 + 8 * components) / 2
                             : 18 + 4 * components;

   if (is_tex_or_prefetch(instr)) {
      /* Indexed by [double_wavesize][components - 1]. */
      static const unsigned tex_delay[2][4] = {
         { 51, 53, 62, 64 },
         { 58 / 2, 60 / 2, 77 / 2, 79 / 2 },
      };

      if (components < 1 || components > 4)
         unreachable("bad number of components");

      return tex_delay[double_wavesize][components - 1];
   }

   /* TODO: measure other cat6 opcodes like ldg */
   return double_wavesize ? (172 + components) / 2 : 109 + components;
}
/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

View file

@ -264,11 +264,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
ir3_NOP(block)->flags |= IR3_INSTR_SS;
last_input_needs_ss = false;
} else if (is_load(n)) {
/* seems like ldlv needs (ss) bit instead?? which is odd but
* makes a bunch of flat-varying tests start working on a4xx.
*/
if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
(n->opc == OPC_LDLW))
if (is_local_mem_load(n))
regmask_set(&state->needs_ss, n->dsts[0]);
else
regmask_set(&state->needs_sy, n->dsts[0]);

View file

@ -790,9 +790,9 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
fprintf(
out,
"; %s prog %d/%d: %u sstall, %u (ss), %u (sy), %d max_sun, %d loops\n",
type, so->shader->id, so->id, so->info.sstall, so->info.ss, so->info.sy,
so->max_sun, so->loops);
"; %s prog %d/%d: %u sstall, %u (ss), %u systall, %u (sy), %d loops\n",
type, so->shader->id, so->id, so->info.sstall, so->info.ss,
so->info.systall, so->info.sy, so->loops);
/* print shader type specific info: */
switch (so->type) {

View file

@ -538,7 +538,6 @@ struct ir3_shader_variant {
*/
unsigned branchstack;
unsigned max_sun;
unsigned loops;
/* the instructions length is in units of instruction groups

View file

@ -3584,6 +3584,14 @@ tu_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.sstall;
}
vk_outarray_append(&out, stat) {
WRITE_STR(stat->name, "Estimated cycles stalled on SY");
WRITE_STR(stat->description,
"A better metric to estimate the impact of SY syncs.");
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
stat->value.u64 = exe->stats.systall;
}
for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
vk_outarray_append(&out, stat) {
WRITE_STR(stat->name, "cat%d instructions", i);

View file

@ -85,7 +85,7 @@ dump_shader_info(struct ir3_shader_variant *v,
"%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
"%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
"%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
"%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, "
"%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
"%d loops\n",
ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
v->info.instrs_count - v->info.nops_count, v->info.mov_count,
@ -96,7 +96,7 @@ dump_shader_info(struct ir3_shader_variant *v,
v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
v->info.stp_count, v->info.ldp_count, v->info.sstall,
v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
}
static void