mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-26 06:20:09 +01:00
ir3: Introduce systall metric and new helper functions
Add new centralized functions which will replace the various places we hardcode 10 for the number of (ss) nops, add numbers for soft (sy) nops based on similar computerator experiments with ldc, sam, and ldib (the most common (sy) producers), and add a "systall" metric which is analogous to sstall. This also fixes some cases where we'd erroneously count ldl* as (sy) producers instead of (ss) producers when calculating sstall. This only switches over the metric reporting to the new functions, so there is no behavior change. The following commit will switch over the rest of the compiler. While we're at it, remove max_sun as it's never set. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14246>
This commit is contained in:
parent
603791bdeb
commit
7e60978d30
7 changed files with 126 additions and 15 deletions
|
|
@ -249,7 +249,7 @@ ir3_collect_info(struct ir3_shader_variant *v)
|
|||
info->sizedwords = info->size / 4;
|
||||
|
||||
foreach_block (block, &shader->block_list) {
|
||||
int sfu_delay = 0;
|
||||
int sfu_delay = 0, mem_delay = 0;
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
|
||||
|
|
@ -307,15 +307,25 @@ ir3_collect_info(struct ir3_shader_variant *v)
|
|||
sfu_delay = 0;
|
||||
}
|
||||
|
||||
if (instr->flags & IR3_INSTR_SY)
|
||||
if (instr->flags & IR3_INSTR_SY) {
|
||||
info->sy++;
|
||||
info->systall += mem_delay;
|
||||
mem_delay = 0;
|
||||
}
|
||||
|
||||
if (is_sfu(instr)) {
|
||||
sfu_delay = 10;
|
||||
if (is_ss_producer(instr)) {
|
||||
sfu_delay = soft_ss_delay(instr);
|
||||
} else {
|
||||
int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
|
||||
sfu_delay -= n;
|
||||
}
|
||||
|
||||
if (is_sy_producer(instr)) {
|
||||
mem_delay = soft_sy_delay(instr, shader);
|
||||
} else {
|
||||
int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
|
||||
mem_delay -= n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -78,6 +78,8 @@ struct ir3_info {
|
|||
|
||||
/* estimate of number of cycles stalled on (ss) */
|
||||
uint16_t sstall;
|
||||
/* estimate of number of cycles stalled on (sy) */
|
||||
uint16_t systall;
|
||||
|
||||
uint16_t last_baryf; /* instruction # of last varying fetch */
|
||||
|
||||
|
|
@ -1655,6 +1657,102 @@ unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
|
|||
unsigned ir3_delay_calc(struct ir3_block *block,
|
||||
struct ir3_instruction *instr, bool mergedregs);
|
||||
|
||||
/* estimated (ss)/(sy) delay calculation */
|
||||
|
||||
static inline bool
|
||||
is_local_mem_load(struct ir3_instruction *instr)
|
||||
{
|
||||
return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
|
||||
instr->opc == OPC_LDLW;
|
||||
}
|
||||
|
||||
/* Does this instruction need (ss) to wait for its result?
 *
 * That is the case for SFU instructions and for local memory loads.
 */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   if (is_sfu(instr))
      return true;

   return is_local_mem_load(instr);
}
|
||||
|
||||
/* The soft delay for approximating the cost of (ss).
 *
 * The instruction argument is currently not inspected: every producer gets
 * the same estimate.
 */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, the number of delay slots it takes to get a SFU result back
    * (ie. using nop's instead of (ss)) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share
    * an SFU unit). But 10 seems like a reasonable # to choose:
    */
   const unsigned ss_soft_delay = 10;

   return ss_soft_delay;
}
|
||||
|
||||
static inline bool
|
||||
is_sy_producer(struct ir3_instruction *instr)
|
||||
{
|
||||
return is_tex_or_prefetch(instr) ||
|
||||
(is_load(instr) && !is_local_mem_load(instr)) ||
|
||||
is_atomic(instr->opc);
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
|
||||
{
|
||||
/* TODO: this is just an optimistic guess, we can do better post-RA.
|
||||
*/
|
||||
bool double_wavesize =
|
||||
shader->type == MESA_SHADER_FRAGMENT ||
|
||||
shader->type == MESA_SHADER_COMPUTE;
|
||||
|
||||
unsigned components = reg_elems(instr->dsts[0]);
|
||||
|
||||
/* These numbers come from counting the number of delay slots to get
|
||||
* cat5/cat6 results back using nops instead of (sy). Note that these numbers
|
||||
* are with the result preloaded to cache by loading it before in the same
|
||||
* shader - uncached results are much larger.
|
||||
*
|
||||
* Note: most ALU instructions can't complete at the full doubled rate, so
|
||||
* they take 2 cycles. The only exception is fp16 instructions with no
|
||||
* built-in conversions. Therefore divide the latency by 2.
|
||||
*
|
||||
* TODO: Handle this properly in the scheduler and remove this.
|
||||
*/
|
||||
if (instr->opc == OPC_LDC) {
|
||||
if (double_wavesize)
|
||||
return (21 + 8 * components) / 2;
|
||||
else
|
||||
return 18 + 4 * components;
|
||||
} else if (is_tex_or_prefetch(instr)) {
|
||||
if (double_wavesize) {
|
||||
switch (components) {
|
||||
case 1: return 58 / 2;
|
||||
case 2: return 60 / 2;
|
||||
case 3: return 77 / 2;
|
||||
case 4: return 79 / 2;
|
||||
default: unreachable("bad number of components");
|
||||
}
|
||||
} else {
|
||||
switch (components) {
|
||||
case 1: return 51;
|
||||
case 2: return 53;
|
||||
case 3: return 62;
|
||||
case 4: return 64;
|
||||
default: unreachable("bad number of components");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* TODO: measure other cat6 opcodes like ldg */
|
||||
if (double_wavesize)
|
||||
return (172 + components) / 2;
|
||||
else
|
||||
return 109 + components;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* unreachable block elimination: */
|
||||
bool ir3_remove_unreachable(struct ir3 *ir);
|
||||
|
||||
|
|
|
|||
|
|
@ -264,11 +264,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
|
|||
ir3_NOP(block)->flags |= IR3_INSTR_SS;
|
||||
last_input_needs_ss = false;
|
||||
} else if (is_load(n)) {
|
||||
/* seems like ldlv needs (ss) bit instead?? which is odd but
|
||||
* makes a bunch of flat-varying tests start working on a4xx.
|
||||
*/
|
||||
if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
|
||||
(n->opc == OPC_LDLW))
|
||||
if (is_local_mem_load(n))
|
||||
regmask_set(&state->needs_ss, n->dsts[0]);
|
||||
else
|
||||
regmask_set(&state->needs_sy, n->dsts[0]);
|
||||
|
|
|
|||
|
|
@ -790,9 +790,9 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
|
|||
|
||||
fprintf(
|
||||
out,
|
||||
"; %s prog %d/%d: %u sstall, %u (ss), %u (sy), %d max_sun, %d loops\n",
|
||||
type, so->shader->id, so->id, so->info.sstall, so->info.ss, so->info.sy,
|
||||
so->max_sun, so->loops);
|
||||
"; %s prog %d/%d: %u sstall, %u (ss), %u systall, %u (sy), %d loops\n",
|
||||
type, so->shader->id, so->id, so->info.sstall, so->info.ss,
|
||||
so->info.systall, so->info.sy, so->loops);
|
||||
|
||||
/* print shader type specific info: */
|
||||
switch (so->type) {
|
||||
|
|
|
|||
|
|
@ -538,7 +538,6 @@ struct ir3_shader_variant {
|
|||
*/
|
||||
unsigned branchstack;
|
||||
|
||||
unsigned max_sun;
|
||||
unsigned loops;
|
||||
|
||||
/* the instructions length is in units of instruction groups
|
||||
|
|
|
|||
|
|
@ -3584,6 +3584,14 @@ tu_GetPipelineExecutableStatisticsKHR(
|
|||
stat->value.u64 = exe->stats.sstall;
|
||||
}
|
||||
|
||||
vk_outarray_append(&out, stat) {
|
||||
WRITE_STR(stat->name, "Estimated cycles stalled on SY");
|
||||
WRITE_STR(stat->description,
|
||||
"A better metric to estimate the impact of SY syncs.");
|
||||
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
|
||||
stat->value.u64 = exe->stats.systall;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
|
||||
vk_outarray_append(&out, stat) {
|
||||
WRITE_STR(stat->name, "cat%d instructions", i);
|
||||
|
|
|
|||
|
|
@ -85,7 +85,7 @@ dump_shader_info(struct ir3_shader_variant *v,
|
|||
"%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
|
||||
"%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
|
||||
"%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
|
||||
"%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, "
|
||||
"%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
|
||||
"%d loops\n",
|
||||
ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
|
||||
v->info.instrs_count - v->info.nops_count, v->info.mov_count,
|
||||
|
|
@ -96,7 +96,7 @@ dump_shader_info(struct ir3_shader_variant *v,
|
|||
v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
|
||||
v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
|
||||
v->info.stp_count, v->info.ldp_count, v->info.sstall,
|
||||
v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
|
||||
v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue