mesa/src/asahi/compiler/agx_performance.c
Alyssa Rosenzweig b9a359e9bd agx: start a crude cycle model
based on notes by Dougall Johnson and Philip Turner.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
2024-03-30 00:26:19 +00:00

131 lines
3.7 KiB
C

/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_opcodes.h"
/* Table describing the relationship between registers pressure and thread
* count. Each entry describes a maximum number of registers and the associated
* best-case thread count.
*
* Sorted in ascending order of maximum registers for easy lookup.
*/
static const struct agx_occupancy occupancies[] = {
{104, 1024}, {112, 896}, {128, 832}, {136, 768}, {144, 704},
{160, 640}, {184, 576}, {208, 512}, {232, 448}, {256, 384},
};
struct agx_occupancy
agx_occupancy_for_register_count(unsigned halfregs)
{
for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
unsigned max = occupancies[i].max_registers;
assert((i == 0 || max > occupancies[i - 1].max_registers) && "ascending");
if (halfregs <= max)
return occupancies[i];
}
unreachable("Register count must be less than the maximum");
}
unsigned
agx_max_registers_for_occupancy(unsigned occupancy)
{
unsigned max_regs = 0;
for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
if (occupancy <= occupancies[i].max_threads)
max_regs = occupancies[i].max_registers;
else
break;
}
assert(max_regs > 0 && "Thread count must be less than the maximum");
return max_regs;
}
/* Crude cycle model for G13G */
enum alu_unit {
NONE,
SCIB,
IC,
F32,
F16,
};
struct alu_timing {
enum alu_unit unit;
unsigned latency;
unsigned tp;
};
/* clang-format off */
struct alu_timing op_timings[] = {
[AGX_OPCODE_FMA] = { F32, 2, 1 },
[AGX_OPCODE_FADD] = { F32, 2, 1 },
[AGX_OPCODE_FMUL] = { F32, 2, 1 },
[AGX_OPCODE_MOV_IMM] = { SCIB, 1, 1 },
[AGX_OPCODE_BITOP] = { SCIB, 2, 1 }, /* tp might be 2 for 32-bit / no $? */
[AGX_OPCODE_ICMPSEL] = { SCIB, 2, 1 },
[AGX_OPCODE_FCMPSEL] = { SCIB, 2, 1 },
[AGX_OPCODE_IADD] = { SCIB, 2, 1 },
[AGX_OPCODE_GET_SR] = { SCIB, 2, 2 },
[AGX_OPCODE_GET_SR_BARRIER] = { SCIB, 2, 2 },
[AGX_OPCODE_GET_SR_COVERAGE] = { SCIB, 2, 2 },
[AGX_OPCODE_IMAD] = { IC, 3, 2 },
[AGX_OPCODE_BFI] = { IC, 3, 2 },
[AGX_OPCODE_EXTR] = { IC, 3, 2 },
[AGX_OPCODE_ASR] = { IC, 3, 2 },
[AGX_OPCODE_FLOOR] = { IC, 3, 2 },
[AGX_OPCODE_SIN_PT_1] = { IC, 3, 2 },
[AGX_OPCODE_SIN_PT_2] = { IC, 5, 2 },
[AGX_OPCODE_LOG2] = { IC, 5, 2 },
[AGX_OPCODE_EXP2] = { IC, 5, 2 },
[AGX_OPCODE_RCP] = { IC, 5, 3 },
[AGX_OPCODE_RSQRT] = { IC, 6, 4 },
[AGX_OPCODE_SRSQRT] = { IC, 6, 4 },
[AGX_OPCODE_SIMD_PREFIX_IADD] = { SCIB, 18, 18 },
[AGX_OPCODE_SIMD_IADD] = { SCIB, 24, 24 },
[AGX_OPCODE_SIMD_SHUFFLE] = { SCIB, 5, 2 },
[AGX_OPCODE_ICMP_BALLOT] = { SCIB, 5, 2 },
[AGX_OPCODE_FCMP_BALLOT] = { SCIB, 5, 2 },
[AGX_OPCODE_ICMP_QUAD_BALLOT] = { SCIB, 4, 2 },
[AGX_OPCODE_FCMP_QUAD_BALLOT] = { SCIB, 4, 2 },
};
/* clang-format on */
/*
* TODO: Model non-ALU instructions, latency, register cache, 64-bit, etc.
*/
struct agx_cycle_estimate
agx_estimate_cycles(agx_context *ctx)
{
struct agx_cycle_estimate est = {0};
agx_foreach_instr_global(ctx, I) {
struct alu_timing alu = I->op < ARRAY_SIZE(op_timings)
? op_timings[I->op]
: (struct alu_timing){0};
if (alu.unit == IC) {
est.ic += alu.tp * 2;
} else if (alu.unit) {
est.f_scib += alu.tp;
} else {
/* TODO */
}
}
/* IC and F/SCIB run in parallel across warps */
est.alu = MAX2(est.ic, est.f_scib);
return est;
}