tu: Use predicate bit for perf queries

Start to take advantage of a6xx gen3+ having multiple predicate bits.
For now we define 2 predicate bits, but there will be more for
concurrent binning.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36590>
This commit is contained in:
Connor Abbott 2025-05-21 12:17:59 -04:00 committed by Marge Bot
parent 46438d407d
commit 4ac666eaa7
2 changed files with 20 additions and 7 deletions

View file

@ -148,6 +148,11 @@
*/
#define TU_FDM_OFFSET_GRANULARITY 8
/* Indices of the CP predicate bits used by the driver.
 *
 * Each value is passed as the PRED_BIT field of a command-stream packet
 * (see the A6XX_CP_REG_TEST_0_PRED_BIT / CP_COND_REG_EXEC_0_PRED_BIT uses
 * in the perf query code), so the numeric values are significant and must
 * stay explicit.
 */
enum tu_predicate_bit {
/* NOTE(review): presumably the bit used for load/store predication; no
 * user of it is visible in this change -- confirm against the callers.
 */
TU_PREDICATE_LOAD_STORE = 0,
/* Bit used to select which perf-counter pass is executed. */
TU_PREDICATE_PERFCNTRS = 1,
};
#define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)
#define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)

View file

@ -1137,14 +1137,18 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
}
/* Open the conditionally-executed region for one perf-counter pass.
 *
 * Emits a one-dword CP_REG_TEST that tests bit `pass` of
 * CP_SCRATCH_REG(PERF_CNTRS_REG) with SKIP_WAIT_FOR_ME set, then starts a
 * PRED_TEST conditional-exec block on `cs`. When `has_pred_bit` is true,
 * both packets additionally carry TU_PREDICATE_PERFCNTRS in their PRED_BIT
 * field; when it is false no PRED_BIT field is OR'ed in.
 *
 * This function does not close the region it opens: the visible call sites
 * call tu_cond_exec_end(cs) (when data->pass != 0) before opening the
 * next pass.
 *
 * NOTE(review): presumably the scratch register holds a mask identifying
 * the pass being replayed -- confirm where PERF_CNTRS_REG is written.
 */
static void
emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
emit_perfcntrs_pass_start(bool has_pred_bit, struct tu_cs *cs, uint32_t pass)
{
tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
A6XX_CP_REG_TEST_0_BIT(pass) |
(has_pred_bit ?
A6XX_CP_REG_TEST_0_PRED_BIT(TU_PREDICATE_PERFCNTRS) : 0) |
A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
/* The conditional-exec packet names the same predicate bit as the
 * CP_REG_TEST above when has_pred_bit is set.
 */
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST) |
(has_pred_bit ?
CP_COND_REG_EXEC_0_PRED_BIT(TU_PREDICATE_PERFCNTRS) : 0));
}
template <chip CHIP>
@ -1156,8 +1160,10 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
uint32_t last_pass = ~0;
bool has_pred_bit =
cmdbuf->device->physical_device->info->a6xx.has_pred_bit;
if (cmdbuf->state.pass) {
if (cmdbuf->state.pass && !has_pred_bit) {
cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
}
@ -1198,7 +1204,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
@ -1222,7 +1228,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
@ -1637,6 +1643,8 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
uint64_t begin_iova;
uint64_t result_iova;
uint32_t last_pass = ~0;
bool has_pred_bit =
cmdbuf->device->physical_device->info->a6xx.has_pred_bit;
/* Wait for the profiled work to finish so that collected counter values
* are as accurate as possible.
@ -1651,7 +1659,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
@ -1678,7 +1686,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
}
result_iova = query_result_iova(pool, query, struct perfcntr_query_slot,