tu: fix ZPASS_DONE interference between occlusion queries and autotuner

On newer devices where ZPASS_DONE events are able to write sample counts,
the firmware expects these events to come in begin-end pairs, essentially
corresponding to typical occlusion query usage. Since this event is also
used by the autotuner, we have to prevent the two sets of event pairs from
being emitted in an interleaved fashion.

Additional renderpass state now tracks whether a given renderpass contains
an occlusion query. If so, the autotuner emits additional (fake) ZPASS_DONE
events so that its own events form complete begin-end pairs before and
after the renderpass commands.

Occlusion query behavior inside a renderpass doesn't change. But when a
query is used outside of a renderpass, possible autotuner usage again
requires emitting additional ZPASS_DONE events, so that complete begin-end
pairs are formed both at the start and at the end of the query.

Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Fixes: 4e6a1f8852 ("tu/autotune: Use `CP_EVENT_WRITE7::ZPASS_DONE` on A7XX")
Tested-by: Mike Lothian <mike@fireburn.co.uk>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29403>
This commit is contained in:
Zan Dobersek 2024-06-07 12:45:05 +02:00 committed by Marge Bot
parent 6bc7cd6108
commit 5653c52151
4 changed files with 67 additions and 1 deletions

View file

@ -672,6 +672,21 @@ tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true).value);
tu_cs_emit_qw(cs, result_iova);
/* If the renderpass contains an occlusion query with its own ZPASS_DONE,
* we have to provide a fake ZPASS_DONE event here to logically close the
* previous one, preventing firmware from misbehaving due to nested events.
* This writes into the samples_end field, which will be overwritten in
* tu_autotune_end_renderpass.
*/
if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true,
.sample_count_end_offset = true,
.write_accum_sample_count_diff = true).value);
tu_cs_emit_qw(cs, result_iova);
}
} else {
tu_cs_emit_regs(cs,
A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
@ -697,10 +712,24 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
/* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE
* event here, composing a pair of these events that firmware handles without
* issue. This first event writes into the samples_end field and the second
* event overwrites it. The second event also enables the accumulation flag
* even when we don't use that result because the blob always sets it.
*/
if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true).value);
tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end));
}
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true,
.sample_count_end_offset = true).value);
.sample_count_end_offset = true,
.write_accum_sample_count_diff = true).value);
tu_cs_emit_qw(cs, result_iova);
} else {
result_iova += offsetof(struct tu_renderpass_samples, samples_end);

View file

@ -3888,6 +3888,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst,
dst->has_tess |= src->has_tess;
dst->has_gs |= src->has_gs;
dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
dst->has_zpass_done_sample_count_write_in_rp |= src->has_zpass_done_sample_count_write_in_rp;
dst->disable_gmem |= src->disable_gmem;
dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;
dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;

View file

@ -278,6 +278,7 @@ struct tu_render_pass_state
bool has_tess;
bool has_gs;
bool has_prim_generated_query_in_rp;
bool has_zpass_done_sample_count_write_in_rp;
bool disable_gmem;
bool sysmem_single_prim_mode;
bool shared_viewport;

View file

@ -880,6 +880,27 @@ emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true).value);
tu_cs_emit_qw(cs, begin_iova);
/* ZPASS_DONE events should come in begin-end pairs. When emitting an
* occlusion query outside of a renderpass, we emit a fake end event that
* closes the previous one since the autotuner's ZPASS_DONE use could end
* up causing problems. This event writes into the end field of the query
* slot, but it will be overwritten by events in emit_end_occlusion_query
* with the proper value.
* When inside a renderpass, the corresponding ZPASS_DONE event will be
* emitted in emit_end_occlusion_query. We note the use of ZPASS_DONE on
* the state object, enabling autotuner to optimize its own events.
*/
if (!cmdbuf->state.pass) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true,
.sample_count_end_offset = true,
.write_accum_sample_count_diff = true).value);
tu_cs_emit_qw(cs, begin_iova);
} else {
cmdbuf->state.rp.has_zpass_done_sample_count_write_in_rp = true;
}
}
}
@ -1188,6 +1209,20 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit(cs, CCU_CLEAN_DEPTH);
}
} else {
/* When outside of renderpass, potential autotuner activity can cause
* interference between ZPASS_DONE event pairs. In that case, like at the
* beginning of the occlusion query, a fake ZPASS_DONE event is emitted to
* compose a begin-end event pair. The first event will write into the end
* field, but that will be overwritten by the second ZPASS_DONE which will
* also handle the diff accumulation.
*/
if (!cmdbuf->state.pass) {
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true).value);
tu_cs_emit_qw(cs, end_iova);
}
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
.write_sample_count = true,