hk: parallelize after-graphics available sets

reduces CDM overhead: availability writes queued at the end of a render
pass are batched and executed by a single parallel dispatch, instead of
one dispatch per write.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31532>
Alyssa Rosenzweig 2024-09-18 18:40:44 -04:00 committed by Marge Bot
parent a2edffad2f
commit ad8f005ecb
6 changed files with 87 additions and 6 deletions

View file

@@ -109,3 +109,9 @@ libagx_increment_ia_counters(constant struct libagx_increment_ia_counters *p,
       *(p->vs_invocations) += count;
    }
 }
+
+void
+libagx_write_u32s(constant struct libagx_imm_write *p, uint id)
+{
+   *(p[id].address) = p[id].value;
+}
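
The kernel is trivially parallel: invocation id performs the id-th queued
store. For clarity, a serial C equivalent (a sketch; imm_write is a
hypothetical host-side stand-in for struct libagx_imm_write):

   #include <stdint.h>

   /* Serial equivalent of libagx_write_u32s: the id-th record stores one
    * 32-bit immediate to one address. On the GPU, each invocation id
    * executes exactly one iteration, so all queued writes retire in a
    * single dispatch at the end of the control stream.
    */
   struct imm_write {
      uint32_t *address;
      uint32_t value;
   };

   static void
   write_u32s_serial(const struct imm_write *p, unsigned count)
   {
      for (unsigned id = 0; id < count; ++id)
         *(p[id].address) = p[id].value;
   }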

View file

@@ -67,3 +67,8 @@ struct libagx_increment_ia_counters {
    uint32_t index_buffer_range_el;
    uint32_t restart_index;
 };
+
+struct libagx_imm_write {
+   GLOBAL(uint32_t) address;
+   uint32_t value;
+};
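
This struct is shared between host and device. Assuming GLOBAL(uint32_t)
lowers to a plain 64-bit device address when the header is compiled for the
CPU (an assumption about the libagx headers), the host can fill records
directly:

   #include <stdint.h>

   /* Hypothetical host-side mirror of struct libagx_imm_write, assuming
    * GLOBAL(uint32_t) is a 64-bit device address on the CPU side.
    */
   struct imm_write_host {
      uint64_t address; /* device VA of the u32 to write */
      uint32_t value;   /* immediate to store there */
   };

   /* Record that marks one query's availability word, e.g. with 1. */
   static struct imm_write_host
   make_availability_write(uint64_t availability_va)
   {
      return (struct imm_write_host){.address = availability_va, .value = 1};
   }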

View file

@@ -269,7 +269,7 @@ hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
    perf_debug(dev, "End command buffer");
 
    hk_cmd_buffer_end_compute(cmd);
-   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
+   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
 
    /* With rasterizer discard, we might end up with empty VDM batches.
     * It is difficult to avoid creating these empty batches, but it's easy to

View file

@@ -325,6 +325,11 @@ struct hk_cs {
       struct hk_scratch_req fs;
    } scratch;
 
+   /* Immediate writes, type libagx_imm_write. These all happen in parallel at
+    * the end of the control stream. This accelerates queries. Implies CDM.
+    */
+   struct util_dynarray imm_writes;
+
    /* Statistics */
    struct {
       uint32_t calls, cmds, flushes;
@@ -565,16 +570,29 @@ hk_cs_destroy(struct hk_cs *cs)
    if (cs->type == HK_CS_VDM) {
       util_dynarray_fini(&cs->scissor);
       util_dynarray_fini(&cs->depth_bias);
+   } else {
+      util_dynarray_fini(&cs->imm_writes);
    }
 
    free(cs);
 }
 
+void hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs);
+
 static void
-hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr)
+hk_cmd_buffer_end_compute_internal(struct hk_cmd_buffer *cmd,
+                                   struct hk_cs **ptr)
 {
    if (*ptr) {
       struct hk_cs *cs = *ptr;
+
+      /* This control stream may write immediates as it ends. Queue the writes
+       * now that we're done emitting everything else.
+       */
+      if (cs->imm_writes.size) {
+         hk_dispatch_imm_writes(cmd, cs);
+      }
+
       void *map = cs->current;
       agx_push(map, CDM_STREAM_TERMINATE, _)
          ;
@@ -588,7 +606,7 @@ hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr)
 static void
 hk_cmd_buffer_end_compute(struct hk_cmd_buffer *cmd)
 {
-   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.cs);
+   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.cs);
 }
 
 static void
@@ -615,8 +633,8 @@ hk_cmd_buffer_end_graphics(struct hk_cmd_buffer *cmd)
       cmd->current_cs.gfx->current = map;
       cmd->current_cs.gfx = NULL;
 
-      hk_cmd_buffer_end_compute_internal(&cmd->current_cs.pre_gfx);
-      hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
+      hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.pre_gfx);
+      hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
    }
 
    assert(cmd->current_cs.gfx == NULL);
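
The lifecycle across this file: records accumulate in imm_writes while the
command buffer is recorded, and hk_cmd_buffer_end_compute_internal flushes
them in a single dispatch just before CDM_STREAM_TERMINATE, so the writes
land at the very end of the control stream. A sketch of the accumulate side
(simplified from the diff; header paths assumed):

   #include "util/u_dynarray.h" /* Mesa dynarray */
   #include "libagx.h"          /* struct libagx_imm_write (path assumed) */

   /* O(1) append per availability write during recording, instead of one
    * CDM dispatch per write; the batch is flushed once at stream end.
    */
   static void
   queue_imm_write(struct util_dynarray *imm_writes, uint64_t addr,
                   uint32_t value)
   {
      struct libagx_imm_write imm = {.address = addr, .value = value};

      /* Lazily initialize on first use, matching the diff. */
      if (!imm_writes->data)
         util_dynarray_init(imm_writes, NULL);

      util_dynarray_append(imm_writes, struct libagx_imm_write, imm);
   }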

View file

@@ -752,7 +752,7 @@ hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
    cs->cr.zls_control = render->cr.zls_control;
 
    /* Reordering barrier for post-gfx, in case we had any. */
-   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
+   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
 
    /* Don't reorder compute across render passes.
     *

View file

@@ -28,6 +28,7 @@
 #include "compiler/nir/nir_builder.h"
 #include "util/os_time.h"
+#include "util/u_dynarray.h"
 #include "vulkan/vulkan_core.h"
 
 struct hk_query_report {
@@ -209,6 +210,45 @@ hk_nir_write_u32(nir_builder *b, UNUSED const void *key)
    nir_store_global(b, addr, 4, value, nir_component_mask(1));
 }
 
+static void
+hk_nir_write_u32s(nir_builder *b, const void *data)
+{
+   nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
+   nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
+
+   libagx_write_u32s(b, params, id);
+}
+
+void
+hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
+{
+   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
+
+   /* As soon as we mark a query available, it needs to be available system
+    * wide, otherwise a CPU-side get result can race. As such, we cache flush
+    * before and then let coherency work its magic. Without this barrier, we
+    * get flakes in
+    *
+    * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
+    */
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+   hk_cdm_cache_flush(dev, cs);
+
+   perf_debug(dev, "Queued writes");
+
+   struct hk_shader *s = hk_meta_kernel(dev, hk_nir_write_u32s, NULL, 0);
+   uint64_t params =
+      hk_pool_upload(cmd, cs->imm_writes.data, cs->imm_writes.size, 16);
+   uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
+
+   uint32_t count =
+      util_dynarray_num_elements(&cs->imm_writes, struct libagx_imm_write);
+   assert(count > 0);
+
+   hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(count, 1, 1),
+                        hk_grid(32, 1, 1));
+}
+
 void
 hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
                bool after_gfx)
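
The dispatch shape is where the CDM overhead goes away: N queued writes
become one dispatch of an N-thread grid in 32-wide workgroups, rather than
N single-write dispatches. Assuming the hardware rounds the global grid up
to whole workgroups (an assumption about hk_dispatch_with_usc), 100
availability writes need only ceil(100/32) = 4 workgroups in one dispatch:

   #include <stdint.h>

   /* Workgroups needed for the batched write dispatch, assuming the
    * count-thread global grid is rounded up to whole 32-wide groups.
    */
   static uint32_t
   imm_write_workgroups(uint32_t count)
   {
      return (count + 31) / 32; /* 100 writes -> 4 groups, 1 dispatch */
   }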
@@ -218,6 +258,18 @@ hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
    if (!cs)
       return;
 
+   /* TODO: Generalize this mechanism suitably */
+   if (after_gfx) {
+      struct libagx_imm_write imm = {.address = address, .value = value};
+
+      if (!cs->imm_writes.data) {
+         util_dynarray_init(&cs->imm_writes, NULL);
+      }
+
+      util_dynarray_append(&cs->imm_writes, struct libagx_imm_write, imm);
+      return;
+   }
+
    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
 
    /* As soon as we mark a query available, it needs to be available system