anv: dynamically allocate utrace batch buffers

Estimating the batch space required can be tricky because of all the
workarounds. So implement chaining of batches like we do for command
buffers.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26087>
This commit is contained in:
Lionel Landwerlin 2023-11-02 22:50:38 +02:00 committed by Marge Bot
parent 9ebb7721b5
commit 2dc452ec7c
4 changed files with 106 additions and 70 deletions

View file

@ -5509,7 +5509,7 @@ struct anv_utrace_submit {
*/
struct anv_reloc_list relocs;
struct anv_batch batch;
struct anv_bo *batch_bo;
struct util_dynarray batch_bos;
/* Stream for temporary allocations */
struct anv_state_stream dynamic_state_stream;

View file

@ -25,7 +25,7 @@
#include "anv_internal_kernels.h"
#include "ds/intel_tracepoints.h"
#include "genxml/gen8_pack.h"
#include "genxml/gen9_pack.h"
#include "perf/intel_perf.h"
#include "util/perf/cpu_trace.h"
@ -88,10 +88,9 @@ anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
if (submit->trace_bo)
anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
if (submit->batch_bo) {
anv_reloc_list_finish(&submit->relocs);
anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo);
}
util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
anv_bo_pool_free(&device->utrace_bo_pool, *bo);
util_dynarray_fini(&submit->batch_bos);
vk_sync_destroy(&device->vk, submit->sync);
@ -151,6 +150,44 @@ anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx,
push_data_state);
}
/* Batch extend callback for utrace submissions.
 *
 * Allocates a fresh BO from the device's utrace BO pool, records it in
 * submit->batch_bos, emits an MI_BATCH_BUFFER_START into the current batch
 * storage that targets the new BO, and finally repoints the batch storage at
 * the new BO.
 *
 * batch:     batch that needs more space
 * size:      requested amount of space; not consulted here, the size of the
 *            new BO depends only on what has been allocated so far
 * user_data: the struct anv_utrace_submit owning the batch
 *
 * Returns VK_SUCCESS, or the error reported by anv_bo_pool_alloc().
 */
static VkResult
anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size,
                               void *user_data)
{
   struct anv_utrace_submit *submit = user_data;
   /* Bytes taken by the MI_BATCH_BUFFER_START that chains two BOs. */
   const uint32_t chain_bytes = 4 * GFX9_MI_BATCH_BUFFER_START_length;

   /* New BO is twice the sum of all BOs allocated so far, 8192 at least. */
   uint32_t grown_size = 0;
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, it) {
      grown_size += (*it)->size;
   }
   grown_size = MAX2(grown_size * 2, 8192);

   struct anv_bo *next_bo;
   VkResult result =
      anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool,
                        align(grown_size, 4096), &next_bo);
   if (result != VK_SUCCESS) {
      return result;
   }

   util_dynarray_append(&submit->batch_bos, struct anv_bo *, next_bo);

   /* Open up the bytes that were held back from the storage size (see the
    * anv_batch_set_storage() call below) so that the jump fits.
    *
    * NOTE(review): on the first call the batch has no storage yet;
    * presumably anv_batch_emit() then packs nothing - confirm.
    */
   batch->end += chain_bytes;

   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
      bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
                        GFX9_MI_BATCH_BUFFER_START_length_bias;
      bbs.SecondLevelBatchBuffer = Firstlevelbatch;
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress = (struct anv_address) { .bo = next_bo, };
   }

   /* Continue in the new BO, keeping room at its end for the next jump. */
   anv_batch_set_storage(batch,
                         (struct anv_address) { .bo = next_bo, },
                         next_bo->map,
                         next_bo->size - chain_bytes);

   return VK_SUCCESS;
}
VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
uint32_t cmd_buffer_count,
@ -175,6 +212,8 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
if (!submit)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
submit->queue = queue;
intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
@ -182,6 +221,8 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
if (result != VK_SUCCESS)
goto error_sync;
util_dynarray_init(&submit->batch_bos, NULL);
if (utrace_copies > 0) {
result = anv_bo_pool_alloc(&device->utrace_bo_pool,
utrace_copies * 4096,
@ -189,22 +230,6 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
if (result != VK_SUCCESS)
goto error_trace_buf;
uint32_t batch_size = 512; /* 128 dwords of setup */
if (intel_needs_workaround(device->info, 16013994831)) {
/* Enable/Disable preemption at the begin/end */
batch_size += 2 * (250 /* 250 MI_NOOPs*/ +
6 /* PIPE_CONTROL */ +
3 /* MI_LRI */) * 4 /* dwords */;
}
batch_size += 256 * utrace_copies; /* 64 dwords per copy */
batch_size = align(batch_size + 4, 8); /* MI_BATCH_BUFFER_END */
result = anv_bo_pool_alloc(&device->utrace_bo_pool,
align(batch_size, 4096),
&submit->batch_bo);
if (result != VK_SUCCESS)
goto error_batch_buf;
const bool uses_relocs = device->physical->uses_relocs;
result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
if (result != VK_SUCCESS)
@ -215,11 +240,12 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
anv_state_stream_init(&submit->general_state_stream,
&device->general_state_pool, 16384);
submit->batch.alloc = &device->vk.alloc;
submit->batch.relocs = &submit->relocs;
anv_batch_set_storage(&submit->batch,
(struct anv_address) { .bo = submit->batch_bo, },
submit->batch_bo->map, submit->batch_bo->size);
submit->batch = (struct anv_batch) {
.alloc = &device->vk.alloc,
.relocs = &submit->relocs,
.user_data = submit,
.extend_cb = anv_utrace_submit_extend_batch,
};
/* Only engine class where we support timestamp copies
*
@ -304,17 +330,15 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
}
}
submit->queue = queue;
*out_submit = submit;
return VK_SUCCESS;
error_batch:
anv_reloc_list_finish(&submit->relocs);
util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
anv_bo_pool_free(&device->utrace_bo_pool, *bo);
error_reloc_list:
anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo);
error_batch_buf:
anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
error_trace_buf:
vk_sync_destroy(&device->vk, submit->sync);
@ -555,21 +579,17 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg
if (result != VK_SUCCESS)
goto error_trace;
result = anv_bo_pool_alloc(&device->utrace_bo_pool, 4096,
&submit->batch_bo);
if (result != VK_SUCCESS)
goto error_sync;
const bool uses_relocs = device->physical->uses_relocs;
result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
if (result != VK_SUCCESS)
goto error_batch_bo;
goto error_sync;
submit->batch.alloc = &device->vk.alloc;
submit->batch.relocs = &submit->relocs;
anv_batch_set_storage(&submit->batch,
(struct anv_address) { .bo = submit->batch_bo, },
submit->batch_bo->map, submit->batch_bo->size);
submit->batch = (struct anv_batch) {
.alloc = &device->vk.alloc,
.relocs = &submit->relocs,
.user_data = submit,
.extend_cb = anv_utrace_submit_extend_batch,
};
if (frame) {
if (begin)
@ -588,8 +608,8 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg
}
}
anv_batch_emit(&submit->batch, GFX8_MI_BATCH_BUFFER_END, bbs);
anv_batch_emit(&submit->batch, GFX8_MI_NOOP, noop);
anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs);
anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop);
if (submit->batch.status != VK_SUCCESS) {
result = submit->batch.status;
@ -606,8 +626,8 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg
error_reloc_list:
anv_reloc_list_finish(&submit->relocs);
error_batch_bo:
anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo);
util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
anv_bo_pool_free(&device->utrace_bo_pool, *bo);
error_sync:
vk_sync_destroy(&device->vk, submit->sync);
error_trace:

View file

@ -522,38 +522,43 @@ setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
if (result != VK_SUCCESS)
return result;
result = anv_execbuf_add_bo(device, execbuf,
submit->batch_bo,
&submit->relocs, 0);
if (result != VK_SUCCESS)
return result;
util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, _bo) {
struct anv_bo *bo = *_bo;
result = anv_execbuf_add_bo(device, execbuf, bo,
&submit->relocs, 0);
if (result != VK_SUCCESS)
return result;
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
intel_flush_range(bo->map, bo->size);
#endif
}
result = anv_execbuf_add_sync(device, execbuf, submit->sync,
true /* is_signal */, 0 /* value */);
if (result != VK_SUCCESS)
return result;
if (submit->batch_bo->exec_obj_index != execbuf->bo_count - 1) {
uint32_t idx = submit->batch_bo->exec_obj_index;
struct anv_bo *batch_bo =
*util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0);
if (batch_bo->exec_obj_index != execbuf->bo_count - 1) {
uint32_t idx = batch_bo->exec_obj_index;
uint32_t last_idx = execbuf->bo_count - 1;
struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
assert(execbuf->bos[idx] == submit->batch_bo);
assert(execbuf->bos[idx] == batch_bo);
execbuf->objects[idx] = execbuf->objects[last_idx];
execbuf->bos[idx] = execbuf->bos[last_idx];
execbuf->bos[idx]->exec_obj_index = idx;
execbuf->objects[last_idx] = tmp_obj;
execbuf->bos[last_idx] = submit->batch_bo;
submit->batch_bo->exec_obj_index = last_idx;
execbuf->bos[last_idx] = batch_bo;
batch_bo->exec_obj_index = last_idx;
}
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
intel_flush_range(submit->batch_bo->map, submit->batch_bo->size);
#endif
uint64_t exec_flags = 0;
uint32_t context_id;
get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
@ -596,7 +601,8 @@ static VkResult
anv_queue_exec_utrace_locked(struct anv_queue *queue,
struct anv_utrace_submit *submit)
{
assert(submit->batch_bo);
assert(util_dynarray_num_elements(&submit->batch_bos,
struct anv_bo *) > 0);
struct anv_device *device = queue->device;
struct anv_execbuf execbuf = {
@ -740,7 +746,9 @@ i915_queue_exec_locked(struct anv_queue *queue,
};
VkResult result;
if (utrace_submit && !utrace_submit->batch_bo) {
if (utrace_submit &&
util_dynarray_num_elements(&utrace_submit->batch_bos,
struct anv_bo *) == 0) {
result = anv_execbuf_add_sync(device, &execbuf,
utrace_submit->sync,
true /* is_signal */,
@ -950,7 +958,8 @@ VkResult
i915_queue_exec_trace(struct anv_queue *queue,
struct anv_utrace_submit *submit)
{
assert(submit->batch_bo);
assert(util_dynarray_num_elements(&submit->batch_bos,
struct anv_bo *) > 0);
return anv_queue_exec_utrace_locked(queue, submit);
}

View file

@ -126,7 +126,9 @@ xe_exec_process_syncs(struct anv_queue *queue,
/* Signal the utrace sync only if it doesn't have a batch. Otherwise
* it's the utrace batch that should signal its own sync.
*/
if (utrace_submit && !utrace_submit->batch_bo) {
if (utrace_submit &&
util_dynarray_num_elements(&utrace_submit->batch_bos,
struct anv_bo *) == 0) {
struct drm_xe_sync *xe_sync = &xe_syncs[count++];
xe_exec_fill_sync(xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL);
@ -186,17 +188,20 @@ xe_queue_exec_utrace_locked(struct anv_queue *queue,
xe_exec_fill_sync(&xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
intel_flush_range(utrace_submit->batch_bo->map,
utrace_submit->batch_bo->size);
if (device->physical->memory.need_flush) {
util_dynarray_foreach(&utrace_submit->batch_bos, struct anv_bo *, bo)
intel_flush_range((*bo)->map, (*bo)->size);
}
#endif
struct anv_bo *batch_bo =
*util_dynarray_element(&utrace_submit->batch_bos, struct anv_bo *, 0);
struct drm_xe_exec exec = {
.exec_queue_id = queue->exec_queue_id,
.num_batch_buffer = 1,
.syncs = (uintptr_t)&xe_sync,
.num_syncs = 1,
.address = utrace_submit->batch_bo->offset,
.address = batch_bo->offset,
};
if (likely(!device->info->no_hw)) {
if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
@ -283,7 +288,9 @@ xe_queue_exec_locked(struct anv_queue *queue,
return result;
/* If we have no batch for utrace, just forget about it now. */
if (utrace_submit && !utrace_submit->batch_bo)
if (utrace_submit &&
util_dynarray_num_elements(&utrace_submit->batch_bos,
struct anv_bo *) == 0)
utrace_submit = NULL;
struct drm_xe_exec exec = {