winsys/amdgpu: enable unlimited number of parallel queues for VCN

This fixes a VCN performance regression introduced by the new BO fence
tracking mechanism.

VCN can have many queues. The current BO fence tracking mechanism only
supports 1 queue per IP, and there is interest in using all VCN queues via
VAAPI. This introduces an alternative BO fence tracking mechanism that is
only enabled for VCN, supports unlimited parallel queues, is similar to
the previous system, can co-exist with the current queue system, and has no
negative impact on CPU overhead as long as it's only used by VCN.

Since we want an unlimited number of queues, we can't generate our own
sequence numbers for those queues. Instead, each buffer will have a new
field "alt_fence", which means an alternative fence. This fence is the last
use of that buffer on any VCN queue. If any other queue wants to use that
buffer, it has to insert alt_fence as a dependency, and replace alt_fence
with the new submitted fence, so that it's always equal to the last use.

Only VCN uses and updates alt_fence when an IB is submitted. Other IPs only
use alt_fence as a fence dependency. alt_fence is NULL when VCN isn't used,
so there is no negative impact on CPU overhead in that case.

It uses a C++ template for amdgpu_cs_submit_ib due to different BO loop
bodies between normal queues and VCN. Those loop bodies execute for every
BO, so they shouldn't have extra code for alt_fence if the queue doesn't
update it.

Acked-and-Tested-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27627>
This commit is contained in:
Marek Olšák 2024-02-12 17:34:25 -05:00 committed by Marge Bot
parent 3e118c6d2f
commit f933536517
4 changed files with 202 additions and 96 deletions

View file

@ -31,6 +31,43 @@ struct amdgpu_sparse_backing_chunk {
uint32_t begin, end;
};
/* Wait for (or poll, if timeout == 0) one BO fence slot while holding
 * ws->bo_fence_lock.
 *
 * \param ws           winsys owning bo_fence_lock
 * \param fence        pointer to the fence slot to check; cleared to NULL
 *                     once the fence is known idle so it is skipped on
 *                     later checks
 * \param timeout      0 means poll without blocking; any other value means
 *                     block until abs_timeout
 * \param abs_timeout  absolute timeout passed to amdgpu_fence_wait when
 *                     blocking
 *
 * \return true if the fence is idle (bo_fence_lock is still held);
 *         false if it is busy — in that case this function has UNLOCKED
 *         bo_fence_lock before returning, and callers must not unlock it
 *         again.
 */
static bool amdgpu_bo_fence_wait(struct amdgpu_winsys *ws,
                                 struct pipe_fence_handle **fence,
                                 uint64_t timeout, int64_t abs_timeout)
{
   if (timeout == 0) {
      /* Poll-only path: query the fence status without blocking. */
      bool idle = amdgpu_fence_wait(*fence, 0, false);
      if (!idle) {
         simple_mtx_unlock(&ws->bo_fence_lock);
         return false; /* busy */
      }

      /* It's idle. Remove it from the ring to skip checking it again later. */
      amdgpu_fence_reference(fence, NULL);
   } else {
      /* Blocking path: take a local reference first, because other threads
       * can release the fence slot while the mutex is dropped below. */
      struct pipe_fence_handle *tmp_fence = NULL;
      amdgpu_fence_reference(&tmp_fence, *fence);

      /* While waiting, unlock the mutex. */
      simple_mtx_unlock(&ws->bo_fence_lock);
      bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
      if (!idle) {
         amdgpu_fence_reference(&tmp_fence, NULL);
         return false; /* busy */
      }

      simple_mtx_lock(&ws->bo_fence_lock);
      /* It's idle. Remove it from the ring to skip checking it again later.
       * Only clear the slot if it still holds the fence we waited on — a
       * concurrent submit may have replaced it while the lock was dropped. */
      if (tmp_fence == *fence)
         amdgpu_fence_reference(fence, NULL);
      amdgpu_fence_reference(&tmp_fence, NULL);
   }

   return true;
}
static bool amdgpu_bo_wait(struct radeon_winsys *rws,
struct pb_buffer_lean *_buf, uint64_t timeout,
unsigned usage)
@ -53,10 +90,14 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
return false;
}
if (is_real_bo(bo) && get_real_bo(bo)->is_shared) {
/* We can't use user fences for shared buffers, because user fences
* are local to this process only. If we want to wait for all buffer
* uses in all processes, we have to use amdgpu_bo_wait_for_idle.
if (is_real_bo(bo) && (get_real_bo(bo)->is_shared || get_real_bo(bo)->slab_has_busy_alt_fences)) {
/* We can't use user fences for shared buffers, because user fences are local to this
* process only. If we want to wait for all buffer uses in all processes, we have to
* use amdgpu_bo_wait_for_idle.
*
* Additionally, if this is a slab buffer and one of the slab entries has non-NULL
* alt_fence, we can't easily wait for that here. Instead, use the kernel ioctl to wait
* for the buffer.
*/
bool buffer_busy = true;
int r;
@ -64,6 +105,9 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
r = amdgpu_bo_wait_for_idle(get_real_bo(bo)->bo_handle, timeout, &buffer_busy);
if (r)
fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__, r);
if (!buffer_busy)
get_real_bo(bo)->slab_has_busy_alt_fences = false;
return !buffer_busy;
}
@ -73,40 +117,21 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
struct pipe_fence_handle **fence = get_fence_from_ring(ws, &bo->fences, i);
if (fence) {
if (timeout == 0) {
bool idle = amdgpu_fence_wait(*fence, 0, false);
if (!idle) {
simple_mtx_unlock(&ws->bo_fence_lock);
return false; /* busy */
}
/* It's idle. Remove it from the ring to skip checking it again later. */
amdgpu_fence_reference(fence, NULL);
} else {
struct pipe_fence_handle *tmp_fence = NULL;
amdgpu_fence_reference(&tmp_fence, *fence);
/* While waiting, unlock the mutex. */
simple_mtx_unlock(&ws->bo_fence_lock);
bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
if (!idle) {
amdgpu_fence_reference(&tmp_fence, NULL);
return false; /* busy */
}
simple_mtx_lock(&ws->bo_fence_lock);
/* It's idle. Remove it from the ring to skip checking it again later. */
if (tmp_fence == *fence)
amdgpu_fence_reference(fence, NULL);
amdgpu_fence_reference(&tmp_fence, NULL);
}
/* This also unlocks the mutex on failure. */
if (!amdgpu_bo_fence_wait(ws, fence, timeout, abs_timeout))
return false;
}
bo->fences.valid_fence_mask &= ~BITFIELD_BIT(i); /* remove the fence from the BO */
}
/* Also wait for alt_fence. */
if (bo->alt_fence) {
/* This also unlocks the mutex on failure. */
if (!amdgpu_bo_fence_wait(ws, &bo->alt_fence, timeout, abs_timeout))
return false;
}
simple_mtx_unlock(&ws->bo_fence_lock);
return true; /* idle */
}
@ -136,6 +161,7 @@ static enum radeon_bo_flag amdgpu_bo_get_flags(
/* Forget all fence tracking state of a BO: release the alternative (VCN)
 * fence, if any, and invalidate every per-queue sequence-number fence. */
static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
{
   amdgpu_fence_reference(&bo->alt_fence, NULL);
   bo->fences.valid_fence_mask = 0;
}
void amdgpu_bo_destroy(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf)

View file

@ -55,6 +55,19 @@ struct amdgpu_winsys_bo {
enum amdgpu_bo_type type:8;
struct amdgpu_seq_no_fences fences;
/* Since some IPs like VCN want to have an unlimited number of queues, we can't generate our
* own sequence numbers for those queues. Instead, each buffer will have "alt_fence", which
* means an alternative fence. This fence is the last use of that buffer on any VCN queue.
* If any other queue wants to use that buffer, it has to insert alt_fence as a dependency,
* and replace alt_fence with the new submitted fence, so that it's always equal to the last
* use.
*
* Only VCN uses and updates alt_fence when an IB is submitted. Other IPs only use alt_fence
* as a fence dependency. alt_fence is NULL when VCN isn't used, so there is no negative
* impact on CPU overhead in that case.
*/
struct pipe_fence_handle *alt_fence;
/* This is set when a buffer is returned by buffer_create(), not when the memory is allocated
* as part of slab BO.
*/
@ -90,6 +103,9 @@ struct amdgpu_bo_real {
* it can only transition from false to true. Protected by lock.
*/
bool is_shared;
/* Whether this is a slab buffer and alt_fence was set on one of the slab entries. */
bool slab_has_busy_alt_fences;
};
/* Same as amdgpu_bo_real except this BO isn't destroyed when its reference count drops to 0.
@ -174,6 +190,12 @@ static inline struct amdgpu_bo_real *get_slab_entry_real_bo(struct amdgpu_winsys
return &get_bo_from_slab(((struct amdgpu_bo_slab_entry*)bo)->entry.slab)->b.b;
}
/* Downcast a winsys BO to its reusable-slab representation. Only valid when
 * the BO was allocated as AMDGPU_BO_REAL_REUSABLE_SLAB. */
static struct amdgpu_bo_real_reusable_slab *get_real_bo_reusable_slab(struct amdgpu_winsys_bo *bo)
{
   assert(bo->type == AMDGPU_BO_REAL_REUSABLE_SLAB);

   struct amdgpu_bo_real_reusable_slab *slab_bo =
      (struct amdgpu_bo_real_reusable_slab*)bo;
   return slab_bo;
}
/* Given a sequence number "fences->seq_no[queue_index]", return a pointer to a non-NULL fence
* pointer in the queue ring corresponding to that sequence number if the fence is non-NULL.
* If the fence is not present in the ring (= is idle), return NULL. If it returns a non-NULL

View file

@ -870,6 +870,13 @@ static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
return cs->ip_type;
}
/* Return whether this IP tracks BO usage through amdgpu_winsys_bo::alt_fence
 * instead of the per-queue sequence-number mechanism. Currently only the VCN
 * IPs do. */
static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
{
   /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
   switch (ip_type) {
   case AMD_IP_VCN_DEC:
   case AMD_IP_VCN_ENC:
   case AMD_IP_VCN_JPEG:
      return true;
   default:
      return false;
   }
}
static bool
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
@ -901,20 +908,25 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
/* Compute the queue index by counting the IPs that have queues. */
assert(ip_type < ARRAY_SIZE(ctx->ws->info.ip));
assert(ctx->ws->info.ip[ip_type].num_queues);
cs->queue_index = 0;
for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
if (!ctx->ws->info.ip[i].num_queues)
continue;
if (ip_uses_alt_fence(ip_type)) {
cs->queue_index = INT_MAX;
cs->uses_alt_fence = true;
} else {
cs->queue_index = 0;
if (i == ip_type)
break;
for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
if (!ctx->ws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
continue;
cs->queue_index++;
if (i == ip_type)
break;
cs->queue_index++;
}
assert(cs->queue_index < AMDGPU_MAX_QUEUES);
}
assert(cs->queue_index < AMDGPU_MAX_QUEUES);
struct amdgpu_cs_fence_info fence_info;
fence_info.handle = cs->ctx->user_fence_bo;
fence_info.offset = cs->ip_type * 4;
@ -1176,16 +1188,21 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
add_fence_to_list(&cs->syncobj_dependencies, fence);
}
static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws, unsigned queue_index,
static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
struct amdgpu_cs_context *cs,
unsigned queue_index_bit,
struct amdgpu_seq_no_fences *dependencies,
struct amdgpu_winsys_bo *bo, unsigned usage)
{
if (usage & RADEON_USAGE_SYNCHRONIZED) {
/* Add BO fences from queues other than 'queue_index' to dependencies. */
u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~BITFIELD_BIT(queue_index)) {
u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
add_seq_no_to_list(ws, dependencies, other_queue_idx,
bo->fences.seq_no[other_queue_idx]);
}
if (bo->alt_fence)
add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
}
}
@ -1212,6 +1229,11 @@ static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
}
/* The template parameter determines whether the queue should skip code used by the default queue
* system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
* for all BOs.
*/
template<bool QUEUE_USES_ALT_FENCE>
static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
{
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
@ -1221,40 +1243,48 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
uint64_t seq_no = 0;
bool has_user_fence = amdgpu_cs_has_user_fence(acs);
assert(QUEUE_USES_ALT_FENCE == acs->uses_alt_fence);
simple_mtx_lock(&ws->bo_fence_lock);
unsigned queue_index = acs->queue_index;
struct amdgpu_queue *queue = &ws->queues[queue_index];
uint_seq_no prev_seq_no = queue->latest_seq_no;
unsigned queue_index;
struct amdgpu_queue *queue;
uint_seq_no prev_seq_no, next_seq_no;
/* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
* but the values aren't related.
*/
uint_seq_no next_seq_no = prev_seq_no + 1;
if (!QUEUE_USES_ALT_FENCE) {
queue_index = acs->queue_index;
queue = &ws->queues[queue_index];
prev_seq_no = queue->latest_seq_no;
/* Wait for the oldest fence to signal. This should always check the user fence, then wait
* via the ioctl. We have to do this because we are going to release the oldest fence and
* replace it with the latest fence in the ring.
*/
struct pipe_fence_handle **oldest_fence =
&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
/* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
* but the values aren't related.
*/
next_seq_no = prev_seq_no + 1;
if (*oldest_fence) {
if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
/* Take the reference because the fence can be released by other threads after we
* unlock the mutex.
*/
struct pipe_fence_handle *tmp_fence = NULL;
amdgpu_fence_reference(&tmp_fence, *oldest_fence);
/* Wait for the oldest fence to signal. This should always check the user fence, then wait
* via the ioctl. We have to do this because we are going to release the oldest fence and
* replace it with the latest fence in the ring.
*/
struct pipe_fence_handle **oldest_fence =
&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
/* Unlock the mutex before waiting. */
simple_mtx_unlock(&ws->bo_fence_lock);
amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
amdgpu_fence_reference(&tmp_fence, NULL);
simple_mtx_lock(&ws->bo_fence_lock);
if (*oldest_fence) {
if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
/* Take the reference because the fence can be released by other threads after we
* unlock the mutex.
*/
struct pipe_fence_handle *tmp_fence = NULL;
amdgpu_fence_reference(&tmp_fence, *oldest_fence);
/* Unlock the mutex before waiting. */
simple_mtx_unlock(&ws->bo_fence_lock);
amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
amdgpu_fence_reference(&tmp_fence, NULL);
simple_mtx_lock(&ws->bo_fence_lock);
}
/* Remove the idle fence from the ring. */
amdgpu_fence_reference(oldest_fence, NULL);
}
/* Remove the idle fence from the ring. */
amdgpu_fence_reference(oldest_fence, NULL);
}
/* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
@ -1263,17 +1293,19 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
struct amdgpu_seq_no_fences seq_no_dependencies;
memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
/* Add a fence dependency on the previous IB if the IP has multiple physical queues to
* make it appear as if it had only 1 queue, or if the previous IB comes from a different
* context. The reasons are:
* - Our BO fence tracking only supports 1 queue per IP.
* - IBs from different contexts must wait for each other and can't execute in a random order.
*/
struct amdgpu_fence *prev_fence =
(struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
if (!QUEUE_USES_ALT_FENCE) {
/* Add a fence dependency on the previous IB if the IP has multiple physical queues to
* make it appear as if it had only 1 queue, or if the previous IB comes from a different
* context. The reasons are:
* - Our BO fence tracking only supports 1 queue per IP.
* - IBs from different contexts must wait for each other and can't execute in a random order.
*/
struct amdgpu_fence *prev_fence =
(struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || queue->last_ctx != acs->ctx))
add_seq_no_to_list(ws, &seq_no_dependencies, queue_index, prev_seq_no);
if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || queue->last_ctx != acs->ctx))
add_seq_no_to_list(ws, &seq_no_dependencies, queue_index, prev_seq_no);
}
/* Since the kernel driver doesn't synchronize execution between different
* rings automatically, we have to add fence dependencies manually. This gathers sequence
@ -1284,13 +1316,18 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
unsigned queue_index_bit = QUEUE_USES_ALT_FENCE ? 0 : BITFIELD_BIT(queue_index);
for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_add_fences_to_dependencies(ws, queue_index, &seq_no_dependencies, bo, buffer->usage);
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
buffer->usage);
if (QUEUE_USES_ALT_FENCE)
amdgpu_fence_reference(&bo->alt_fence, cs->fence);
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
/* We didn't add any slab entries into the real buffer list that will be submitted
* to the kernel. Do it now.
@ -1313,8 +1350,12 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_add_fences_to_dependencies(ws, queue_index, &seq_no_dependencies, bo, buffer->usage);
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
buffer->usage);
if (QUEUE_USES_ALT_FENCE)
amdgpu_fence_reference(&bo->alt_fence, cs->fence);
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
/* Add backing buffers of sparse buffers to the buffer list.
*
@ -1354,8 +1395,13 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_add_fences_to_dependencies(ws, queue_index, &seq_no_dependencies, bo, buffer->usage);
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
amdgpu_add_fences_to_dependencies(ws, cs, queue_index_bit, &seq_no_dependencies, bo,
buffer->usage);
if (QUEUE_USES_ALT_FENCE)
amdgpu_fence_reference(&bo->alt_fence, cs->fence);
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
}
@ -1364,7 +1410,11 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
if (QUEUE_USES_ALT_FENCE)
get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
else
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
}
@ -1417,13 +1467,15 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
}
}
/* Finally, add the IB fence into the winsys queue. */
amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
queue->latest_seq_no = next_seq_no;
((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
if (!QUEUE_USES_ALT_FENCE) {
/* Finally, add the IB fence into the fence ring of the queue. */
amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
queue->latest_seq_no = next_seq_no;
((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
/* Update the last used context in the queue. */
amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
/* Update the last used context in the queue. */
amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
}
simple_mtx_unlock(&ws->bo_fence_lock);
#if DEBUG
@ -1758,7 +1810,8 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
/* Submit. */
util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
amdgpu_cs_submit_ib, NULL, 0);
cs->uses_alt_fence ? amdgpu_cs_submit_ib<true>
: amdgpu_cs_submit_ib<false>, NULL, 0);
if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
cs->csc->secure = !cs->cst->secure;

View file

@ -125,6 +125,11 @@ struct amdgpu_cs {
enum amd_ip_type ip_type;
unsigned queue_index;
/* Whether this queue uses amdgpu_winsys_bo::alt_fence instead of generating its own
* sequence numbers for synchronization.
*/
bool uses_alt_fence;
/* We flip between these two CS. While one is being consumed
* by the kernel in another thread, the other one is being filled
* by the pipe driver. */