diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 95f7c8641b2..687cfd3a144 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -65,63 +65,48 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
       return !buffer_busy;
    }
 
-   if (timeout == 0) {
-      unsigned idle_fences;
-      bool buffer_idle;
+   simple_mtx_lock(&ws->bo_fence_lock);
 
-      simple_mtx_lock(&ws->bo_fence_lock);
+   u_foreach_bit(i, bo->fences.valid_fence_mask) {
+      struct pipe_fence_handle **fence = get_fence_from_ring(ws, &bo->fences, i);
 
-      for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
-         if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
-            break;
-      }
+      if (fence) {
+         if (timeout == 0) {
+            bool idle = amdgpu_fence_wait(*fence, 0, false);
 
-      /* Release the idle fences to avoid checking them again later. */
-      for (unsigned i = 0; i < idle_fences; ++i)
-         amdgpu_fence_reference(&bo->fences[i], NULL);
+            if (!idle) {
+               simple_mtx_unlock(&ws->bo_fence_lock);
+               return false; /* busy */
+            }
 
-      memmove(&bo->fences[0], &bo->fences[idle_fences],
-              (bo->num_fences - idle_fences) * sizeof(*bo->fences));
-      bo->num_fences -= idle_fences;
+            /* It's idle. Remove it from the ring to skip checking it again later. */
+            amdgpu_fence_reference(fence, NULL);
+         } else {
+            struct pipe_fence_handle *tmp_fence = NULL;
+            amdgpu_fence_reference(&tmp_fence, *fence);
 
-      buffer_idle = !bo->num_fences;
-      simple_mtx_unlock(&ws->bo_fence_lock);
+            /* While waiting, unlock the mutex. */
+            simple_mtx_unlock(&ws->bo_fence_lock);
 
-      return buffer_idle;
-   } else {
-      bool buffer_idle = true;
+            bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
+            if (!idle) {
+               amdgpu_fence_reference(&tmp_fence, NULL);
+               return false; /* busy */
+            }
 
-      simple_mtx_lock(&ws->bo_fence_lock);
-      while (bo->num_fences && buffer_idle) {
-         struct pipe_fence_handle *fence = NULL;
-         bool fence_idle = false;
-
-         amdgpu_fence_reference(&fence, bo->fences[0]);
-
-         /* Wait for the fence. */
-         simple_mtx_unlock(&ws->bo_fence_lock);
-         if (amdgpu_fence_wait(fence, abs_timeout, true))
-            fence_idle = true;
-         else
-            buffer_idle = false;
-         simple_mtx_lock(&ws->bo_fence_lock);
-
-         /* Release an idle fence to avoid checking it again later, keeping in
-          * mind that the fence array may have been modified by other threads.
-          */
-         if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
-            amdgpu_fence_reference(&bo->fences[0], NULL);
-            memmove(&bo->fences[0], &bo->fences[1],
-                    (bo->num_fences - 1) * sizeof(*bo->fences));
-            bo->num_fences--;
+            simple_mtx_lock(&ws->bo_fence_lock);
+            /* It's idle. Remove it from the ring to skip checking it again later. */
+            if (tmp_fence == *fence)
+               amdgpu_fence_reference(fence, NULL);
+            amdgpu_fence_reference(&tmp_fence, NULL);
          }
-
-         amdgpu_fence_reference(&fence, NULL);
       }
-      simple_mtx_unlock(&ws->bo_fence_lock);
-      return buffer_idle;
+      bo->fences.valid_fence_mask &= ~BITFIELD_BIT(i); /* remove the fence from the BO */
    }
+
+   simple_mtx_unlock(&ws->bo_fence_lock);
+   return true; /* idle */
 }
 
 static inline unsigned get_slab_entry_offset(struct amdgpu_winsys_bo *bo)
@@ -148,12 +133,7 @@ static enum radeon_bo_flag amdgpu_bo_get_flags(
 
 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
 {
-   for (unsigned i = 0; i < bo->num_fences; ++i)
-      amdgpu_fence_reference(&bo->fences[i], NULL);
-
-   FREE(bo->fences);
-   bo->num_fences = 0;
-   bo->max_fences = 0;
+   bo->fences.valid_fence_mask = 0;
 }
 
 void amdgpu_bo_destroy(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf)
@@ -937,8 +917,11 @@ sparse_free_backing_buffer(struct amdgpu_winsys *ws, struct amdgpu_bo_sparse *bo
 {
    bo->num_backing_pages -= backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE;
 
+   /* Add fences from bo to backing->bo. */
    simple_mtx_lock(&ws->bo_fence_lock);
-   amdgpu_add_fences(&backing->bo->b, bo->b.num_fences, bo->b.fences);
+   u_foreach_bit(i, bo->b.fences.valid_fence_mask) {
+      add_seq_no_to_list(ws, &backing->bo->b.fences, i, bo->b.fences.seq_no[i]);
+   }
   simple_mtx_unlock(&ws->bo_fence_lock);
 
   list_del(&backing->list);
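The rewritten amdgpu_bo_wait() above never sleeps while holding ws->bo_fence_lock: it pins the fence with a reference under the lock, unlocks, waits, re-locks, and only clears the ring slot if it still holds the same fence. A minimal standalone sketch of that pattern, separate from the patch and using toy names (toy_fence, toy_wait_slot) rather than the winsys API:

/* Toy model of the lock-drop wait pattern used in amdgpu_bo_wait(). */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct toy_fence { int refcount; bool signalled; };

static void toy_fence_reference(struct toy_fence **dst, struct toy_fence *src)
{
   /* Simplified: the real code frees the fence when its refcount drops to 0. */
   if (src)
      src->refcount++;
   if (*dst)
      (*dst)->refcount--;
   *dst = src;
}

static bool toy_fence_wait(struct toy_fence *f)
{
   return f->signalled; /* stands in for amdgpu_fence_wait() */
}

/* Wait on one ring slot without holding the mutex while waiting. */
static bool toy_wait_slot(pthread_mutex_t *lock, struct toy_fence **slot)
{
   struct toy_fence *tmp = NULL;

   pthread_mutex_lock(lock);
   toy_fence_reference(&tmp, *slot);   /* pin the fence while the lock is held */
   pthread_mutex_unlock(lock);         /* never sleep with the lock held */

   bool idle = tmp ? toy_fence_wait(tmp) : true;

   pthread_mutex_lock(lock);
   /* Another thread may have replaced the slot meanwhile; only clear it if unchanged. */
   if (idle && tmp == *slot)
      toy_fence_reference(slot, NULL);
   toy_fence_reference(&tmp, NULL);
   pthread_mutex_unlock(lock);
   return idle;
}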
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 0e4a0ee71f8..f034d148858 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -49,7 +49,8 @@ enum amdgpu_bo_type {
 
 /* Base class of the buffer object that other structures inherit. */
 struct amdgpu_winsys_bo {
    struct pb_buffer_lean base;
-   enum amdgpu_bo_type type;
+   enum amdgpu_bo_type type:8;
+   struct amdgpu_seq_no_fences fences;
 
    /* This is set when a buffer is returned by buffer_create(), not when the memory is allocated
    * as part of slab BO.
@@ -59,11 +60,6 @@ struct amdgpu_winsys_bo {
    /* how many command streams, which are being emitted in a separate
    * thread, is this bo referenced in? */
    volatile int num_active_ioctls;
-
-   /* Fences for buffer synchronization. */
-   uint16_t num_fences;
-   uint16_t max_fences;
-   struct pipe_fence_handle **fences;
 };
 
 /* Real GPU memory allocation managed by the amdgpu kernel driver.
@@ -177,6 +173,66 @@ static struct amdgpu_bo_real *get_slab_entry_real_bo(struct amdgpu_winsys_bo *bo
    return &get_bo_from_slab(((struct amdgpu_bo_slab_entry*)bo)->entry.slab)->b.b;
 }
 
+/* Given a sequence number "fences->seq_no[queue_index]", return a pointer to a non-NULL fence
+ * pointer in the queue ring corresponding to that sequence number if the fence is non-NULL.
+ * If the fence is not present in the ring (= is idle), return NULL. If it returns a non-NULL
+ * pointer and the caller finds the fence to be idle, it's recommended to use the returned pointer
+ * to set the fence to NULL in the ring, which is why we return a pointer to a pointer.
+ */
+static inline struct pipe_fence_handle **
+get_fence_from_ring(struct amdgpu_winsys *ws, struct amdgpu_seq_no_fences *fences,
+                    unsigned queue_index)
+{
+   /* The caller should check if the BO has a fence. */
+   assert(queue_index < AMDGPU_MAX_QUEUES);
+   assert(fences->valid_fence_mask & BITFIELD_BIT(queue_index));
+
+   uint_seq_no buffer_seq_no = fences->seq_no[queue_index];
+   uint_seq_no latest_seq_no = ws->queues[queue_index].latest_seq_no;
+   bool fence_present = latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE;
+
+   if (fence_present) {
+      struct pipe_fence_handle **fence =
+         &ws->queues[queue_index].fences[buffer_seq_no % AMDGPU_FENCE_RING_SIZE];
+
+      if (*fence)
+         return fence;
+   }
+
+   /* If the sequence number references a fence that is not present, it's guaranteed to be idle
+    * because the winsys always waits for the oldest fence when it removes it from the ring.
+    */
+   fences->valid_fence_mask &= ~BITFIELD_BIT(queue_index);
+   return NULL;
+}
+
+static inline uint_seq_no pick_latest_seq_no(struct amdgpu_winsys *ws, unsigned queue_index,
+                                             uint_seq_no n1, uint_seq_no n2)
+{
+   uint_seq_no latest = ws->queues[queue_index].latest_seq_no;
+
+   /* Since sequence numbers can wrap around, we need to pick the later number that's logically
+    * before "latest". The trick is to subtract "latest + 1" to underflow the integer such
+    * that "latest" becomes UINT*_MAX, and then just return the maximum.
+    */
+   uint_seq_no s1 = n1 - latest - 1;
+   uint_seq_no s2 = n2 - latest - 1;
+
+   return s1 >= s2 ? n1 : n2;
+}
+
+static inline void add_seq_no_to_list(struct amdgpu_winsys *ws, struct amdgpu_seq_no_fences *fences,
+                                      unsigned queue_index, uint_seq_no seq_no)
+{
+   if (fences->valid_fence_mask & BITFIELD_BIT(queue_index)) {
+      fences->seq_no[queue_index] = pick_latest_seq_no(ws, queue_index, seq_no,
+                                                       fences->seq_no[queue_index]);
+   } else {
+      fences->seq_no[queue_index] = seq_no;
+      fences->valid_fence_mask |= BITFIELD_BIT(queue_index);
+   }
+}
+
 bool amdgpu_bo_can_reclaim(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf);
 struct pb_buffer_lean *amdgpu_bo_create(struct amdgpu_winsys *ws,
                                         uint64_t size,
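The helpers added to amdgpu_bo.h above rely on modular uint16_t arithmetic: a sequence number is still "in the ring" while latest_seq_no - buffer_seq_no stays below the ring size, and pick_latest_seq_no() orders two numbers by shifting them so that "latest" maps to the largest value. A self-contained sketch of the same arithmetic, separate from the patch and using toy names (RING_SIZE, seq_no_in_ring, pick_latest):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t uint_seq_no;
#define RING_SIZE 32   /* stands in for AMDGPU_FENCE_RING_SIZE */

/* True if buffer_seq_no still has a fence slot in the ring. */
static int seq_no_in_ring(uint_seq_no latest, uint_seq_no buffer_seq_no)
{
   return (uint_seq_no)(latest - buffer_seq_no) < RING_SIZE;
}

/* Pick whichever of n1/n2 was submitted later, given the queue's latest seq_no. */
static uint_seq_no pick_latest(uint_seq_no latest, uint_seq_no n1, uint_seq_no n2)
{
   /* Subtracting (latest + 1) maps "latest" to UINT16_MAX, so later numbers map higher. */
   uint_seq_no s1 = n1 - latest - 1;
   uint_seq_no s2 = n2 - latest - 1;
   return s1 >= s2 ? n1 : n2;
}

int main(void)
{
   uint_seq_no latest = 5; /* the counter wrapped past 65535 a few submissions ago */

   printf("65530 in ring: %d\n", seq_no_in_ring(latest, 65530)); /* 1: only 11 submissions old */
   printf("60000 in ring: %d\n", seq_no_in_ring(latest, 60000)); /* 0: fell out of the ring */
   printf("later of 65534 and 3: %u\n", pick_latest(latest, 65534, 3)); /* 3 */
   return 0;
}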
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b2c0ac2f5ae..55689fb5215 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -51,6 +51,7 @@ amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
    }
 
    util_queue_fence_init(&fence->submitted);
+   fence->imported = true;
 
    assert(amdgpu_fence_is_syncobj(fence));
    return (struct pipe_fence_handle*)fence;
@@ -84,6 +85,7 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
    }
 
    util_queue_fence_init(&fence->submitted);
+   fence->imported = true;
 
    return (struct pipe_fence_handle*)fence;
 }
@@ -975,6 +977,23 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
    cs->has_chaining = ctx->ws->info.gfx_level >= GFX7 &&
                       (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
 
+   /* Compute the queue index by counting the IPs that have queues. */
+   assert(ip_type < ARRAY_SIZE(ctx->ws->info.ip));
+   assert(ctx->ws->info.ip[ip_type].num_queues);
+   cs->queue_index = 0;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
+      if (!ctx->ws->info.ip[i].num_queues)
+         continue;
+
+      if (i == ip_type)
+         break;
+
+      cs->queue_index++;
+   }
+
+   assert(cs->queue_index < AMDGPU_MAX_QUEUES);
+
    struct amdgpu_cs_fence_info fence_info;
    fence_info.handle = cs->ctx->user_fence_bo;
    fence_info.offset = cs->ip_type * 4;
@@ -1190,27 +1209,6 @@ static void add_fence_to_list(struct amdgpu_fence_list *fences,
    amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
 }
 
-static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
-                                     struct amdgpu_fence *fence)
-{
-   struct amdgpu_cs_context *cs = acs->csc;
-
-   /* Detect no-op dependencies only when there is only 1 ring,
-    * because IBs on one ring are always executed one at a time.
-    *
-    * We always want no dependency between back-to-back gfx IBs, because
-    * we need the parallelism between IBs for good performance.
-    */
-   if ((acs->ip_type == AMD_IP_GFX ||
-        acs->ws->info.ip[acs->ip_type].num_queues == 1) &&
-       !amdgpu_fence_is_syncobj(fence) &&
-       fence->ctx == acs->ctx &&
-       fence->fence.ip_type == cs->chunk_ib[IB_MAIN].ip_type)
-      return true;
-
-   return amdgpu_fence_wait((void *)fence, 0, false);
-}
-
 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
                                            struct pipe_fence_handle *pfence,
                                            unsigned dependency_flags)
@@ -1221,7 +1219,8 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
 
    util_queue_fence_wait(&fence->submitted);
 
-   if (is_noop_fence_dependency(acs, fence))
+   /* Ignore non-imported idle fences. This will only check the user fence in memory. */
+   if (!fence->imported && amdgpu_fence_wait((void *)fence, 0, false))
       return;
 
    if (amdgpu_fence_is_syncobj(fence))
@@ -1230,94 +1229,30 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
    add_fence_to_list(&cs->fence_dependencies, fence);
 }
 
-static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
-                                             struct amdgpu_cs_context *cs,
-                                             struct amdgpu_cs_buffer *buffer)
-{
-   struct amdgpu_winsys_bo *bo = buffer->bo;
-   unsigned new_num_fences = 0;
-   const unsigned num_fences = bo->num_fences;
-
-   for (unsigned j = 0; j < num_fences; ++j) {
-      struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
-
-      if (is_noop_fence_dependency(acs, bo_fence))
-         continue;
-
-      amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
-      new_num_fences++;
-
-      if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
-         continue;
-
-      add_fence_to_list(&cs->fence_dependencies, bo_fence);
-   }
-
-   for (unsigned j = new_num_fences; j < num_fences; ++j)
-      amdgpu_fence_reference(&bo->fences[j], NULL);
-
-   bo->num_fences = new_num_fences;
-}
-
-/* Add the given list of fences to the buffer's fence list.
- *
- * Must be called with the winsys bo_fence_lock held.
- */
-void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
-                       unsigned num_fences,
-                       struct pipe_fence_handle **fences)
-{
-   if (bo->num_fences + num_fences > bo->max_fences) {
-      unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2);
-      struct pipe_fence_handle **new_fences =
-         REALLOC(bo->fences,
-                 bo->num_fences * sizeof(*new_fences),
-                 new_max_fences * sizeof(*new_fences));
-      if (likely(new_fences && new_max_fences < UINT16_MAX)) {
-         bo->fences = new_fences;
-         bo->max_fences = new_max_fences;
-      } else {
-         unsigned drop;
-
-         fprintf(stderr, new_fences ? "amdgpu_add_fences: too many fences, dropping some\n" : "amdgpu_add_fences: allocation failure, dropping fence(s)\n");
-         free(new_fences);
-
-         if (!bo->num_fences)
-            return;
-
-         bo->num_fences--; /* prefer to keep the most recent fence if possible */
-         amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
-
-         drop = bo->num_fences + num_fences - bo->max_fences;
-         num_fences -= drop;
-         fences += drop;
-      }
-   }
-
-   unsigned bo_num_fences = bo->num_fences;
-
-   for (unsigned i = 0; i < num_fences; ++i) {
-      bo->fences[bo_num_fences] = NULL;
-      amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]);
-      bo_num_fences++;
-   }
-   bo->num_fences = bo_num_fences;
-}
-
 static void amdgpu_add_bo_fences_to_dependencies(struct amdgpu_cs *acs,
-                                                 struct amdgpu_cs_context *cs,
-                                                 struct pipe_fence_handle *fence,
+                                                 struct amdgpu_seq_no_fences *dependencies,
+                                                 uint_seq_no new_queue_seq_no,
                                                  struct amdgpu_buffer_list *list)
 {
+   struct amdgpu_winsys *ws = acs->ws;
+   unsigned queue_index = acs->queue_index;
    unsigned num_buffers = list->num_buffers;
 
    for (unsigned i = 0; i < num_buffers; i++) {
       struct amdgpu_cs_buffer *buffer = &list->buffers[i];
       struct amdgpu_winsys_bo *bo = buffer->bo;
 
-      amdgpu_add_bo_fence_dependencies(acs, cs, buffer);
-      amdgpu_add_fences(bo, 1, &fence);
+      /* Add BO fences from queues other than 'queue_index' to dependencies. */
+      if (buffer->usage & RADEON_USAGE_SYNCHRONIZED) {
+         u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~BITFIELD_BIT(queue_index)) {
+            add_seq_no_to_list(ws, dependencies, other_queue_idx,
                               bo->fences.seq_no[other_queue_idx]);
+         }
+      }
+
+      /* Also set the fence in the BO. */
+      bo->fences.seq_no[queue_index] = new_queue_seq_no;
+      bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
    }
 }
"amdgpu_add_fences: too many fences, dropping some\n" - : "amdgpu_add_fences: allocation failure, dropping fence(s)\n"); - free(new_fences); - - if (!bo->num_fences) - return; - - bo->num_fences--; /* prefer to keep the most recent fence if possible */ - amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL); - - drop = bo->num_fences + num_fences - bo->max_fences; - num_fences -= drop; - fences += drop; - } - } - - unsigned bo_num_fences = bo->num_fences; - - for (unsigned i = 0; i < num_fences; ++i) { - bo->fences[bo_num_fences] = NULL; - amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]); - bo_num_fences++; - } - bo->num_fences = bo_num_fences; -} - static void amdgpu_add_bo_fences_to_dependencies(struct amdgpu_cs *acs, - struct amdgpu_cs_context *cs, - struct pipe_fence_handle *fence, + struct amdgpu_seq_no_fences *dependencies, + uint_seq_no new_queue_seq_no, struct amdgpu_buffer_list *list) { + struct amdgpu_winsys *ws = acs->ws; + unsigned queue_index = acs->queue_index; unsigned num_buffers = list->num_buffers; for (unsigned i = 0; i < num_buffers; i++) { struct amdgpu_cs_buffer *buffer = &list->buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; - amdgpu_add_bo_fence_dependencies(acs, cs, buffer); - amdgpu_add_fences(bo, 1, &fence); + /* Add BO fences from queues other than 'queue_index' to dependencies. */ + if (buffer->usage & RADEON_USAGE_SYNCHRONIZED) { + u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~BITFIELD_BIT(queue_index)) { + add_seq_no_to_list(ws, dependencies, other_queue_idx, + bo->fences.seq_no[other_queue_idx]); + } + } + + /* Also set the fence in the BO. */ + bo->fences.seq_no[queue_index] = new_queue_seq_no; + bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index); } } @@ -1378,11 +1313,112 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) bool has_user_fence = amdgpu_cs_has_user_fence(cs); simple_mtx_lock(&ws->bo_fence_lock); - /* Since the kernel driver doesn't synchronize execution between different - * rings automatically, we have to add fence dependencies manually. + struct amdgpu_queue *queue = &ws->queues[acs->queue_index]; + uint_seq_no prev_seq_no = queue->latest_seq_no; + + /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno, + * but the values aren't related. */ - for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) - amdgpu_add_bo_fences_to_dependencies(acs, cs, cs->fence, &cs->buffer_lists[i]); + uint_seq_no next_seq_no = prev_seq_no + 1; + + /* Wait for the oldest fence to signal. This should always check the user fence, then wait + * via the ioctl. We have to do this because we are going to release the oldest fence and + * replace it with the latest fence in the ring. + */ + struct pipe_fence_handle **oldest_fence = + &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE]; + + if (*oldest_fence) { + if (!amdgpu_fence_wait(*oldest_fence, 0, false)) { + /* Take the reference because the fence can be released by other threads after we + * unlock the mutex. + */ + struct pipe_fence_handle *tmp_fence = NULL; + amdgpu_fence_reference(&tmp_fence, *oldest_fence); + + /* Unlock the mutex before waiting. */ + simple_mtx_unlock(&ws->bo_fence_lock); + amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false); + amdgpu_fence_reference(&tmp_fence, NULL); + simple_mtx_lock(&ws->bo_fence_lock); + } + + /* Remove the idle fence from the ring. */ + amdgpu_fence_reference(oldest_fence, NULL); + } + + /* We'll accumulate sequence numbers in this structure. 
+   /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
+    * sequence number per queue and removes all older ones.
+    */
+   struct amdgpu_seq_no_fences seq_no_dependencies;
+   seq_no_dependencies.valid_fence_mask = 0;
+
+   /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
+    * make it appear as if it had only 1 queue, or if the previous IB comes from a different
+    * context. The reasons are:
+    * - Our BO fence tracking only supports 1 queue per IP.
+    * - IBs from different contexts must wait for each other and can't execute in a random order.
+    */
+   struct amdgpu_fence *prev_fence =
+      (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
+
+   if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || prev_fence->ctx != acs->ctx))
+      add_seq_no_to_list(ws, &seq_no_dependencies, acs->queue_index, prev_seq_no);
+
+   /* Since the kernel driver doesn't synchronize execution between different
+    * rings automatically, we have to add fence dependencies manually. This gathers sequence
+    * numbers from BOs and sets the next sequence number in the BOs.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
+      amdgpu_add_bo_fences_to_dependencies(acs, &seq_no_dependencies, next_seq_no,
+                                           &cs->buffer_lists[i]);
+   }
+
+#if 0 /* Debug code. */
+   printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
+
+   /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
+   for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
+      if (i == acs->queue_index)
+         continue;
+
+      struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
+      if (!fence) {
+         if (i <= 1)
+            printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
+         continue;
+      }
+
+      bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
+      uint_seq_no old = seq_no_dependencies.seq_no[i];
+      add_seq_no_to_list(ws, &seq_no_dependencies, i, ws->queues[i].latest_seq_no);
+      uint_seq_no new = seq_no_dependencies.seq_no[i];
+
+      if (!valid)
+         printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
+      else if (old != new)
+         printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
+      else
+         printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
+   }
+#endif
+
+   /* Convert the sequence numbers we gathered to fence dependencies. */
+   u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
+      struct pipe_fence_handle **fence = get_fence_from_ring(ws, &seq_no_dependencies, i);
+
+      if (fence) {
+         /* If it's idle, don't add it to the list of dependencies. */
+         if (amdgpu_fence_wait(*fence, 0, false))
+            amdgpu_fence_reference(fence, NULL);
+         else
+            add_fence_to_list(&cs->fence_dependencies, (struct amdgpu_fence*)*fence);
+      }
+   }
+
+   /* Finally, add the IB fence into the winsys queue. */
+   amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
+   queue->latest_seq_no = next_seq_no;
    simple_mtx_unlock(&ws->bo_fence_lock);
 
    struct drm_amdgpu_bo_list_entry *bo_list = NULL;
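The submission path above rotates a fixed-size fence ring per queue: before storing the fence for sequence number N, it waits for whatever still occupies slot N % AMDGPU_FENCE_RING_SIZE and releases it, which is what allows any sequence number that fell out of the ring to be treated as idle. A reduced, standalone model of that step, separate from the patch (toy types, a busy-wait instead of the real fence wait, no locking):

#include <stdbool.h>
#include <stdint.h>

#define RING_SIZE 32   /* stands in for AMDGPU_FENCE_RING_SIZE */
typedef uint16_t uint_seq_no;

struct toy_fence { bool signalled; };

struct toy_queue {
   struct toy_fence *fences[RING_SIZE];
   uint_seq_no latest_seq_no;
};

/* Blocks until f signals; stands in for amdgpu_fence_wait(..., OS_TIMEOUT_INFINITE, ...). */
static void toy_fence_wait(struct toy_fence *f)
{
   while (!f->signalled) {
   }
}

static uint_seq_no toy_submit(struct toy_queue *q, struct toy_fence *new_fence)
{
   uint_seq_no next = q->latest_seq_no + 1;
   struct toy_fence **oldest = &q->fences[next % RING_SIZE];

   if (*oldest) {
      toy_fence_wait(*oldest); /* everything older than the ring is now idle */
      *oldest = NULL;          /* the real code drops a fence reference here */
   }

   q->fences[next % RING_SIZE] = new_fence;
   q->latest_seq_no = next;
   return next;               /* the value stored in each BO's fences.seq_no[queue_index] */
}

int main(void)
{
   struct toy_queue q = {0};
   struct toy_fence a = { .signalled = true };
   struct toy_fence b = { .signalled = true };

   toy_submit(&q, &a);                 /* seq_no 1 */
   uint_seq_no n = toy_submit(&q, &b); /* seq_no 2 */
   return n == 2 ? 0 : 1;
}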
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index c922efc596f..5b505af00bc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -119,6 +119,7 @@ struct amdgpu_cs {
    */
    struct drm_amdgpu_cs_chunk_fence fence_chunk;
    enum amd_ip_type ip_type;
+   unsigned queue_index;
 
    /* We flip between these two CS.
    * While one is being consumed
    * by the kernel in another thread, the other one is being filled
@@ -166,6 +167,7 @@ struct amdgpu_fence {
    struct util_queue_fence submitted;
    volatile int signalled; /* bool (int for atomicity) */
+   bool imported;
 };
 
 static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)
@@ -242,9 +244,6 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
 
 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                        bool absolute);
-void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
-                       unsigned num_fences,
-                       struct pipe_fence_handle **fences);
 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs);
 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index efd9c18c32f..8d3ef782a25 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -72,6 +72,11 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
    if (ws->reserve_vmid)
       amdgpu_vm_unreserve_vmid(ws->dev, 0);
 
+   for (unsigned i = 0; i < ARRAY_SIZE(ws->queues); i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(ws->queues[i].fences); j++)
+         amdgpu_fence_reference(&ws->queues[i].fences[j], NULL);
+   }
+
    if (util_queue_is_initialized(&ws->cs_queue))
      util_queue_destroy(&ws->cs_queue);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 4c552461bca..70564e41bbc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -62,11 +62,88 @@ struct amdgpu_screen_winsys {
    struct hash_table *kms_handles;
 };
 
+/* At most this number of IBs can be busy per queue. When submitting a new IB and the oldest IB
+ * ("AMDGPU_FENCE_RING_SIZE" IBs ago) is still busy, the CS thread will wait for it and will
+ * also block all queues from submitting new IBs.
+ */
+#define AMDGPU_FENCE_RING_SIZE 32
+
+/* The maximum number of queues that can be present. */
+#define AMDGPU_MAX_QUEUES 6
+
+/* This can be any integer type because the logic handles integer wraparound robustly, but
+ * uint8_t wraps around so quickly that some BOs might never become idle: we don't remove idle
+ * fences from BOs, so a BO can look "busy" again after a queue sequence number wraps around,
+ * and it may then stay "busy" in pb_cache long enough that we run out of memory.
+ */
+typedef uint16_t uint_seq_no;
+
+struct amdgpu_queue {
+   /* Ring buffer of fences.
+    *
+    * We only remember a certain number of the most recent fences per queue. When we add a new
+    * fence, we wait for the oldest one, which implies that all older fences not present
+    * in the ring are idle. This way we don't have to keep track of a million fence references
+    * for a million BOs.
+    *
+    * We only support 1 queue per IP. If an IP has multiple queues, we always add a fence
+    * dependency on the previous fence to make it behave like there is only 1 queue.
+    *
+    * amdgpu_winsys_bo doesn't have a list of fences. It only remembers the last sequence number
+    * for every queue where it was used. We then use the BO's sequence number to look up a fence
+    * in this ring.
+    */
+   struct pipe_fence_handle *fences[AMDGPU_FENCE_RING_SIZE];
+
+   /* The sequence number of the latest fence.
+    *
+    * This sequence number is global per queue per device, shared by all contexts, and generated
+    * by the winsys, not the kernel.
+    *
+    * The latest fence is: fences[latest_seq_no % AMDGPU_FENCE_RING_SIZE]
+    * The oldest fence is: fences[(latest_seq_no + 1) % AMDGPU_FENCE_RING_SIZE]
+    * The oldest sequence number in the ring: latest_seq_no - AMDGPU_FENCE_RING_SIZE + 1
+    *
+    * The sequence number is in the ring if:
+    *    latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE
+    * If the sequence number is not in the ring, it's idle.
+    *
+    * Integer wraparounds of the sequence number behave as follows:
+    *
+    * The comparison above gives the correct answer if buffer_seq_no isn't older than UINT*_MAX.
+    * If it's older than UINT*_MAX but not older than UINT*_MAX + AMDGPU_FENCE_RING_SIZE, we
+    * incorrectly pick and wait for one of the fences in the ring. That's only a problem when
+    * the type is so small (uint8_t) that seq_no wraps around very frequently, causing BOs to
+    * never become idle in certain very unlucky scenarios and making us run out of memory.
+    */
+   uint_seq_no latest_seq_no;
+};
+
+/* This is part of every BO. */
+struct amdgpu_seq_no_fences {
+   /* A fence sequence number per queue. This number is used to look up the fence from
+    * struct amdgpu_queue.
+    *
+    * This sequence number is global per queue per device, shared by all contexts, and generated
+    * by the winsys, not the kernel.
+    */
+   uint_seq_no seq_no[AMDGPU_MAX_QUEUES];
+
+   /* The mask of queues where seq_no[i] is valid. */
+   uint8_t valid_fence_mask;
+};
+
+/* valid_fence_mask should have 1 bit for each queue. */
+static_assert(sizeof(((struct amdgpu_seq_no_fences*)NULL)->valid_fence_mask) * 8 >= AMDGPU_MAX_QUEUES, "");
+
 struct amdgpu_winsys {
    struct pipe_reference reference; /* See comment above */
 
    int fd;
 
+   /* Protected by bo_fence_lock. */
+   struct amdgpu_queue queues[AMDGPU_MAX_QUEUES];
+
    struct pb_cache bo_cache;
    struct pb_slabs bo_slabs; /* Slab allocator. */
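The wraparound caveat documented in the latest_seq_no comment can be checked numerically: a stale per-BO sequence number only looks "in the ring" again after the queue counter advances by a full wrap of the integer type, and then only for a window of AMDGPU_FENCE_RING_SIZE submissions. A small standalone calculation, separate from the patch (looks_in_ring mirrors the in-ring test; RING_SIZE stands in for AMDGPU_FENCE_RING_SIZE):

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 32
typedef uint16_t uint_seq_no;

static int looks_in_ring(uint_seq_no latest, uint_seq_no bo_seq)
{
   return (uint_seq_no)(latest - bo_seq) < RING_SIZE;
}

int main(void)
{
   uint_seq_no bo_seq = 100; /* BO last used at seq_no 100 and never touched again */

   /* Far from a wrap, the stale number is correctly treated as idle. */
   printf("%d\n", looks_in_ring(40100, bo_seq));                           /* 0 */

   /* After 65536 more submissions the counter revisits the same values, so the stale
    * number is mistaken for a live one, but only for RING_SIZE submissions; with uint8_t
    * this window would recur every 256 submissions, which is why uint16_t is used.
    */
   printf("%d\n", looks_in_ring((uint_seq_no)(100 + 65536 + 10), bo_seq)); /* 1 */
   printf("%d\n", looks_in_ring((uint_seq_no)(100 + 65536 + 40), bo_seq)); /* 0 */
   return 0;
}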