winsys/amdgpu: do not synchronize unsynchronized buffers
When a buffer is added to a CS without the SYNCHRONIZED usage flag, we now
no longer add a dependency on the buffer's fence(s).

However, we still need to add a fence to the buffer during flush, so that
cache reclaim works correctly (and in the hypothetical case that the buffer
is later added to a CS _with_ the SYNCHRONIZED flag).

It is now possible that the submissions referring to a buffer are no longer
linearly ordered, so we may have to keep multiple fences around. We keep the
fences in a FIFO, which should usually stay quite short (# of contexts * 2,
for the gfx + dma rings).

While we're at it, extract amdgpu_add_fence_dependency for a single buffer,
which will make it easier to add the distinction between the real buffer and
slab cases later.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
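For orientation, here is a minimal standalone sketch of the per-buffer fence FIFO described above. This is not Mesa code: the struct fence / struct buffer types and the buffer_add_fence / buffer_prune_idle_fences helpers are simplified stand-ins for amdgpu_winsys_bo, amdgpu_add_fence and the idle-fence pruning done in amdgpu_bo_wait and amdgpu_add_fence_dependency, with reference counting and locking left out.

/* fence_fifo_sketch.c - simplified illustration, not the Mesa data structures */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fence {
   int id;
   bool idle; /* has the submission that produced this fence completed? */
};

struct buffer {
   struct fence **fences; /* FIFO: oldest fence first */
   unsigned num_fences;
   unsigned max_fences;
};

/* Append a fence at flush time, growing the array geometrically
 * (the same growth idea as amdgpu_add_fence in the patch below). */
static void buffer_add_fence(struct buffer *bo, struct fence *f)
{
   if (bo->num_fences >= bo->max_fences) {
      unsigned new_max = bo->max_fences ? bo->max_fences * 2 : 1;
      struct fence **grown = realloc(bo->fences, new_max * sizeof(*grown));
      assert(grown);
      bo->fences = grown;
      bo->max_fences = new_max;
   }
   bo->fences[bo->num_fences++] = f;
}

/* Drop fences that have already signalled and keep the rest in order -
 * the compaction step that amdgpu_bo_wait (timeout == 0) and
 * amdgpu_add_fence_dependency perform on bo->fences. */
static void buffer_prune_idle_fences(struct buffer *bo)
{
   unsigned kept = 0;
   for (unsigned i = 0; i < bo->num_fences; ++i) {
      if (!bo->fences[i]->idle)
         bo->fences[kept++] = bo->fences[i];
   }
   bo->num_fences = kept;
}

int main(void)
{
   struct buffer bo = {0};
   struct fence gfx = {0, true};  /* e.g. an already-idle gfx submission */
   struct fence dma = {1, false}; /* e.g. a still-pending dma submission */

   buffer_add_fence(&bo, &gfx);
   buffer_add_fence(&bo, &dma);
   buffer_prune_idle_fences(&bo);

   printf("pending fences: %u (first id %d)\n",
          bo.num_fences, bo.num_fences ? bo.fences[0]->id : -1);
   free(bo.fences);
   return 0;
}

The actual change below additionally skips the CS dependency for buffers used without RADEON_USAGE_SYNCHRONIZED and takes ws->bo_fence_lock around every update of the fence array.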
This commit is contained in:
parent 6d89a40676
commit 5af9eef719
3 changed files with 121 additions and 48 deletions
@@ -73,43 +73,58 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
    }
 
    if (timeout == 0) {
-      pipe_mutex_lock(ws->bo_fence_lock);
-      if (bo->fence) {
-         if (amdgpu_fence_wait(bo->fence, 0, false)) {
-            /* Release the idle fence to avoid checking it again later. */
-            amdgpu_fence_reference(&bo->fence, NULL);
-         } else {
-            pipe_mutex_unlock(ws->bo_fence_lock);
-            return false;
-         }
-      }
-      pipe_mutex_unlock(ws->bo_fence_lock);
-      return true;
+      unsigned idle_fences;
+      bool buffer_idle;
+
+      pipe_mutex_lock(ws->bo_fence_lock);
+
+      for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
+         if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
+            break;
+      }
+
+      /* Release the idle fences to avoid checking them again later. */
+      for (unsigned i = 0; i < idle_fences; ++i)
+         amdgpu_fence_reference(&bo->fences[i], NULL);
+
+      memmove(&bo->fences[0], &bo->fences[idle_fences],
+              (bo->num_fences - idle_fences) * sizeof(*bo->fences));
+      bo->num_fences -= idle_fences;
+
+      buffer_idle = !bo->num_fences;
+      pipe_mutex_unlock(ws->bo_fence_lock);
+
+      return buffer_idle;
    } else {
-      struct pipe_fence_handle *fence = NULL;
-      bool fence_idle = false;
       bool buffer_idle = true;
 
-      /* Take a reference to the fences, so that we can wait for it
-       * without the lock. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      amdgpu_fence_reference(&fence, bo->fence);
-      pipe_mutex_unlock(ws->bo_fence_lock);
+      while (bo->num_fences && buffer_idle) {
+         struct pipe_fence_handle *fence = NULL;
+         bool fence_idle = false;
 
-      /* Now wait for the fence. */
-      if (fence) {
+         amdgpu_fence_reference(&fence, bo->fences[0]);
+
+         /* Wait for the fence. */
+         pipe_mutex_unlock(ws->bo_fence_lock);
          if (amdgpu_fence_wait(fence, abs_timeout, true))
            fence_idle = true;
         else
            buffer_idle = false;
-      }
+         pipe_mutex_lock(ws->bo_fence_lock);
 
-      /* Release idle fences to avoid checking them again later. */
-      pipe_mutex_lock(ws->bo_fence_lock);
-      if (fence == bo->fence && fence_idle)
-         amdgpu_fence_reference(&bo->fence, NULL);
-
-      amdgpu_fence_reference(&fence, NULL);
+         /* Release an idle fence to avoid checking it again later, keeping in
+          * mind that the fence array may have been modified by other threads.
+          */
+         if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
+            amdgpu_fence_reference(&bo->fences[0], NULL);
+            memmove(&bo->fences[0], &bo->fences[1],
+                    (bo->num_fences - 1) * sizeof(*bo->fences));
+            bo->num_fences--;
+         }
+
+         amdgpu_fence_reference(&fence, NULL);
+      }
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       return buffer_idle;
@@ -122,6 +137,16 @@ static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
    return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
 }
 
+static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
+{
+   for (unsigned i = 0; i < bo->num_fences; ++i)
+      amdgpu_fence_reference(&bo->fences[i], NULL);
+
+   FREE(bo->fences);
+   bo->num_fences = 0;
+   bo->max_fences = 0;
+}
+
 void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
@@ -135,7 +160,7 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
    amdgpu_va_range_free(bo->va_handle);
    amdgpu_bo_free(bo->bo);
 
-   amdgpu_fence_reference(&bo->fence, NULL);
+   amdgpu_bo_remove_fences(bo);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
@@ -62,8 +62,10 @@ struct amdgpu_winsys_bo {
     */
    volatile int is_shared; /* bool (int for atomicity) */
 
-   /* Fence for buffer synchronization. */
-   struct pipe_fence_handle *fence;
+   /* Fences for buffer synchronization. */
+   unsigned num_fences;
+   unsigned max_fences;
+   struct pipe_fence_handle **fences;
 
    struct list_head global_list_item;
 };
@@ -801,33 +801,33 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
 
 DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
 
 /* Since the kernel driver doesn't synchronize execution between different
  * rings automatically, we have to add fence dependencies manually.
  */
-static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
+static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
+                                        struct amdgpu_cs_buffer *buffer)
 {
    struct amdgpu_cs_context *cs = acs->csc;
-   int i;
+   struct amdgpu_winsys_bo *bo = buffer->bo;
+   struct amdgpu_cs_fence *dep;
+   unsigned new_num_fences = 0;
 
-   cs->request.number_of_dependencies = 0;
-
-   for (i = 0; i < cs->num_buffers; i++) {
-      struct amdgpu_cs_fence *dep;
+   for (unsigned j = 0; j < bo->num_fences; ++j) {
+      struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
       unsigned idx;
 
-      struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence;
-      if (!bo_fence)
-         continue;
-
       if (bo_fence->ctx == acs->ctx &&
-         bo_fence->fence.ip_type == cs->request.ip_type &&
-         bo_fence->fence.ip_instance == cs->request.ip_instance &&
-         bo_fence->fence.ring == cs->request.ring)
+          bo_fence->fence.ip_type == cs->request.ip_type &&
+          bo_fence->fence.ip_instance == cs->request.ip_instance &&
+          bo_fence->fence.ring == cs->request.ring)
          continue;
 
       if (amdgpu_fence_wait((void *)bo_fence, 0, false))
         continue;
 
+      amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
+      new_num_fences++;
+
+      if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
+         continue;
+
       if (bo_fence->submission_in_progress)
          os_wait_until_zero(&bo_fence->submission_in_progress,
                             PIPE_TIMEOUT_INFINITE);
@@ -844,6 +844,52 @@ static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
       dep = &cs->request.dependencies[idx];
       memcpy(dep, &bo_fence->fence, sizeof(*dep));
    }
+
+   for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
+      amdgpu_fence_reference(&bo->fences[j], NULL);
+
+   bo->num_fences = new_num_fences;
 }
 
+/* Since the kernel driver doesn't synchronize execution between different
+ * rings automatically, we have to add fence dependencies manually.
+ */
+static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
+{
+   struct amdgpu_cs_context *cs = acs->csc;
+   int i;
+
+   cs->request.number_of_dependencies = 0;
+
+   for (i = 0; i < cs->num_buffers; i++)
+      amdgpu_add_fence_dependency(acs, &cs->buffers[i]);
+}
+
+static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo,
+                             struct pipe_fence_handle *fence)
+{
+   if (bo->num_fences >= bo->max_fences) {
+      unsigned new_max_fences = MAX2(1, bo->max_fences * 2);
+      struct pipe_fence_handle **new_fences =
+         REALLOC(bo->fences,
+                 bo->num_fences * sizeof(*new_fences),
+                 new_max_fences * sizeof(*new_fences));
+      if (new_fences) {
+         bo->fences = new_fences;
+         bo->max_fences = new_max_fences;
+      } else {
+         fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n");
+         if (!bo->num_fences)
+            return;
+
+         bo->num_fences--; /* prefer to keep a more recent fence if possible */
+         amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
+      }
+   }
+
+   bo->fences[bo->num_fences] = NULL;
+   amdgpu_fence_reference(&bo->fences[bo->num_fences], fence);
+   bo->num_fences++;
+}
+
 void amdgpu_cs_submit_ib(void *job, int thread_index)
@@ -1031,9 +1077,9 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
       pipe_mutex_lock(ws->bo_fence_lock);
       amdgpu_add_fence_dependencies(cs);
       for (i = 0; i < num_buffers; i++) {
-         p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
-         amdgpu_fence_reference(&cur->buffers[i].bo->fence,
-                                cur->fence);
+         struct amdgpu_winsys_bo *bo = cur->buffers[i].bo;
+         p_atomic_inc(&bo->num_active_ioctls);
+         amdgpu_add_fence(bo, cur->fence);
       }
       pipe_mutex_unlock(ws->bo_fence_lock);
 