freedreno/drm: Async submit support

Move the submit ioctl to its own thread to unblock the driver thread
and let it move on to the next frame.

Note that I did experiment with doing the append_bo() parts
synchronously, on the theory that we would be more likely to hit the
fast path if that part of submit merging happened before the bo was
potentially re-used in the next batch/submit.  It helped some things
by a couple percent, but hurt more things.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10444>
Author: Rob Clark <robdclark@chromium.org>
Date: 2021-04-19 15:20:15 -07:00 (committed by Marge Bot)
parent 2c9e8db28d
commit e9a9ac6f77
7 changed files with 169 additions and 15 deletions
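A note for readers skimming the hunks below: the async path is built on
Mesa's util/u_queue.h work queue.  Condensed into a stand-alone sketch
(it matches the util_queue calls that appear in the diff, but uses
hypothetical function names and drops error handling; it is not the
literal driver code):

#include "util/u_queue.h"

/* Editor's sketch, not part of this commit: one queue thread ("sq")
 * runs the submit ioctl so the driver thread can move on.
 */
static struct util_queue submit_queue;

static void
submit_execute(void *job, int thread_index)
{
   /* Runs on the queue thread: this is where the actual submit ioctl
    * would be issued, off the driver thread.
    */
}

static void
submit_queue_setup(void)
{
   /* One thread; short name so the thread's comm stays readable: */
   util_queue_init(&submit_queue, "sq", 8, 1, 0);
}

static void
flush_async(void *submit, struct util_queue_fence *ready)
{
   /* Driver thread: hand the submit to the queue and keep going.
    * `ready` must have been set up with util_queue_fence_init() and
    * is signaled once submit_execute() has run, i.e. once the kernel
    * knows about the submit.
    */
   util_queue_add_job(&submit_queue, submit, ready,
                      submit_execute, NULL, 0);
}

static void
wait_flushed(struct util_queue_fence *ready)
{
   /* Anyone who needs the kernel fence must wait here first: */
   util_queue_fence_wait(ready);
}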


@@ -33,8 +33,7 @@
#include "freedreno_drmif.h"
#include "freedreno_priv.h"
struct fd_device *kgsl_device_new(int fd);
struct fd_device *msm_device_new(int fd);
struct fd_device *msm_device_new(int fd, drmVersionPtr version);
struct fd_device *
fd_device_new(int fd)
@@ -58,7 +57,7 @@ fd_device_new(int fd)
goto out;
}
dev = msm_device_new(fd);
dev = msm_device_new(fd, version);
dev->version = version->version_minor;
#if HAVE_FREEDRENO_KGSL
} else if (!strcmp(version->name, "kgsl")) {


@@ -30,7 +30,7 @@
#include <stdio.h>
#include "util/u_atomic.h"
#include "util/u_debug.h"
#include "util/u_dynarray.h"
#include "util/u_queue.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
@@ -98,6 +98,13 @@ struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
* out-fence-fd
*/
struct fd_submit_fence {
/**
* The ready fence is signaled once the submit is actually flushed down
* to the kernel, and fence/fence_fd are populated. You must wait for
* this fence to be signaled before reading fence/fence_fd.
*/
struct util_queue_fence ready;
struct fd_fence fence;
/**

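As a usage note for the new `ready` member documented above: a caller
that wants the out-fence must wait on it before reading fence/fence_fd.
A hypothetical consumer (not code from this commit; it assumes the
header patched above plus the fence_fd/use_fence_fd members that the
gallium hunks further down reference):

#include <assert.h>
#include "util/u_queue.h"
/* ...plus the freedreno/drm header that declares struct fd_submit_fence */

static int
submit_fence_get_fd(struct fd_submit_fence *out_fence)
{
   /* Blocks until the submit-queue thread has issued the submit ioctl
    * and populated fence/fence_fd:
    */
   util_queue_fence_wait(&out_fence->ready);

   assert(out_fence->use_fence_fd);
   return out_fence->fence_fd;
}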

@@ -34,6 +34,9 @@ static void
msm_device_destroy(struct fd_device *dev)
{
struct msm_device *msm_dev = to_msm_device(dev);
if (util_queue_is_initialized(&msm_dev->submit_queue)) {
util_queue_destroy(&msm_dev->submit_queue);
}
free(msm_dev);
}
@@ -45,7 +48,7 @@ static const struct fd_device_funcs funcs = {
};
struct fd_device *
msm_device_new(int fd)
msm_device_new(int fd, drmVersionPtr version)
{
struct msm_device *msm_dev;
struct fd_device *dev;
@@ -61,6 +64,15 @@ msm_device_new(int fd)
dev = &msm_dev->base;
dev->funcs = &funcs;
/* async submit_queue currently only used for msm_submit_sp: */
if (version->version_minor >= FD_VERSION_SOFTPIN) {
/* Note the name is intentionally short to avoid the queue
* thread's comm truncating the interesting part of the
* process name.
*/
util_queue_init(&msm_dev->submit_queue, "sq", 8, 1, 0);
}
dev->bo_size = sizeof(struct msm_bo);
return dev;


@@ -40,10 +40,11 @@
struct msm_device {
struct fd_device base;
struct fd_bo_cache ring_cache;
struct util_queue submit_queue;
};
FD_DEFINE_CAST(fd_device, msm_device);
struct fd_device *msm_device_new(int fd);
struct fd_device *msm_device_new(int fd, drmVersionPtr version);
struct msm_pipe {
struct fd_pipe base;
@@ -54,6 +55,14 @@ struct msm_pipe {
uint32_t chip_id;
uint32_t queue_id;
struct slab_parent_pool ring_pool;
/**
* The last fence seqno that was flushed to kernel (doesn't mean that it
* is complete, just that the kernel knows about it)
*/
uint32_t last_submit_fence;
uint32_t last_enqueue_fence; /* just for debugging */
};
FD_DEFINE_CAST(fd_pipe, msm_pipe);

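The last_submit_fence/last_enqueue_fence bookkeeping above is compared
with fd_fence_before()/fd_fence_after() in the hunks that follow.  As a
stand-alone illustration of that kind of wrap-safe 32-bit seqno
comparison (hypothetical helper names; the in-tree helpers live
elsewhere and may be implemented differently):

#include <stdbool.h>
#include <stdint.h>

/* Editor's sketch: compare fence seqnos via a signed difference so the
 * ordering survives 32-bit wrap-around.
 */
static inline bool
seqno_after(uint32_t a, uint32_t b)
{
   return (int32_t)(a - b) > 0;
}

static inline bool
seqno_before(uint32_t a, uint32_t b)
{
   return seqno_after(b, a);
}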

@@ -26,6 +26,7 @@
#include <assert.h>
#include <inttypes.h>
#include <pthread.h>
#include "util/hash_table.h"
#include "util/os_file.h"
@@ -41,6 +42,14 @@
#define INIT_SIZE 0x1000
/* In the pipe->flush() path, we don't have a util_queue_fence we can wait on,
* instead use a condition-variable. Note that pipe->flush() is not expected
* to be a common/hot path.
*/
static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER;
struct msm_submit_sp {
struct fd_submit base;
@@ -64,6 +73,13 @@ struct msm_submit_sp {
*/
int in_fence_fd;
struct fd_submit_fence *out_fence;
/* State for enqueued submits:
*/
struct list_head submit_list; /* includes this submit as last element */
/* Used in case out_fence==NULL: */
struct util_queue_fence fence;
};
FD_DEFINE_CAST(fd_submit, msm_submit_sp);
@@ -369,14 +385,65 @@ flush_submit_list(struct list_head *submit_list)
if (!bos_on_stack)
free(submit_bos);
pthread_mutex_lock(&flush_mtx);
assert(fd_fence_before(msm_pipe->last_submit_fence, msm_submit->base.fence));
msm_pipe->last_submit_fence = msm_submit->base.fence;
pthread_cond_broadcast(&flush_cnd);
pthread_mutex_unlock(&flush_mtx);
if (msm_submit->in_fence_fd != -1)
close(msm_submit->in_fence_fd);
fd_submit_del(&msm_submit->base);
return ret;
}
static void
msm_submit_sp_flush_execute(void *job, int thread_index)
{
struct fd_submit *submit = job;
struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
flush_submit_list(&msm_submit->submit_list);
DEBUG_MSG("finish: %u", submit->fence);
}
static void
msm_submit_sp_flush_cleanup(void *job, int thread_index)
{
struct fd_submit *submit = job;
fd_submit_del(submit);
}
static int
enqueue_submit_list(struct list_head *submit_list)
{
struct fd_submit *submit = last_submit(submit_list);
struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
struct msm_device *msm_dev = to_msm_device(submit->pipe->dev);
list_replace(submit_list, &msm_submit->submit_list);
list_inithead(submit_list);
struct util_queue_fence *fence;
if (msm_submit->out_fence) {
fence = &msm_submit->out_fence->ready;
} else {
util_queue_fence_init(&msm_submit->fence);
fence = &msm_submit->fence;
}
DEBUG_MSG("enqueue: %u", submit->fence);
util_queue_add_job(&msm_dev->submit_queue,
submit, fence,
msm_submit_sp_flush_execute,
msm_submit_sp_flush_cleanup,
0);
return 0;
}
static bool
should_defer(struct fd_submit *submit)
{
@@ -402,6 +469,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
struct fd_submit_fence *out_fence)
{
struct fd_device *dev = submit->pipe->dev;
struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
/* Acquire lock before flush_prep() because it is possible to race between
* this and pipe->flush():
@@ -420,15 +488,16 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
list_inithead(&dev->deferred_submits);
dev->deferred_cmds = 0;
simple_mtx_unlock(&dev->submit_lock);
flush_submit_list(&submit_list);
simple_mtx_lock(&dev->submit_lock);
enqueue_submit_list(&submit_list);
}
list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits);
bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence);
assert(fd_fence_before(msm_pipe->last_enqueue_fence, submit->fence));
msm_pipe->last_enqueue_fence = submit->fence;
/* If we don't need an out-fence, we can defer the submit.
*
* TODO we could defer submits with in-fence as well.. if we took our own
@@ -436,6 +505,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
* deferred submits
*/
if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) {
DEBUG_MSG("defer: %u", submit->fence);
dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary);
assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
simple_mtx_unlock(&dev->submit_lock);
@@ -451,19 +521,24 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
simple_mtx_unlock(&dev->submit_lock);
return flush_submit_list(&submit_list);
return enqueue_submit_list(&submit_list);
}
void
msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
{
struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
struct fd_device *dev = pipe->dev;
struct list_head submit_list;
DEBUG_MSG("flush: %u", fence);
list_inithead(&submit_list);
simple_mtx_lock(&dev->submit_lock);
assert(!fd_fence_after(fence, msm_pipe->last_enqueue_fence));
foreach_submit_safe (deferred_submit, &dev->deferred_submits) {
/* We should never have submits from multiple pipes in the deferred
* list. If we did, we couldn't compare their fence to our fence,
@@ -485,9 +560,20 @@ msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
simple_mtx_unlock(&dev->submit_lock);
if (list_is_empty(&submit_list))
return;
goto flush_sync;
flush_submit_list(&submit_list);
enqueue_submit_list(&submit_list);
flush_sync:
/* Once we are sure that we've enqueued at least up to the requested
* submit, we need to be sure that submitq has caught up and flushed
* them to the kernel
*/
pthread_mutex_lock(&flush_mtx);
while (fd_fence_before(msm_pipe->last_submit_fence, fence)) {
pthread_cond_wait(&flush_cnd, &flush_mtx);
}
pthread_mutex_unlock(&flush_mtx);
}
static void

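Stripped of the driver types, the pipe->flush() handshake added above
is a plain condition-variable pattern: the submit-queue thread
publishes last_submit_fence under flush_mtx and broadcasts, while the
flush path waits until the requested seqno has been handed to the
kernel.  A self-contained sketch (hypothetical stand-alone version,
not the driver code itself):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER;
static uint32_t last_submit_fence;

static bool
seqno_before(uint32_t a, uint32_t b)
{
   return (int32_t)(a - b) < 0;   /* wrap-safe, as in the earlier sketch */
}

/* Submit-queue thread, after the submit ioctl returns: */
static void
publish_submit(uint32_t fence)
{
   pthread_mutex_lock(&flush_mtx);
   last_submit_fence = fence;
   pthread_cond_broadcast(&flush_cnd);
   pthread_mutex_unlock(&flush_mtx);
}

/* Flush path: block until `fence` has reached the kernel. */
static void
wait_for_submit(uint32_t fence)
{
   pthread_mutex_lock(&flush_mtx);
   while (seqno_before(last_submit_fence, fence))
      pthread_cond_wait(&flush_cnd, &flush_mtx);
   pthread_mutex_unlock(&flush_mtx);
}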

@@ -59,6 +59,16 @@ fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence,
}
}
/* If after the pre-created unflushed fence is flushed, we end up
* re-populated to a previous last_fence, then *that* is the one
* whose submit_fence.ready we want to wait on:
*/
if (fence->last_fence) {
return fence_flush(pctx, fence->last_fence, timeout);
}
util_queue_fence_wait(&fence->submit_fence.ready);
/* We've already waited for batch to be flushed and fence->batch
* to be cleared:
*/
@@ -69,6 +79,8 @@ fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence,
if (fence->batch)
fd_batch_flush(fence->batch);
util_queue_fence_wait(&fence->submit_fence.ready);
debug_assert(!fence->batch);
return true;
@@ -81,19 +93,37 @@ fd_fence_repopulate(struct pipe_fence_handle *fence, struct pipe_fence_handle *l
* might have been)
*/
assert(!fence->submit_fence.use_fence_fd);
assert(!last_fence->batch);
fence->submit_fence.fence = last_fence->submit_fence.fence;
fd_fence_ref(&fence->last_fence, last_fence);
/* We have nothing to flush, so nothing will clear the batch reference
* (which is normally done when the batch is flushed), so do it now:
*/
fd_fence_set_batch(fence, NULL);
}
static void
fd_fence_destroy(struct pipe_fence_handle *fence)
{
fd_fence_ref(&fence->last_fence, NULL);
tc_unflushed_batch_token_reference(&fence->tc_token, NULL);
if (fence->submit_fence.use_fence_fd)
close(fence->submit_fence.fence_fd);
if (fence->syncobj)
drmSyncobjDestroy(fd_device_fd(fence->screen->dev), fence->syncobj);
fd_pipe_del(fence->pipe);
/* TODO might be worth trying harder to avoid a potential stall here,
* but that would require the submit somehow holding a reference to
* the pipe_fence_handle.. and I'm not sure if it is a thing that is
* likely to matter much.
*/
util_queue_fence_wait(&fence->submit_fence.ready);
FREE(fence);
}
@@ -113,6 +143,9 @@ fd_fence_finish(struct pipe_screen *pscreen, struct pipe_context *pctx,
if (!fence_flush(pctx, fence, timeout))
return false;
if (fence->last_fence)
fence = fence->last_fence;
if (fence->submit_fence.use_fence_fd) {
int ret = sync_wait(fence->submit_fence.fence_fd, timeout / 1000000);
return ret == 0;
@@ -136,9 +169,10 @@ fence_create(struct fd_context *ctx, struct fd_batch *batch, int fence_fd,
pipe_reference_init(&fence->reference, 1);
util_queue_fence_init(&fence->ready);
util_queue_fence_init(&fence->submit_fence.ready);
fence->ctx = ctx;
fence->batch = batch;
fd_fence_set_batch(fence, batch);
fence->pipe = fd_pipe_ref(ctx->pipe);
fence->screen = ctx->screen;
fence->submit_fence.fence_fd = fence_fd;
@@ -237,6 +271,7 @@ fd_fence_set_batch(struct pipe_fence_handle *fence, struct fd_batch *batch)
if (batch) {
assert(!fence->batch);
fence->batch = batch;
batch->needs_flush = true;
} else {
fence->batch = NULL;

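Taken together, the wait-related hunks above reduce to: follow
last_fence if the unflushed fence was repopulated, then wait on
submit_fence.ready before trusting fence/fence_fd.  A condensed,
hypothetical version (the real fence_flush() additionally handles
timeouts and flushing a still-attached batch):

/* Editor's sketch, not the literal fence_flush(): */
static void
wait_until_submitted(struct pipe_fence_handle *fence)
{
   /* If the pre-created unflushed fence ended up backed by a previous
    * last_fence, that is the submit we actually need to wait for:
    */
   if (fence->last_fence)
      fence = fence->last_fence;

   /* Signaled by the submit-queue thread once the submit ioctl has
    * happened and fence/fence_fd are valid:
    */
   util_queue_fence_wait(&fence->submit_fence.ready);
}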

@@ -35,6 +35,12 @@
struct pipe_fence_handle {
struct pipe_reference reference;
/* When a pre-created unflushed fence has no actual rendering to flush, and
* the last_fence optimization is used, this will be a reference to the
* *actual* fence which needs to be flushed before waiting.
*/
struct pipe_fence_handle *last_fence;
/* fence holds a weak reference to the batch until the batch is flushed, to
* accommodate PIPE_FLUSH_DEFERRED. When the batch is actually flushed, it
* is cleared (before the batch reference is dropped). If we need to wait