radv: Add wait-before-submit support for timelines.

This is actually a non-threaded implementation. I'd summarize this
as event-based submission.

When a submit happens we walk the tree of submissions that depend on
the syncobj signal operations being submitted, and if those submissions
have no other dependencies we start executing them immediately.

Or, well I still use a list to avoid issues with long chains and
the stacksize when using recursion.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
This commit is contained in:
Bas Nieuwenhuizen 2019-10-28 02:44:54 +01:00
parent 88d41367b8
commit 4aa75bb3bd
2 changed files with 154 additions and 7 deletions

View file

@ -61,6 +61,7 @@
#include "util/debug.h" #include "util/debug.h"
#include "util/mesa-sha1.h" #include "util/mesa-sha1.h"
#include "util/timespec.h" #include "util/timespec.h"
#include "util/u_atomic.h"
#include "compiler/glsl_types.h" #include "compiler/glsl_types.h"
#include "util/xmlpool.h" #include "util/xmlpool.h"
@ -74,6 +75,9 @@ radv_timeline_add_point_locked(struct radv_device *device,
struct radv_timeline *timeline, struct radv_timeline *timeline,
uint64_t p); uint64_t p);
static void
radv_timeline_trigger_waiters_locked(struct radv_timeline *timeline,
struct list_head *processing_list);
static static
void radv_destroy_semaphore_part(struct radv_device *device, void radv_destroy_semaphore_part(struct radv_device *device,
@ -1820,12 +1824,17 @@ radv_queue_init(struct radv_device *device, struct radv_queue *queue,
if (!queue->hw_ctx) if (!queue->hw_ctx)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
list_inithead(&queue->pending_submissions);
pthread_mutex_init(&queue->pending_mutex, NULL);
return VK_SUCCESS; return VK_SUCCESS;
} }
static void static void
radv_queue_finish(struct radv_queue *queue) radv_queue_finish(struct radv_queue *queue)
{ {
pthread_mutex_destroy(&queue->pending_mutex);
if (queue->hw_ctx) if (queue->hw_ctx)
queue->device->ws->ctx_destroy(queue->hw_ctx); queue->device->ws->ctx_destroy(queue->hw_ctx);
@ -3592,7 +3601,8 @@ radv_finalize_timelines(struct radv_device *device,
const uint64_t *wait_values, const uint64_t *wait_values,
uint32_t num_signal_sems, uint32_t num_signal_sems,
struct radv_semaphore_part **signal_sems, struct radv_semaphore_part **signal_sems,
const uint64_t *signal_values) const uint64_t *signal_values,
struct list_head *processing_list)
{ {
for (uint32_t i = 0; i < num_wait_sems; ++i) { for (uint32_t i = 0; i < num_wait_sems; ++i) {
if (wait_sems[i] && wait_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) { if (wait_sems[i] && wait_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) {
@ -3614,10 +3624,10 @@ radv_finalize_timelines(struct radv_device *device,
MAX2(signal_sems[i]->timeline.highest_submitted, point->value); MAX2(signal_sems[i]->timeline.highest_submitted, point->value);
point->wait_count--; point->wait_count--;
} }
radv_timeline_trigger_waiters_locked(&signal_sems[i]->timeline, processing_list);
pthread_mutex_unlock(&signal_sems[i]->timeline.mutex); pthread_mutex_unlock(&signal_sems[i]->timeline.mutex);
} }
} }
pthread_cond_broadcast(&device->timeline_cond);
} }
static void static void
@ -3720,6 +3730,12 @@ struct radv_deferred_queue_submission {
struct radv_semaphore_part *temporary_semaphore_parts; struct radv_semaphore_part *temporary_semaphore_parts;
uint32_t temporary_semaphore_part_count; uint32_t temporary_semaphore_part_count;
struct list_head queue_pending_list;
uint32_t submission_wait_count;
struct radv_timeline_waiter *wait_nodes;
struct list_head processing_list;
}; };
struct radv_queue_submission { struct radv_queue_submission {
@ -3768,6 +3784,7 @@ radv_create_deferred_submission(struct radv_queue *queue,
size += submission->signal_semaphore_count * sizeof(struct radv_semaphore_part *); size += submission->signal_semaphore_count * sizeof(struct radv_semaphore_part *);
size += submission->wait_value_count * sizeof(uint64_t); size += submission->wait_value_count * sizeof(uint64_t);
size += submission->signal_value_count * sizeof(uint64_t); size += submission->signal_value_count * sizeof(uint64_t);
size += submission->wait_semaphore_count * sizeof(struct radv_timeline_waiter);
deferred = calloc(1, size); deferred = calloc(1, size);
if (!deferred) if (!deferred)
@ -3830,12 +3847,76 @@ radv_create_deferred_submission(struct radv_queue *queue,
deferred->signal_values = deferred->wait_values + submission->wait_value_count; deferred->signal_values = deferred->wait_values + submission->wait_value_count;
memcpy(deferred->signal_values, submission->signal_values, submission->signal_value_count * sizeof(uint64_t)); memcpy(deferred->signal_values, submission->signal_values, submission->signal_value_count * sizeof(uint64_t));
deferred->wait_nodes = (void*)(deferred->signal_values + submission->signal_value_count);
/* This is the worst case. radv_queue_enqueue_submission will adjust it further, but this
* ensures the submission is not accidentally triggered early when adding wait timelines. */
deferred->submission_wait_count = 1 + submission->wait_semaphore_count;
*out = deferred; *out = deferred;
return VK_SUCCESS; return VK_SUCCESS;
} }
static void
radv_queue_enqueue_submission(struct radv_deferred_queue_submission *submission,
struct list_head *processing_list)
{
uint32_t wait_cnt = 0;
struct radv_timeline_waiter *waiter = submission->wait_nodes;
for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) {
if (submission->wait_semaphores[i]->kind == RADV_SEMAPHORE_TIMELINE) {
pthread_mutex_lock(&submission->wait_semaphores[i]->timeline.mutex);
if (submission->wait_semaphores[i]->timeline.highest_submitted < submission->wait_values[i]) {
++wait_cnt;
waiter->value = submission->wait_values[i];
waiter->submission = submission;
list_addtail(&waiter->list, &submission->wait_semaphores[i]->timeline.waiters);
++waiter;
}
pthread_mutex_unlock(&submission->wait_semaphores[i]->timeline.mutex);
}
}
pthread_mutex_lock(&submission->queue->pending_mutex);
bool is_first = list_is_empty(&submission->queue->pending_submissions);
list_addtail(&submission->queue_pending_list, &submission->queue->pending_submissions);
pthread_mutex_unlock(&submission->queue->pending_mutex);
/* If there is already a submission in the queue, that will decrement the counter by 1 when
* submitted, but if the queue was empty, we decrement ourselves as there is no previous
* submission. */
uint32_t decrement = submission->wait_semaphore_count - wait_cnt + (is_first ? 1 : 0);
if (__atomic_sub_fetch(&submission->submission_wait_count, decrement, __ATOMIC_ACQ_REL) == 0) {
list_addtail(&submission->processing_list, processing_list);
}
}
static void
radv_queue_submission_update_queue(struct radv_deferred_queue_submission *submission,
struct list_head *processing_list)
{
pthread_mutex_lock(&submission->queue->pending_mutex);
list_del(&submission->queue_pending_list);
/* trigger the next submission in the queue. */
if (!list_is_empty(&submission->queue->pending_submissions)) {
struct radv_deferred_queue_submission *next_submission =
list_first_entry(&submission->queue->pending_submissions,
struct radv_deferred_queue_submission,
queue_pending_list);
if (p_atomic_dec_zero(&next_submission->submission_wait_count)) {
list_addtail(&next_submission->processing_list, processing_list);
}
}
pthread_mutex_unlock(&submission->queue->pending_mutex);
pthread_cond_broadcast(&submission->queue->device->timeline_cond);
}
static VkResult static VkResult
radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission) radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission,
struct list_head *processing_list)
{ {
RADV_FROM_HANDLE(radv_fence, fence, submission->fence); RADV_FROM_HANDLE(radv_fence, fence, submission->fence);
struct radv_queue *queue = submission->queue; struct radv_queue *queue = submission->queue;
@ -3957,7 +4038,12 @@ success:
submission->wait_values, submission->wait_values,
submission->signal_semaphore_count, submission->signal_semaphore_count,
submission->signal_semaphores, submission->signal_semaphores,
submission->signal_values); submission->signal_values,
processing_list);
/* Has to happen after timeline finalization to make sure the
* condition variable is only triggered when timelines and queue have
* been updated. */
radv_queue_submission_update_queue(submission, processing_list);
radv_free_sem_info(&sem_info); radv_free_sem_info(&sem_info);
free(submission); free(submission);
return VK_SUCCESS; return VK_SUCCESS;
@ -3970,6 +4056,21 @@ fail:
return VK_ERROR_DEVICE_LOST; return VK_ERROR_DEVICE_LOST;
} }
/*
 * Execute deferred submissions from processing_list until it is empty.
 *
 * Each entry is removed from the list and handed to
 * radv_queue_submit_deferred, which receives the same list and may append
 * further submissions to it; those are picked up by later iterations.
 *
 * Returns VK_SUCCESS once the list has been drained. If a submission fails,
 * its result is returned immediately and any entries still on the list are
 * left untouched.
 */
static VkResult
radv_process_submissions(struct list_head *processing_list)
{
	for (;;) {
		if (list_is_empty(processing_list))
			return VK_SUCCESS;

		struct radv_deferred_queue_submission *next =
			list_first_entry(processing_list, struct radv_deferred_queue_submission, processing_list);
		list_del(&next->processing_list);

		VkResult status = radv_queue_submit_deferred(next, processing_list);
		if (status != VK_SUCCESS)
			return status;
	}
}
static VkResult radv_queue_submit(struct radv_queue *queue, static VkResult radv_queue_submit(struct radv_queue *queue,
const struct radv_queue_submission *submission) const struct radv_queue_submission *submission)
{ {
@ -3979,7 +4080,11 @@ static VkResult radv_queue_submit(struct radv_queue *queue,
if (result != VK_SUCCESS) if (result != VK_SUCCESS)
return result; return result;
return radv_queue_submit_deferred(deferred); struct list_head processing_list;
list_inithead(&processing_list);
radv_queue_enqueue_submission(deferred, &processing_list);
return radv_process_submissions(&processing_list);
} }
/* Signals fence as soon as all the work currently put on queue is done. */ /* Signals fence as soon as all the work currently put on queue is done. */
@ -4063,6 +4168,12 @@ VkResult radv_QueueWaitIdle(
{ {
RADV_FROM_HANDLE(radv_queue, queue, _queue); RADV_FROM_HANDLE(radv_queue, queue, _queue);
pthread_mutex_lock(&queue->pending_mutex);
while (!list_is_empty(&queue->pending_submissions)) {
pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex);
}
pthread_mutex_unlock(&queue->pending_mutex);
queue->device->ws->ctx_wait_idle(queue->hw_ctx, queue->device->ws->ctx_wait_idle(queue->hw_ctx,
radv_queue_family_to_ring(queue->queue_family_index), radv_queue_family_to_ring(queue->queue_family_index),
queue->queue_idx); queue->queue_idx);
@ -4971,6 +5082,7 @@ radv_create_timeline(struct radv_timeline *timeline, uint64_t value)
timeline->highest_submitted = value; timeline->highest_submitted = value;
list_inithead(&timeline->points); list_inithead(&timeline->points);
list_inithead(&timeline->free_points); list_inithead(&timeline->free_points);
list_inithead(&timeline->waiters);
pthread_mutex_init(&timeline->mutex, NULL); pthread_mutex_init(&timeline->mutex, NULL);
} }
@ -5106,6 +5218,22 @@ radv_timeline_wait_locked(struct radv_device *device,
return success ? VK_SUCCESS : VK_TIMEOUT; return success ? VK_SUCCESS : VK_TIMEOUT;
} }
/*
 * Release every waiter on a timeline whose awaited value is now covered by
 * timeline->highest_submitted.
 *
 * Expected to be called with timeline->mutex held (the visible callers
 * unlock it right after the call). Each released waiter is removed from the
 * timeline's waiter list and its submission's submission_wait_count is
 * decremented; a submission whose count reaches zero is appended to
 * processing_list for the caller to execute.
 */
static void
radv_timeline_trigger_waiters_locked(struct radv_timeline *timeline,
                                     struct list_head *processing_list)
{
	list_for_each_entry_safe(struct radv_timeline_waiter, waiter,
	                         &timeline->waiters, list) {
		if (waiter->value > timeline->highest_submitted)
			continue;

		/* The waiter node is stored inside the submission's own
		 * allocation. Unlink it and fetch the owner BEFORE dropping
		 * the wait count: once the count has been decremented another
		 * thread may drop the final reference, run the submission and
		 * free it, after which touching the node would be a
		 * use-after-free. */
		struct radv_deferred_queue_submission *submission = waiter->submission;
		list_del(&waiter->list);

		if (p_atomic_dec_zero(&submission->submission_wait_count)) {
			list_addtail(&submission->processing_list, processing_list);
		}
	}
}
static static
void radv_destroy_semaphore_part(struct radv_device *device, void radv_destroy_semaphore_part(struct radv_device *device,
struct radv_semaphore_part *part) struct radv_semaphore_part *part)
@ -5288,8 +5416,13 @@ radv_SignalSemaphoreKHR(VkDevice _device,
radv_timeline_gc_locked(device, &part->timeline); radv_timeline_gc_locked(device, &part->timeline);
part->timeline.highest_submitted = MAX2(part->timeline.highest_submitted, pSignalInfo->value); part->timeline.highest_submitted = MAX2(part->timeline.highest_submitted, pSignalInfo->value);
part->timeline.highest_signaled = MAX2(part->timeline.highest_signaled, pSignalInfo->value); part->timeline.highest_signaled = MAX2(part->timeline.highest_signaled, pSignalInfo->value);
struct list_head processing_list;
list_inithead(&processing_list);
radv_timeline_trigger_waiters_locked(&part->timeline, &processing_list);
pthread_mutex_unlock(&part->timeline.mutex); pthread_mutex_unlock(&part->timeline.mutex);
break;
return radv_process_submissions(&processing_list);
} }
case RADV_SEMAPHORE_NONE: case RADV_SEMAPHORE_NONE:
case RADV_SEMAPHORE_SYNCOBJ: case RADV_SEMAPHORE_SYNCOBJ:

View file

@ -722,6 +722,9 @@ struct radv_queue {
struct radeon_cmdbuf *initial_preamble_cs; struct radeon_cmdbuf *initial_preamble_cs;
struct radeon_cmdbuf *initial_full_flush_preamble_cs; struct radeon_cmdbuf *initial_full_flush_preamble_cs;
struct radeon_cmdbuf *continue_preamble_cs; struct radeon_cmdbuf *continue_preamble_cs;
struct list_head pending_submissions;
pthread_mutex_t pending_mutex;
}; };
struct radv_bo_list { struct radv_bo_list {
@ -2167,7 +2170,6 @@ struct radv_query_pool {
uint32_t pipeline_stats_mask; uint32_t pipeline_stats_mask;
}; };
typedef enum { typedef enum {
RADV_SEMAPHORE_NONE, RADV_SEMAPHORE_NONE,
RADV_SEMAPHORE_WINSYS, RADV_SEMAPHORE_WINSYS,
@ -2175,6 +2177,14 @@ typedef enum {
RADV_SEMAPHORE_TIMELINE, RADV_SEMAPHORE_TIMELINE,
} radv_semaphore_kind; } radv_semaphore_kind;
struct radv_deferred_queue_submission;
/* One pending wait of a deferred queue submission on a timeline: the
 * submission may not run until the timeline has had `value` submitted. */
struct radv_timeline_waiter {
/* Link in the waited-on timeline's `waiters` list. */
struct list_head list;
/* Submission that is blocked on this wait. */
struct radv_deferred_queue_submission *submission;
/* Timeline value that must be submitted before the wait is released. */
uint64_t value;
};
struct radv_timeline_point { struct radv_timeline_point {
struct list_head list; struct list_head list;
@ -2198,6 +2208,10 @@ struct radv_timeline {
/* Keep free points on hand so we do not have to recreate syncobjs all /* Keep free points on hand so we do not have to recreate syncobjs all
* the time. */ * the time. */
struct list_head free_points; struct list_head free_points;
/* Submissions that are deferred waiting for a specific value to be
* submitted. */
struct list_head waiters;
}; };
struct radv_semaphore_part { struct radv_semaphore_part {