diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index aae730b0cbd..939e828d475 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -266,8 +266,15 @@ ensure_exec_obj_space(struct iris_batch *batch, uint32_t count)
 static void
 add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable)
 {
+   uint64_t extra_flags = 0;
+
    assert(batch->exec_array_size > batch->exec_count);
 
+   if (writable)
+      extra_flags |= EXEC_OBJECT_WRITE;
+   if (!iris_bo_is_external(bo))
+      extra_flags |= EXEC_OBJECT_ASYNC;
+
    iris_bo_reference(bo);
 
    batch->exec_bos[batch->exec_count] = bo;
@@ -276,7 +283,7 @@ add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable)
       (struct drm_i915_gem_exec_object2) {
          .handle = bo->gem_handle,
          .offset = bo->address,
-         .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
+         .flags = bo->kflags | extra_flags,
       };
 
    bo->index = batch->exec_count;
@@ -346,12 +353,8 @@ iris_use_pinned_bo(struct iris_batch *batch,
        * we want to avoid synchronizing in this case.
        */
       if (other_entry &&
-          ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
+          ((other_entry->flags & EXEC_OBJECT_WRITE) || writable))
          iris_batch_flush(batch->other_batches[b]);
-         iris_batch_add_syncobj(batch,
-                                batch->other_batches[b]->last_fence->syncobj,
-                                I915_EXEC_FENCE_WAIT);
-      }
    }
 }
 
@@ -627,6 +630,123 @@ iris_batch_check_for_reset(struct iris_batch *batch)
    return status;
 }
 
+static void
+move_syncobj_to_batch(struct iris_batch *batch,
+                      struct iris_syncobj **p_syncobj,
+                      unsigned flags)
+{
+   struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
+
+   if (!*p_syncobj)
+      return;
+
+   bool found = false;
+   util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s) {
+      if (*p_syncobj == *s) {
+         found = true;
+         break;
+      }
+   }
+
+   if (!found)
+      iris_batch_add_syncobj(batch, *p_syncobj, flags);
+
+   iris_syncobj_reference(bufmgr, p_syncobj, NULL);
+}
+
+static void
+update_bo_syncobjs(struct iris_batch *batch, struct iris_bo *bo, bool write)
+{
+   struct iris_screen *screen = batch->screen;
+   struct iris_bufmgr *bufmgr = screen->bufmgr;
+
+   /* Make sure bo->deps is big enough */
+   if (screen->id >= bo->deps_size) {
+      int new_size = screen->id + 1;
+      bo->deps = realloc(bo->deps, new_size * sizeof(bo->deps[0]));
+      memset(&bo->deps[bo->deps_size], 0,
+             sizeof(bo->deps[0]) * (new_size - bo->deps_size));
+
+      bo->deps_size = new_size;
+   }
+
+   /* When it comes to execbuf submission of non-shared buffers, we only need
+    * to care about the reads and writes done by the other batches of our own
+    * screen, and we also don't care about the reads and writes done by our
+    * own batch, although we need to track them. Note that other places in
+    * our code may need to care about all the operations done by every batch
+    * on every screen.
+    */
+   struct iris_bo_screen_deps *deps = &bo->deps[screen->id];
+   int batch_idx = batch->name;
+
+#if IRIS_BATCH_COUNT == 2
+   /* Due to the above, we exploit the fact that IRIS_BATCH_COUNT is actually
+    * 2, which means there's only one other batch we need to care about.
+    */
+   int other_batch_idx = 1 - batch_idx;
+#else
+   /* For IRIS_BATCH_COUNT == 3 we can do:
+    *   int other_batch_idxs[IRIS_BATCH_COUNT - 1] = {
+    *      (batch_idx ^ 1) & 1,
+    *      (batch_idx ^ 2) & 2,
+    *   };
+    * For IRIS_BATCH_COUNT == 4 we can do:
+    *   int other_batch_idxs[IRIS_BATCH_COUNT - 1] = {
+    *      (batch_idx + 1) & 3,
+    *      (batch_idx + 2) & 3,
+    *      (batch_idx + 3) & 3,
+    *   };
+    */
+#error "Implement me."
+#endif
+
+   /* If it is being written to by others, wait on it. */
+   if (deps->write_syncobjs[other_batch_idx])
+      move_syncobj_to_batch(batch, &deps->write_syncobjs[other_batch_idx],
+                            I915_EXEC_FENCE_WAIT);
+
+   struct iris_syncobj *batch_syncobj = iris_batch_get_signal_syncobj(batch);
+
+   if (write) {
+      /* If we're writing to it, set our batch's syncobj as write_syncobj so
+       * others can wait on us. Also wait on every reader we care about before
+       * writing.
+       */
+      iris_syncobj_reference(bufmgr, &deps->write_syncobjs[batch_idx],
+                             batch_syncobj);
+
+      move_syncobj_to_batch(batch, &deps->read_syncobjs[other_batch_idx],
+                            I915_EXEC_FENCE_WAIT);
+
+   } else {
+      /* If we're only reading, replace the read syncobj for our batch index. */
+      iris_syncobj_reference(bufmgr, &deps->read_syncobjs[batch_idx],
+                             batch_syncobj);
+   }
+}
+
+static void
+update_batch_syncobjs(struct iris_batch *batch)
+{
+   struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
+   simple_mtx_t *bo_deps_lock = iris_bufmgr_get_bo_deps_lock(bufmgr);
+
+   simple_mtx_lock(bo_deps_lock);
+
+   for (int i = 0; i < batch->exec_count; i++) {
+      struct iris_bo *bo = batch->exec_bos[i];
+      struct drm_i915_gem_exec_object2 *exec_obj = &batch->validation_list[i];
+      bool write = exec_obj->flags & EXEC_OBJECT_WRITE;
+
+      if (bo == batch->screen->workaround_bo)
+         continue;
+
+      update_bo_syncobjs(batch, bo, write);
+   }
+   simple_mtx_unlock(bo_deps_lock);
+}
+
 /**
  * Submit the batch to the GPU via execbuffer2.
  */
@@ -711,6 +831,8 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
 
    iris_finish_batch(batch);
 
+   update_batch_syncobjs(batch);
+
    if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) {
       const char *basefile = strstr(file, "iris/");
       if (basefile)
diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h
index cd7de5221c9..68617d0a6be 100644
--- a/src/gallium/drivers/iris/iris_batch.h
+++ b/src/gallium/drivers/iris/iris_batch.h
@@ -56,8 +56,6 @@ enum iris_batch_name {
    IRIS_BATCH_COMPUTE,
 };
 
-#define IRIS_BATCH_COUNT 2
-
 struct iris_batch {
    struct iris_context *ice;
    struct iris_screen *screen;
diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c
index 52a71259283..08f88032bf0 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.c
+++ b/src/gallium/drivers/iris/iris_bufmgr.c
@@ -181,6 +181,7 @@ struct iris_bufmgr {
    int fd;
 
    simple_mtx_t lock;
+   simple_mtx_t bo_deps_lock;
 
    /** Array of lists of cached gem objects of power-of-two sizes */
    struct bo_cache_bucket cache_bucket[14 * 4];
@@ -381,20 +382,100 @@ vma_free(struct iris_bufmgr *bufmgr,
    util_vma_heap_free(&bufmgr->vma_allocator[memzone], address, size);
 }
 
-int
-iris_bo_busy(struct iris_bo *bo)
+static bool
+iris_bo_busy_gem(struct iris_bo *bo)
 {
    struct iris_bufmgr *bufmgr = bo->bufmgr;
    struct drm_i915_gem_busy busy = { .handle = bo->gem_handle };
 
    int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
    if (ret == 0) {
-      bo->idle = !busy.busy;
       return busy.busy;
    }
    return false;
 }
 
+/* A timeout of 0 just checks for busyness. */
+static int
+iris_bo_wait_syncobj(struct iris_bo *bo, int64_t timeout_ns)
+{
+   int ret = 0;
+   struct iris_bufmgr *bufmgr = bo->bufmgr;
+
+   /* If we know it's idle, don't bother with the kernel round trip */
+   if (bo->idle)
+      return 0;
+
+   simple_mtx_lock(&bufmgr->bo_deps_lock);
+
+   uint32_t handles[bo->deps_size * IRIS_BATCH_COUNT * 2];
+   int handle_count = 0;
+
+   for (int d = 0; d < bo->deps_size; d++) {
+      for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
+         struct iris_syncobj *r = bo->deps[d].read_syncobjs[b];
+         struct iris_syncobj *w = bo->deps[d].write_syncobjs[b];
+         if (r)
+            handles[handle_count++] = r->handle;
+         if (w)
+            handles[handle_count++] = w->handle;
+      }
+   }
+
+   if (handle_count == 0)
+      goto out;
+
+   /* Unlike the gem wait, negative values are not infinite here. */
+   int64_t timeout_abs = os_time_get_absolute_timeout(timeout_ns);
+   if (timeout_abs < 0)
+      timeout_abs = INT64_MAX;
+
+   struct drm_syncobj_wait args = {
+      .handles = (uintptr_t) handles,
+      .timeout_nsec = timeout_abs,
+      .count_handles = handle_count,
+      .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
+   };
+
+   ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
+   if (ret != 0) {
+      ret = -errno;
+      goto out;
+   }
+
+   /* We just waited on everything, so clear all the deps. */
+   for (int d = 0; d < bo->deps_size; d++) {
+      for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
+         iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL);
+         iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL);
+      }
+   }
+
+out:
+   simple_mtx_unlock(&bufmgr->bo_deps_lock);
+   return ret;
+}
+
+static bool
+iris_bo_busy_syncobj(struct iris_bo *bo)
+{
+   return iris_bo_wait_syncobj(bo, 0) == -ETIME;
+}
+
+bool
+iris_bo_busy(struct iris_bo *bo)
+{
+   bool busy;
+   if (iris_bo_is_external(bo))
+      busy = iris_bo_busy_gem(bo);
+   else
+      busy = iris_bo_busy_syncobj(bo);
+
+   bo->idle = !busy;
+
+   return busy;
+}
+
 int
 iris_bo_madvise(struct iris_bo *bo, int state)
 {
@@ -865,6 +946,14 @@ bo_close(struct iris_bo *bo)
    /* Return the VMA for reuse */
    vma_free(bo->bufmgr, bo->address, bo->size);
 
+   for (int d = 0; d < bo->deps_size; d++) {
+      for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
+         iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL);
+         iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL);
+      }
+   }
+   free(bo->deps);
+
    free(bo);
 }
 
@@ -1149,6 +1238,22 @@ iris_bo_wait_rendering(struct iris_bo *bo)
    iris_bo_wait(bo, -1);
 }
 
+static int
+iris_bo_wait_gem(struct iris_bo *bo, int64_t timeout_ns)
+{
+   struct iris_bufmgr *bufmgr = bo->bufmgr;
+   struct drm_i915_gem_wait wait = {
+      .bo_handle = bo->gem_handle,
+      .timeout_ns = timeout_ns,
+   };
+
+   int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+   if (ret != 0)
+      return -errno;
+
+   return 0;
+}
+
 /**
  * Waits on a BO for the given amount of time.
  *
@@ -1179,17 +1284,13 @@ iris_bo_wait_rendering(struct iris_bo *bo)
 int
 iris_bo_wait(struct iris_bo *bo, int64_t timeout_ns)
 {
-   struct iris_bufmgr *bufmgr = bo->bufmgr;
+   int ret;
 
-   /* If we know it's idle, don't bother with the kernel round trip */
-   if (bo->idle && !iris_bo_is_external(bo))
-      return 0;
+   if (iris_bo_is_external(bo))
+      ret = iris_bo_wait_gem(bo, timeout_ns);
+   else
+      ret = iris_bo_wait_syncobj(bo, timeout_ns);
 
-   struct drm_i915_gem_wait wait = {
-      .bo_handle = bo->gem_handle,
-      .timeout_ns = timeout_ns,
-   };
-   int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
    if (ret != 0)
       return -errno;
 
@@ -1208,6 +1309,7 @@ iris_bufmgr_destroy(struct iris_bufmgr *bufmgr)
    bufmgr->aux_map_ctx = NULL;
 
    simple_mtx_destroy(&bufmgr->lock);
+   simple_mtx_destroy(&bufmgr->bo_deps_lock);
 
    /* Free any cached buffer objects we were going to reuse */
    for (int i = 0; i < bufmgr->num_buckets; i++) {
@@ -1786,6 +1888,7 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
    p_atomic_set(&bufmgr->refcount, 1);
 
    simple_mtx_init(&bufmgr->lock, mtx_plain);
+   simple_mtx_init(&bufmgr->bo_deps_lock, mtx_plain);
 
    list_inithead(&bufmgr->zombie_list);
 
@@ -1924,3 +2027,9 @@ iris_bufmgr_get_aux_map_context(struct iris_bufmgr *bufmgr)
 {
    return bufmgr->aux_map_ctx;
 }
+
+simple_mtx_t *
+iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr)
+{
+   return &bufmgr->bo_deps_lock;
+}
diff --git a/src/gallium/drivers/iris/iris_bufmgr.h b/src/gallium/drivers/iris/iris_bufmgr.h
index 226168dd28c..3c2da0e06a6 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.h
+++ b/src/gallium/drivers/iris/iris_bufmgr.h
@@ -31,13 +31,15 @@
 #include "c11/threads.h"
 #include "util/macros.h"
 #include "util/u_atomic.h"
+#include "util/u_dynarray.h"
 #include "util/list.h"
+#include "util/simple_mtx.h"
 #include "pipe/p_defines.h"
 
-struct iris_batch;
 struct intel_device_info;
 struct pipe_debug_callback;
 struct isl_surf;
+struct iris_syncobj;
 
 /**
  * Memory zones. When allocating a buffer, you can request that it is
@@ -129,6 +131,13 @@ enum iris_mmap_mode {
    IRIS_MMAP_WB, /**< Write-back mapping with CPU caches enabled */
 };
 
+#define IRIS_BATCH_COUNT 2
+
+struct iris_bo_screen_deps {
+   struct iris_syncobj *write_syncobjs[IRIS_BATCH_COUNT];
+   struct iris_syncobj *read_syncobjs[IRIS_BATCH_COUNT];
+};
+
 struct iris_bo {
    /**
     * Size in bytes of the buffer object.
@@ -213,6 +222,10 @@ struct iris_bo {
 
    uint64_t last_seqnos[NUM_IRIS_DOMAINS] __attribute__ ((aligned (8)));
 
+   /** Up to one per screen, may need realloc. */
+   struct iris_bo_screen_deps *deps;
+   int deps_size;
+
    /**
    * Boolean of whether the GPU is definitely not accessing the buffer.
    *
@@ -346,10 +359,10 @@ iris_bo_is_external(const struct iris_bo *bo)
 void iris_bo_mark_exported(struct iris_bo *bo);
 
 /**
- * Returns 1 if mapping the buffer for write could cause the process
+ * Returns true if mapping the buffer for write could cause the process
  * to block, due to the object being active in the GPU.
  */
-int iris_bo_busy(struct iris_bo *bo);
+bool iris_bo_busy(struct iris_bo *bo);
 
 /**
  * Specify the volatility of the buffer.
@@ -451,4 +464,6 @@ enum iris_memory_zone iris_memzone_for_address(uint64_t address);
 
 int iris_bufmgr_create_screen_id(struct iris_bufmgr *bufmgr);
 
+simple_mtx_t *iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr);
+
 #endif /* IRIS_BUFMGR_H */
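Note on the #error "Implement me." branch in update_bo_syncobjs(): only the IRIS_BATCH_COUNT == 2 case is implemented, where the single other batch index is simply 1 - batch_idx. As a standalone illustration (not part of the patch; the IRIS_BATCH_COUNT value below is assumed purely for the example), the per-count formulas suggested in that comment generalize to plain modular arithmetic that a future multi-batch implementation could loop over:

/* Standalone sketch: enumerate the "other" batch indices for an arbitrary
 * IRIS_BATCH_COUNT with modular arithmetic.  For a count of 2 this reduces
 * to the 1 - batch_idx trick used in the patch.
 */
#include <stdio.h>

#define IRIS_BATCH_COUNT 4   /* assumed value, for illustration only */

int main(void)
{
   for (int batch_idx = 0; batch_idx < IRIS_BATCH_COUNT; batch_idx++) {
      printf("batch %d -> other batches:", batch_idx);
      for (int i = 1; i < IRIS_BATCH_COUNT; i++)
         printf(" %d", (batch_idx + i) % IRIS_BATCH_COUNT);
      printf("\n");
   }
   return 0;
}

For IRIS_BATCH_COUNT == 4 this yields the same index sets as the (batch_idx + n) & 3 variant in the comment, since x & 3 equals x % 4 for non-negative x.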