/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* @file iris_batch.c
*
* Batchbuffer and command submission module.
*
* Every API draw call results in a number of GPU commands, which we
* collect into a "batch buffer". Typically, many draw calls are grouped
* into a single batch to amortize command submission overhead.
*
* We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl.
* One critical piece of data is the "validation list", which contains a
* list of the buffer objects (BOs) which the GPU commands in the batch need.
* The kernel will make sure these are resident and pinned at the correct
* virtual memory address before executing our batch. If a BO is not in
* the validation list, it effectively does not exist, so take care.
*/
#include "iris_batch.h"
#include "iris_bufmgr.h"
#include "iris_context.h"
#include "iris_fence.h"
#include "drm-uapi/i915_drm.h"
#include "common/intel_aux_map.h"
#include "intel/common/intel_gem.h"
#include "util/hash_table.h"
#include "util/set.h"
#include "util/u_upload_mgr.h"
#include "main/macros.h"
#include <errno.h>
#include <xf86drm.h>
#if HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
#else
#define VG(x)
#endif
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
static void
iris_batch_reset(struct iris_batch *batch);
static unsigned
num_fences(struct iris_batch *batch)
{
return util_dynarray_num_elements(&batch->exec_fences,
struct drm_i915_gem_exec_fence);
}
/**
* Debugging code to dump the fence list, used by INTEL_DEBUG=submit.
*/
static void
dump_fence_list(struct iris_batch *batch)
{
fprintf(stderr, "Fence list (length %u): ", num_fences(batch));
util_dynarray_foreach(&batch->exec_fences,
struct drm_i915_gem_exec_fence, f) {
fprintf(stderr, "%s%u%s ",
(f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "",
f->handle,
(f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : "");
}
fprintf(stderr, "\n");
}
/**
* Debugging code to dump the validation list, used by INTEL_DEBUG=submit.
*/
static void
dump_validation_list(struct iris_batch *batch,
struct drm_i915_gem_exec_object2 *validation_list)
{
fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
for (int i = 0; i < batch->exec_count; i++) {
uint64_t flags = validation_list[i].flags;
assert(validation_list[i].handle == batch->exec_bos[i]->gem_handle);
fprintf(stderr, "[%2d]: %2d %-14s @ 0x%"PRIx64" (%"PRIu64"B)\t %2d refs %s\n",
i,
validation_list[i].handle,
batch->exec_bos[i]->name,
(uint64_t)validation_list[i].offset,
batch->exec_bos[i]->size,
batch->exec_bos[i]->refcount,
(flags & EXEC_OBJECT_WRITE) ? " (write)" : "");
}
}
/**
* Return BO information to the batch decoder (for debugging).
*/
static struct intel_batch_decode_bo
decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
{
struct iris_batch *batch = v_batch;
assert(ppgtt);
for (int i = 0; i < batch->exec_count; i++) {
struct iris_bo *bo = batch->exec_bos[i];
/* The decoder zeroes out the top 16 bits, so we need to as well */
uint64_t bo_address = bo->address & (~0ull >> 16);
if (address >= bo_address && address < bo_address + bo->size) {
return (struct intel_batch_decode_bo) {
.addr = bo_address,
.size = bo->size,
.map = iris_bo_map(batch->dbg, bo, MAP_READ),
};
}
}
return (struct intel_batch_decode_bo) { };
}
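/**
* Return the recorded size of the state at the given address, so the batch
* decoder (INTEL_DEBUG=bat) knows how much variable-length state data
* (e.g. viewports or binding tables) to decode instead of guessing.
*/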
static unsigned
decode_get_state_size(void *v_batch,
uint64_t address,
UNUSED uint64_t base_address)
{
struct iris_batch *batch = v_batch;
unsigned size = (uintptr_t)
_mesa_hash_table_u64_search(batch->state_sizes, address);
return size;
}
/**
* Decode the current batch.
*/
static void
decode_batch(struct iris_batch *batch)
{
void *map = iris_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ);
intel_print_batch(&batch->decoder, map, batch->primary_batch_size,
batch->exec_bos[0]->address, false);
}
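/**
* Initialize a batch: create its GEM context at the requested priority,
* set up fine-grained fence state, allocate the validation list storage,
* and (when INTEL_DEBUG is set) configure the batch decoder.
*/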
void
iris_init_batch(struct iris_context *ice,
enum iris_batch_name name,
int priority)
{
struct iris_batch *batch = &ice->batches[name];
struct iris_screen *screen = (void *) ice->ctx.screen;
batch->screen = screen;
batch->dbg = &ice->dbg;
batch->reset = &ice->reset;
batch->state_sizes = ice->state.sizes;
batch->name = name;
batch->ice = ice;
batch->contains_fence_signal = false;
batch->fine_fences.uploader =
u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM,
PIPE_USAGE_STAGING, 0);
iris_fine_fence_init(batch);
batch->hw_ctx_id = iris_create_hw_context(screen->bufmgr);
assert(batch->hw_ctx_id);
iris_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
util_dynarray_init(&batch->syncobjs, ralloc_context(NULL));
batch->exec_count = 0;
batch->exec_array_size = 128;
batch->exec_bos =
malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
batch->bos_written =
rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(batch->exec_array_size));
batch->cache.render = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
_mesa_key_pointer_equal);
memset(batch->other_batches, 0, sizeof(batch->other_batches));
for (int i = 0, j = 0; i < IRIS_BATCH_COUNT; i++) {
if (i != name)
batch->other_batches[j++] = &ice->batches[i];
}
if (INTEL_DEBUG) {
const unsigned decode_flags =
INTEL_BATCH_DECODE_FULL |
((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
INTEL_BATCH_DECODE_OFFSETS |
INTEL_BATCH_DECODE_FLOATS;
intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo,
stderr, decode_flags, NULL,
decode_get_bo, decode_get_state_size, batch);
batch->decoder.dynamic_base = IRIS_MEMZONE_DYNAMIC_START;
batch->decoder.instruction_base = IRIS_MEMZONE_SHADER_START;
batch->decoder.max_vbo_decoded_lines = 32;
}
iris_init_batch_measure(ice, batch);
iris_batch_reset(batch);
}
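/**
* Find a BO's index in the batch's validation list, or -1 if absent.
*
* bo->index is used as a cache; it may be stale when a BO is referenced by
* multiple batches, in which case we fall back to a linear search.
*/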
static int
find_exec_index(struct iris_batch *batch, struct iris_bo *bo)
{
unsigned index = READ_ONCE(bo->index);
if (index < batch->exec_count && batch->exec_bos[index] == bo)
return index;
/* May have been shared between multiple active batches */
for (index = 0; index < batch->exec_count; index++) {
if (batch->exec_bos[index] == bo)
return index;
}
return -1;
}
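/**
* Grow the exec_bos array and the bos_written bitset so there is room for
* at least 'count' more validation list entries.
*/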
static void
ensure_exec_obj_space(struct iris_batch *batch, uint32_t count)
{
while (batch->exec_count + count > batch->exec_array_size) {
unsigned old_size = batch->exec_array_size;
batch->exec_array_size *= 2;
batch->exec_bos =
realloc(batch->exec_bos,
batch->exec_array_size * sizeof(batch->exec_bos[0]));
batch->bos_written =
rerzalloc(NULL, batch->bos_written, BITSET_WORD,
BITSET_WORDS(old_size),
BITSET_WORDS(batch->exec_array_size));
}
}
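/**
* Append a BO to the end of the validation list, taking a reference,
* noting whether we intend to write it, and accounting for its size in
* our aperture estimate.
*/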
static void
add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable)
{
assert(batch->exec_array_size > batch->exec_count);
iris_bo_reference(bo);
batch->exec_bos[batch->exec_count] = bo;
if (writable)
BITSET_SET(batch->bos_written, batch->exec_count);
bo->index = batch->exec_count;
batch->exec_count++;
batch->aperture_space += bo->size;
}
/**
* Add a buffer to the current batch's validation list.
*
* You must call this on any BO you wish to use in this batch, to ensure
* that it's resident when the GPU commands execute.
*/
void
iris_use_pinned_bo(struct iris_batch *batch,
struct iris_bo *bo,
bool writable, enum iris_domain access)
{
assert(bo->real.kflags & EXEC_OBJECT_PINNED);
assert(bo != batch->bo);
/* Never mark the workaround BO with EXEC_OBJECT_WRITE. We don't care
* about the order of any writes to that buffer, and marking it writable
* would introduce data dependencies between multiple batches which share
* the buffer. It is added directly to the batch using add_bo_to_batch()
* during batch reset time.
*/
if (bo == batch->screen->workaround_bo)
return;
if (access < NUM_IRIS_DOMAINS) {
assert(batch->sync_region_depth);
iris_bo_bump_seqno(bo, batch->next_seqno, access);
}
int existing_index = find_exec_index(batch, bo);
if (existing_index != -1) {
/* The BO is already in the list; mark it writable */
if (writable)
BITSET_SET(batch->bos_written, existing_index);
return;
}
if (!batch->measure || bo != batch->measure->bo) {
/* This is the first time our batch has seen this BO. Before we use it,
* we may need to flush and synchronize with other batches.
*/
for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
struct iris_batch *other_batch = batch->other_batches[b];
int other_index = find_exec_index(other_batch, bo);
/* If the buffer is referenced by another batch, and either batch
* intends to write it, then flush the other batch and synchronize.
*
* Consider these cases:
*
* 1. They read, we read => No synchronization required.
* 2. They read, we write => Synchronize (they need the old value)
* 3. They write, we read => Synchronize (we need their new value)
* 4. They write, we write => Synchronize (order writes)
*
* The read/read case is very common, as multiple batches usually
* share a streaming state buffer or shader assembly buffer, and
* we want to avoid synchronizing in this case.
*/
if (other_index != -1 &&
(writable || BITSET_TEST(other_batch->bos_written, other_index)))
iris_batch_flush(other_batch);
}
}
ensure_exec_obj_space(batch, 1);
add_bo_to_batch(batch, bo, writable);
}
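/**
* Allocate and map a new batchbuffer BO and add it to the validation list.
* The BO is flagged with EXEC_OBJECT_CAPTURE so it is included in GPU
* error states.
*/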
static void
create_batch(struct iris_batch *batch)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
batch->bo = iris_bo_alloc(bufmgr, "command buffer",
BATCH_SZ + BATCH_RESERVED, 1,
IRIS_MEMZONE_OTHER, 0);
batch->bo->real.kflags |= EXEC_OBJECT_CAPTURE;
batch->map = iris_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
batch->map_next = batch->map;
ensure_exec_obj_space(batch, 1);
add_bo_to_batch(batch, batch->bo, false);
}
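/**
* If noop mode is enabled, emit MI_BATCH_BUFFER_END at the start of the
* batch so that nothing else in it will be executed.
*/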
static void
iris_batch_maybe_noop(struct iris_batch *batch)
{
/* We only insert the NOOP at the beginning of the batch. */
assert(iris_batch_bytes_used(batch) == 0);
if (batch->noop_enabled) {
/* Emit MI_BATCH_BUFFER_END to prevent any further commands from being
* executed.
*/
uint32_t *map = batch->map_next;
map[0] = (0xA << 23);
batch->map_next += 4;
}
}
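/**
* Reset a batch to an empty state: allocate a fresh batchbuffer BO, create
* a new end-of-batch syncobj, re-add the workaround BO, and reapply noop
* mode if it is enabled.
*/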
static void
iris_batch_reset(struct iris_batch *batch)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
iris_bo_unreference(batch->bo);
batch->primary_batch_size = 0;
batch->total_chained_batch_size = 0;
batch->contains_draw = false;
batch->contains_fence_signal = false;
batch->decoder.surface_base = batch->last_surface_base_address;
create_batch(batch);
assert(batch->bo->index == 0);
struct iris_syncobj *syncobj = iris_create_syncobj(bufmgr);
iris_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL);
iris_syncobj_reference(bufmgr, &syncobj, NULL);
assert(!batch->sync_region_depth);
iris_batch_sync_boundary(batch);
iris_batch_mark_reset_sync(batch);
/* Always add the workaround BO; it contains a driver identifier at the
* beginning, which is quite helpful when debugging error states.
*/
add_bo_to_batch(batch, screen->workaround_bo, false);
iris_batch_maybe_noop(batch);
}
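/**
* Free everything a batch owns: validation list references, fences and
* syncobjs, the batchbuffer BO, the GEM context, and the decoder state.
*/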
void
iris_batch_free(struct iris_batch *batch)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
for (int i = 0; i < batch->exec_count; i++) {
iris_bo_unreference(batch->exec_bos[i]);
}
free(batch->exec_bos);
ralloc_free(batch->bos_written);
ralloc_free(batch->exec_fences.mem_ctx);
pipe_resource_reference(&batch->fine_fences.ref.res, NULL);
util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s)
iris_syncobj_reference(bufmgr, s, NULL);
ralloc_free(batch->syncobjs.mem_ctx);
iris_fine_fence_reference(batch->screen, &batch->last_fence, NULL);
u_upload_destroy(batch->fine_fences.uploader);
iris_bo_unreference(batch->bo);
batch->bo = NULL;
batch->map = NULL;
batch->map_next = NULL;
iris_destroy_hw_context(bufmgr, batch->hw_ctx_id);
iris_destroy_batch_measure(batch->measure);
batch->measure = NULL;
_mesa_hash_table_destroy(batch->cache.render, NULL);
if (INTEL_DEBUG)
intel_batch_decode_ctx_finish(&batch->decoder);
}
/**
* If we've chained to a secondary batch, or are getting near to the end,
* then flush. This should only be called between draws.
*/
void
iris_batch_maybe_flush(struct iris_batch *batch, unsigned estimate)
{
if (batch->bo != batch->exec_bos[0] ||
iris_batch_bytes_used(batch) + estimate >= BATCH_SZ) {
iris_batch_flush(batch);
}
}
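/**
* Record how many bytes the current batchbuffer uses: primary_batch_size
* feeds execbuf's batch_len, and total_chained_batch_size accumulates the
* whole chain for the debug output printed at flush time.
*/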
static void
record_batch_sizes(struct iris_batch *batch)
{
unsigned batch_size = iris_batch_bytes_used(batch);
VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->map, batch_size));
if (batch->bo == batch->exec_bos[0])
batch->primary_batch_size = batch_size;
batch->total_chained_batch_size += batch_size;
}
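/**
* Chain to a new batchbuffer: record the old buffer's size, allocate a
* fresh BO, and emit MI_BATCH_BUFFER_START in the old buffer pointing at
* the new one.
*/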
void
iris_chain_to_new_batch(struct iris_batch *batch)
{
uint32_t *cmd = batch->map_next;
uint64_t *addr = batch->map_next + 4;
batch->map_next += 12;
record_batch_sizes(batch);
/* No longer held by batch->bo, still held by validation list */
iris_bo_unreference(batch->bo);
create_batch(batch);
/* Emit MI_BATCH_BUFFER_START to chain to another batch. */
*cmd = (0x31 << 23) | (1 << 8) | (3 - 2);
*addr = batch->bo->address;
}
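/**
* Add the aux map's buffers to the validation list so they are resident
* while the batch executes.
*/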
static void
add_aux_map_bos_to_batch(struct iris_batch *batch)
{
void *aux_map_ctx = iris_bufmgr_get_aux_map_context(batch->screen->bufmgr);
if (!aux_map_ctx)
return;
uint32_t count = intel_aux_map_get_num_buffers(aux_map_ctx);
ensure_exec_obj_space(batch, count);
intel_aux_map_fill_bos(aux_map_ctx,
(void**)&batch->exec_bos[batch->exec_count], count);
for (uint32_t i = 0; i < count; i++) {
struct iris_bo *bo = batch->exec_bos[batch->exec_count];
add_bo_to_batch(batch, bo, false);
}
}
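/**
* Create the end-of-batch fine-grained fence and remember it as
* batch->last_fence.
*/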
static void
finish_seqno(struct iris_batch *batch)
{
struct iris_fine_fence *sq = iris_fine_fence_new(batch, IRIS_FENCE_END);
if (!sq)
return;
iris_fine_fence_reference(batch->screen, &batch->last_fence, sq);
iris_fine_fence_reference(batch->screen, &sq, NULL);
}
/**
* Terminate a batch with MI_BATCH_BUFFER_END.
*/
static void
iris_finish_batch(struct iris_batch *batch)
{
const struct intel_device_info *devinfo = &batch->screen->devinfo;
if (devinfo->ver == 12 && batch->name == IRIS_BATCH_RENDER) {
/* We re-emit constants at the beginning of every batch as a hardware
* bug workaround, so invalidate indirect state pointers in order to
* save ourselves the overhead of restoring constants redundantly when
* the next render batch is executed.
*/
iris_emit_pipe_control_flush(batch, "ISP invalidate at batch end",
PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
PIPE_CONTROL_STALL_AT_SCOREBOARD |
PIPE_CONTROL_CS_STALL);
}
add_aux_map_bos_to_batch(batch);
finish_seqno(batch);
/* Emit MI_BATCH_BUFFER_END to finish our batch. */
uint32_t *map = batch->map_next;
map[0] = (0xA << 23);
batch->map_next += 4;
record_batch_sizes(batch);
}
/**
* Replace our current GEM context with a new one (in case it got banned).
*/
static bool
replace_hw_ctx(struct iris_batch *batch)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
uint32_t new_ctx = iris_clone_hw_context(bufmgr, batch->hw_ctx_id);
if (!new_ctx)
return false;
iris_destroy_hw_context(bufmgr, batch->hw_ctx_id);
batch->hw_ctx_id = new_ctx;
/* Notify the context that state must be re-initialized. */
iris_lost_context_state(batch);
return true;
}
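/**
* Query the kernel's reset statistics for our GEM context and report
* whether a GPU reset affected it (guilty or innocent); if so, switch to
* a fresh context.
*/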
enum pipe_reset_status
iris_batch_check_for_reset(struct iris_batch *batch)
{
struct iris_screen *screen = batch->screen;
enum pipe_reset_status status = PIPE_NO_RESET;
struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id };
if (intel_ioctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno));
if (stats.batch_active != 0) {
/* A reset was observed while a batch from this hardware context was
* executing. Assume that this context was at fault.
*/
status = PIPE_GUILTY_CONTEXT_RESET;
} else if (stats.batch_pending != 0) {
/* A reset was observed while a batch from this context was in progress,
* but the batch was not executing. In this case, assume that the
* context was not at fault.
*/
status = PIPE_INNOCENT_CONTEXT_RESET;
}
if (status != PIPE_NO_RESET) {
/* Our context is likely banned, or at least in an unknown state.
* Throw it away and start with a fresh context. Ideally this may
* catch the problem before our next execbuf fails with -EIO.
*/
replace_hw_ctx(batch);
}
return status;
}
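/**
* Move a syncobj reference into the batch: add it to the batch's syncobj
* list with the given flags (unless it is already there) and drop the
* caller's reference.
*/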
static void
move_syncobj_to_batch(struct iris_batch *batch,
struct iris_syncobj **p_syncobj,
unsigned flags)
{
struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
if (!*p_syncobj)
return;
bool found = false;
util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s) {
if (*p_syncobj == *s) {
found = true;
break;
}
}
if (!found)
iris_batch_add_syncobj(batch, *p_syncobj, flags);
iris_syncobj_reference(bufmgr, p_syncobj, NULL);
}
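/**
* Update a BO's per-screen dependency tracking for this submission: wait
* on the other batch's writes (and, if we are writing, its reads too),
* then record this batch's signal syncobj as the new read or write
* dependency.
*/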
static void
update_bo_syncobjs(struct iris_batch *batch, struct iris_bo *bo, bool write)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
/* Make sure bo->deps is big enough */
if (screen->id >= bo->deps_size) {
int new_size = screen->id + 1;
bo->deps = realloc(bo->deps, new_size * sizeof(bo->deps[0]));
memset(&bo->deps[bo->deps_size], 0,
sizeof(bo->deps[0]) * (new_size - bo->deps_size));
bo->deps_size = new_size;
}
/* When it comes to execbuf submission of non-shared buffers, we only need
* to care about the reads and writes done by the other batches of our own
* screen, and we also don't care about the reads and writes done by our
* own batch, although we still need to track them. Just note that other parts
* of our code may need to care about all the operations done by every batch
* on every screen.
*/
struct iris_bo_screen_deps *deps = &bo->deps[screen->id];
int batch_idx = batch->name;
#if IRIS_BATCH_COUNT == 2
/* Due to the above, we exploit the fact that IRIS_BATCH_COUNT is actually
* 2, which means there's only one other batch we need to care about.
*/
int other_batch_idx = 1 - batch_idx;
#else
/* For IRIS_BATCH_COUNT == 3 we can do:
* int other_batch_idxs[IRIS_BATCH_COUNT - 1] = {
* (batch_idx ^ 1) & 1,
* (batch_idx ^ 2) & 2,
* };
* For IRIS_BATCH_COUNT == 4 we can do:
* int other_batch_idxs[IRIS_BATCH_COUNT - 1] = {
* (batch_idx + 1) & 3,
* (batch_idx + 2) & 3,
* (batch_idx + 3) & 3,
* };
*/
#error "Implement me."
#endif
/* If it is being written to by others, wait on it. */
if (deps->write_syncobjs[other_batch_idx])
move_syncobj_to_batch(batch, &deps->write_syncobjs[other_batch_idx],
I915_EXEC_FENCE_WAIT);
struct iris_syncobj *batch_syncobj = iris_batch_get_signal_syncobj(batch);
if (write) {
/* If we're writing to it, set our batch's syncobj as write_syncobj so
* others can wait on us. Also wait on every reader we care about before
* writing.
*/
iris_syncobj_reference(bufmgr, &deps->write_syncobjs[batch_idx],
batch_syncobj);
move_syncobj_to_batch(batch, &deps->read_syncobjs[other_batch_idx],
I915_EXEC_FENCE_WAIT);
} else {
/* If we're only reading, replace the existing read syncobj recorded for
* our batch index with this batch's signal syncobj.
*/
iris_syncobj_reference(bufmgr, &deps->read_syncobjs[batch_idx],
batch_syncobj);
}
}
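/**
* Walk the validation list and update syncobj dependencies for every BO
* in the batch (except the workaround BO), holding the bo_deps lock.
*/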
static void
update_batch_syncobjs(struct iris_batch *batch)
{
struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
simple_mtx_t *bo_deps_lock = iris_bufmgr_get_bo_deps_lock(bufmgr);
simple_mtx_lock(bo_deps_lock);
for (int i = 0; i < batch->exec_count; i++) {
struct iris_bo *bo = batch->exec_bos[i];
bool write = BITSET_TEST(batch->bos_written, i);
if (bo == batch->screen->workaround_bo)
continue;
update_bo_syncobjs(batch, bo, write);
}
simple_mtx_unlock(bo_deps_lock);
}
/**
* Submit the batch to the GPU via execbuffer2.
*/
static int
submit_batch(struct iris_batch *batch)
{
iris_bo_unmap(batch->bo);
struct drm_i915_gem_exec_object2 *validation_list =
malloc(batch->exec_count * sizeof(*validation_list));
for (int i = 0; i < batch->exec_count; i++) {
struct iris_bo *bo = batch->exec_bos[i];
bool written = BITSET_TEST(batch->bos_written, i);
unsigned extra_flags = 0;
if (written)
extra_flags |= EXEC_OBJECT_WRITE;
if (!iris_bo_is_external(bo))
extra_flags |= EXEC_OBJECT_ASYNC;
validation_list[i] = (struct drm_i915_gem_exec_object2) {
.handle = bo->gem_handle,
.offset = bo->address,
.flags = bo->real.kflags | extra_flags,
};
}
if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) {
dump_fence_list(batch);
dump_validation_list(batch, validation_list);
}
if (INTEL_DEBUG & DEBUG_BATCH) {
decode_batch(batch);
}
/* The requirements for using I915_EXEC_NO_RELOC are:
*
* The addresses written in the objects must match the corresponding
* reloc.address which in turn must match the corresponding
* execobject.offset.
*
* Any render targets written to in the batch must be flagged with
* EXEC_OBJECT_WRITE.
*
* To avoid stalling, execobject.offset should match the current
* address of that object within the active context.
*/
struct drm_i915_gem_execbuffer2 execbuf = {
.buffers_ptr = (uintptr_t) validation_list,
.buffer_count = batch->exec_count,
.batch_start_offset = 0,
/* This must be QWord aligned. */
.batch_len = ALIGN(batch->primary_batch_size, 8),
.flags = I915_EXEC_RENDER |
I915_EXEC_NO_RELOC |
I915_EXEC_BATCH_FIRST |
I915_EXEC_HANDLE_LUT,
.rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */
};
if (num_fences(batch)) {
execbuf.flags |= I915_EXEC_FENCE_ARRAY;
execbuf.num_cliprects = num_fences(batch);
execbuf.cliprects_ptr =
(uintptr_t)util_dynarray_begin(&batch->exec_fences);
}
int ret = 0;
if (!batch->screen->devinfo.no_hw &&
intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
ret = -errno;
for (int i = 0; i < batch->exec_count; i++) {
struct iris_bo *bo = batch->exec_bos[i];
bo->idle = false;
bo->index = -1;
iris_bo_unreference(bo);
}
free(validation_list);
return ret;
}
static const char *
batch_name_to_string(enum iris_batch_name name)
{
const char *names[IRIS_BATCH_COUNT] = {
[IRIS_BATCH_RENDER] = "render",
[IRIS_BATCH_COMPUTE] = "compute",
};
return names[name];
}
/**
* Flush the batch buffer, submitting it to the GPU and resetting it so
* we're ready to emit the next batch.
*/
void
_iris_batch_flush(struct iris_batch *batch, const char *file, int line)
{
struct iris_screen *screen = batch->screen;
/* If the batch is empty, skip the flush, unless it carries a fence signal
* that still needs to be submitted to the kernel.
*/
if (iris_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal)
return;
iris_measure_batch_end(batch->ice, batch);
iris_finish_batch(batch);
update_batch_syncobjs(batch);
if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) {
const char *basefile = strstr(file, "iris/");
if (basefile)
file = basefile + 5;
fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5db (%0.1f%%) "
"(cmds), %4d BOs (%0.1fMb aperture)\n",
file, line, batch_name_to_string(batch->name), batch->hw_ctx_id,
batch->total_chained_batch_size,
100.0f * batch->total_chained_batch_size / BATCH_SZ,
batch->exec_count,
(float) batch->aperture_space / (1024 * 1024));
}
int ret = submit_batch(batch);
/* When batch submission fails, our end-of-batch syncobj remains
* unsignalled, and in fact is not even considered submitted.
*
* In the hang recovery case (-EIO) or -ENOMEM, we recreate our context and
* attempt to carry on. In that case, we need to signal our syncobj,
* dubiously claiming that this batch completed, because future batches may
* depend on it. If we don't, then execbuf would fail with -EINVAL for
* those batches, because they depend on a syncobj that's considered to be
* "never submitted". This would lead to an abort(). So here, we signal
* the failing batch's syncobj to try and allow further progress to be
* made, knowing we may have broken our dependency tracking.
*/
if (ret < 0)
iris_syncobj_signal(screen->bufmgr, iris_batch_get_signal_syncobj(batch));
batch->exec_count = 0;
batch->aperture_space = 0;
util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s)
iris_syncobj_reference(screen->bufmgr, s, NULL);
util_dynarray_clear(&batch->syncobjs);
util_dynarray_clear(&batch->exec_fences);
if (INTEL_DEBUG & DEBUG_SYNC) {
dbg_printf("waiting for idle\n");
iris_bo_wait_rendering(batch->bo); /* if execbuf failed, this is a nop */
}
/* Start a new batch buffer. */
iris_batch_reset(batch);
/* EIO means our context is banned. In this case, try and replace it
* with a new logical context, and inform iris_context that all state
* has been lost and needs to be re-initialized. If this succeeds,
* dubiously claim success...
* Also handle ENOMEM here.
*/
if ((ret == -EIO || ret == -ENOMEM) && replace_hw_ctx(batch)) {
if (batch->reset->reset) {
/* Tell gallium frontends the device is lost and it was our fault. */
batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET);
}
ret = 0;
}
if (ret < 0) {
#ifdef DEBUG
const bool color = INTEL_DEBUG & DEBUG_COLOR;
fprintf(stderr, "%siris: Failed to submit batchbuffer: %-80s%s\n",
color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
#endif
abort();
}
}
/**
* Does the current batch refer to the given BO?
*
* (In other words, is the BO in the current batch's validation list?)
*/
bool
iris_batch_references(struct iris_batch *batch, struct iris_bo *bo)
{
return find_exec_index(batch, bo) != -1;
}
/**
* Updates the state of the noop feature. Returns true if there was a noop
* transition that led to state invalidation.
*/
bool
iris_batch_prepare_noop(struct iris_batch *batch, bool noop_enable)
{
if (batch->noop_enabled == noop_enable)
return false;
batch->noop_enabled = noop_enable;
iris_batch_flush(batch);
/* If the batch was empty, flush had no effect, so insert our noop. */
if (iris_batch_bytes_used(batch) == 0)
iris_batch_maybe_noop(batch);
/* We only need to update the entire state if we transition from noop ->
* not-noop.
*/
return !batch->noop_enabled;
}