nvk: Use VM_BIND for contiguous heaps instead of copying

This gets rid of our (fairly sketchy) heap resizing via stall-and-copy
and replaces it with VM_BIND.  We couldn't do this on the old nouveau
API but now that we can assume VM_BIND, it makes everything simpler.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27205>
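For context before the diff: the core idea is to reserve a fixed virtual-address range for a contiguous heap once at creation time and then, whenever the heap grows, bind the new BO at the end of the already-backed region, so previously returned addresses never move and nothing has to be copied. Below is a minimal, self-contained sketch of that scheme; it is not mesa code, and the demo_* names are hypothetical stand-ins for the winsys calls that appear in the diff (nouveau_ws_alloc_vma(), nouveau_ws_bo_bind_vma()).

/* Illustrative sketch only -- not mesa code.  demo_alloc_vma() and
 * demo_bind_bo() are hypothetical stand-ins for nouveau_ws_alloc_vma()
 * and nouveau_ws_bo_bind_vma() from the diff below.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_HEAP_MAX_SIZE (64ull << 20)  /* VA range reserved up front */

struct demo_heap {
   uint64_t base_addr;   /* start of the reserved VA range */
   uint64_t total_size;  /* bytes currently backed by BOs */
};

/* Pretend to reserve a GPU VA range; a real driver asks the kernel. */
static uint64_t demo_alloc_vma(uint64_t size) { (void)size; return 1ull << 32; }

/* Pretend to bind a freshly allocated BO at a fixed GPU VA. */
static bool demo_bind_bo(uint64_t addr, uint64_t size) { (void)addr; (void)size; return true; }

bool demo_heap_init(struct demo_heap *heap)
{
   /* Reserve the whole range once; none of it is backed by memory yet. */
   heap->base_addr = demo_alloc_vma(DEMO_HEAP_MAX_SIZE);
   heap->total_size = 0;
   return heap->base_addr != 0;
}

bool demo_heap_grow(struct demo_heap *heap, uint64_t new_bo_size)
{
   if (heap->total_size + new_bo_size > DEMO_HEAP_MAX_SIZE)
      return false;

   /* Bind the new BO right after the already-backed region.  Addresses
    * handed out earlier never move, so there is no stall-and-copy.
    */
   if (!demo_bind_bo(heap->base_addr + heap->total_size, new_bo_size))
      return false;

   heap->total_size += new_bo_size;
   return true;
}

With the base address fixed for the lifetime of the heap, pre-Volta hardware can have SET_PROGRAM_REGION programmed once at queue init (the first two hunks below) instead of being re-emitted every time the heap swaps in a bigger BO.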
Authored by Faith Ekstrand on 2024-01-22 16:59:22 -06:00; committed by Marge Bot
parent f0fad6ed17
commit e162c2e78e
7 changed files with 96 additions and 183 deletions

Changed file 1 of 7:

@@ -55,6 +55,15 @@ nvk_push_dispatch_state_init(struct nvk_device *dev, struct nv_push *p)
    if (pdev->info.cls_compute == MAXWELL_COMPUTE_A)
       P_IMMD(p, NVB0C0, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

+   if (pdev->info.cls_eng3d < VOLTA_COMPUTE_A) {
+      uint64_t shader_base_addr =
+         nvk_heap_contiguous_base_address(&dev->shader_heap);
+
+      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
+      P_NVA0C0_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
+      P_NVA0C0_SET_PROGRAM_REGION_B(p, shader_base_addr);
+   }
+
    return VK_SUCCESS;
 }

Changed file 2 of 7:

@@ -367,6 +367,15 @@ nvk_push_draw_state_init(struct nvk_device *dev, struct nv_push *p)
    P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);

+   if (pdev->info.cls_eng3d < VOLTA_A) {
+      uint64_t shader_base_addr =
+         nvk_heap_contiguous_base_address(&dev->shader_heap);
+
+      P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
+      P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
+      P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
+   }
+
    for (uint32_t i = 0; i < 6; i++) {
       P_IMMD(p, NV9097, SET_PIPELINE_SHADER(i), {
          .enable = ENABLE_FALSE,

Changed file 3 of 7:

@@ -26,7 +26,17 @@ nvk_heap_init(struct nvk_device *dev, struct nvk_heap *heap,
       heap->bo_flags |= NOUVEAU_WS_BO_MAP;
    heap->map_flags = map_flags;
    heap->overalloc = overalloc;
-   heap->contiguous = contiguous;
+
+   if (contiguous) {
+      heap->base_addr = nouveau_ws_alloc_vma(dev->ws_dev, 0,
+                                             NVK_HEAP_MAX_SIZE,
+                                             0, false /* bda */,
+                                             false /* sparse */);
+      if (heap->base_addr == 0) {
+         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Failed to allocate VMA for heap");
+      }
+   }

    simple_mtx_init(&heap->mutex, mtx_plain);
    util_vma_heap_init(&heap->heap, 0, 0);
@@ -41,12 +51,21 @@ void
 nvk_heap_finish(struct nvk_device *dev, struct nvk_heap *heap)
 {
    for (uint32_t bo_idx = 0; bo_idx < heap->bo_count; bo_idx++) {
+      if (heap->base_addr != 0) {
+         nouveau_ws_bo_unbind_vma(dev->ws_dev, heap->bos[bo_idx].addr,
+                                  heap->bos[bo_idx].bo->size);
+      }
+
       nouveau_ws_bo_unmap(heap->bos[bo_idx].bo, heap->bos[bo_idx].map);
       nouveau_ws_bo_destroy(heap->bos[bo_idx].bo);
    }

    util_vma_heap_finish(&heap->heap);
    simple_mtx_destroy(&heap->mutex);
+
+   if (heap->base_addr != 0) {
+      nouveau_ws_free_vma(dev->ws_dev, heap->base_addr, NVK_HEAP_MAX_SIZE,
+                          false /* bda */, false /* sparse */);
+   }
 }

 static uint64_t
@@ -74,88 +93,6 @@ vma_bo_offset(uint64_t offset)
 static VkResult
 nvk_heap_grow_locked(struct nvk_device *dev, struct nvk_heap *heap)
 {
-   VkResult result;
-
-   if (heap->contiguous) {
-      if (heap->total_size >= NVK_HEAP_MAX_SIZE) {
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "Heap has already hit its maximum size");
-      }
-
-      const uint64_t new_bo_size =
-         MAX2(heap->total_size * 2, NVK_HEAP_MIN_SIZE);
-      void *new_bo_map;
-      struct nouveau_ws_bo *new_bo =
-         nouveau_ws_bo_new_mapped(dev->ws_dev,
-                                  new_bo_size + heap->overalloc, 0,
-                                  heap->bo_flags, heap->map_flags,
-                                  &new_bo_map);
-      if (new_bo == NULL) {
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "Failed to allocate a heap BO: %m");
-      }
-
-      if (heap->bo_count > 0) {
-         assert(heap->bo_count == 1);
-         struct nouveau_ws_bo *old_bo = heap->bos[0].bo;
-
-         assert(util_is_power_of_two_nonzero64(heap->total_size));
-         assert(heap->total_size >= NVK_HEAP_MIN_SIZE);
-         assert(heap->total_size <= old_bo->size);
-         assert(heap->total_size < new_bo_size);
-
-         unsigned line_bytes = MIN2(heap->total_size, 1 << 17);
-         assert(heap->total_size % line_bytes == 0);
-         unsigned line_count = heap->total_size / line_bytes;
-
-         uint32_t push_dw[12];
-         struct nv_push push;
-         nv_push_init(&push, push_dw, ARRAY_SIZE(push_dw));
-         struct nv_push *p = &push;
-
-         P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
-         P_NV90B5_OFFSET_IN_UPPER(p, old_bo->offset >> 32);
-         P_NV90B5_OFFSET_IN_LOWER(p, old_bo->offset & 0xffffffff);
-         P_NV90B5_OFFSET_OUT_UPPER(p, new_bo->offset >> 32);
-         P_NV90B5_OFFSET_OUT_LOWER(p, new_bo->offset & 0xffffffff);
-         P_NV90B5_PITCH_IN(p, line_bytes);
-         P_NV90B5_PITCH_OUT(p, line_bytes);
-         P_NV90B5_LINE_LENGTH_IN(p, line_bytes);
-         P_NV90B5_LINE_COUNT(p, line_count);
-
-         P_IMMD(p, NV90B5, LAUNCH_DMA, {
-            .data_transfer_type = DATA_TRANSFER_TYPE_NON_PIPELINED,
-            .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
-            .flush_enable = FLUSH_ENABLE_TRUE,
-            .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
-            .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
-         });
-
-         struct nouveau_ws_bo *push_bos[] = { new_bo, old_bo, };
-
-         result = nvk_queue_submit_simple(&dev->queue,
-                                          nv_push_dw_count(&push), push_dw,
-                                          ARRAY_SIZE(push_bos), push_bos);
-         if (result != VK_SUCCESS) {
-            nouveau_ws_bo_unmap(new_bo, new_bo_map);
-            nouveau_ws_bo_destroy(new_bo);
-            return result;
-         }
-
-         nouveau_ws_bo_unmap(heap->bos[0].bo, heap->bos[0].map);
-         nouveau_ws_bo_destroy(heap->bos[0].bo);
-      }
-
-      uint64_t vma = encode_vma(0, heap->total_size);
-      util_vma_heap_free(&heap->heap, vma, new_bo_size - heap->total_size);
-
-      heap->total_size = new_bo_size;
-      heap->bo_count = 1;
-      heap->bos[0].bo = new_bo;
-      heap->bos[0].map = new_bo_map;
-
-      return VK_SUCCESS;
-   } else {
    if (heap->bo_count >= NVK_HEAP_MAX_BO_COUNT) {
       return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                        "Heap has already hit its maximum size");
@@ -175,6 +112,15 @@ nvk_heap_grow_locked(struct nvk_device *dev, struct nvk_heap *heap)
                        "Failed to allocate a heap BO: %m");
    }

+   if (heap->base_addr != 0) {
+      heap->bos[heap->bo_count].addr = heap->base_addr + heap->total_size;
+      nouveau_ws_bo_bind_vma(dev->ws_dev, heap->bos[heap->bo_count].bo,
+                             heap->bos[heap->bo_count].addr,
+                             new_bo_size, 0, 0);
+   } else {
+      heap->bos[heap->bo_count].addr = heap->bos[heap->bo_count].bo->offset;
+   }
+
    uint64_t vma = encode_vma(heap->bo_count, 0);
    util_vma_heap_free(&heap->heap, vma, new_bo_size);
@@ -183,7 +129,6 @@ nvk_heap_grow_locked(struct nvk_device *dev, struct nvk_heap *heap)

    return VK_SUCCESS;
-   }
 }

 static VkResult
 nvk_heap_alloc_locked(struct nvk_device *dev, struct nvk_heap *heap,
@@ -201,7 +146,7 @@ nvk_heap_alloc_locked(struct nvk_device *dev, struct nvk_heap *heap,
    assert(bo_offset + size + heap->overalloc <=
           heap->bos[bo_idx].bo->size);

-   if (heap->contiguous) {
+   if (heap->base_addr != 0) {
       assert(bo_idx == 0);
       *addr_out = bo_offset;
    } else {
@@ -224,15 +169,6 @@ nvk_heap_free_locked(struct nvk_device *dev, struct nvk_heap *heap,
 {
    assert(addr + size > addr);

-   if (heap->contiguous) {
-      assert(heap->bo_count == 1);
-      uint64_t bo_offset = addr;
-
-      assert(bo_offset + size <= heap->bos[0].bo->size);
-
-      uint64_t vma = encode_vma(0, bo_offset);
-      util_vma_heap_free(&heap->heap, vma, size);
-   } else {
    for (uint32_t bo_idx = 0; bo_idx < heap->bo_count; bo_idx++) {
       if (addr < heap->bos[bo_idx].bo->offset)
          continue;
@@ -249,19 +185,12 @@ nvk_heap_free_locked(struct nvk_device *dev, struct nvk_heap *heap,
    }
    assert(!"Failed to find heap BO");
 }
-}

 VkResult
 nvk_heap_alloc(struct nvk_device *dev, struct nvk_heap *heap,
                uint64_t size, uint32_t alignment,
                uint64_t *addr_out, void **map_out)
 {
-   /* We can't return maps from contiguous heaps because the the map may go
-    * away at any time when the lock isn't taken and we don't want to trust
-    * the caller with racy maps.
-    */
-   assert(!heap->contiguous);
-
    simple_mtx_lock(&heap->mutex);
    VkResult result = nvk_heap_alloc_locked(dev, heap, size, alignment,
                                            addr_out, map_out);

Changed file 4 of 7:

@@ -23,17 +23,20 @@ struct nvk_device;
 struct nvk_heap_bo {
    struct nouveau_ws_bo *bo;
    void *map;
+   uint64_t addr;
 };

 struct nvk_heap {
    enum nouveau_ws_bo_flags bo_flags;
    enum nouveau_ws_bo_map_flags map_flags;
    uint32_t overalloc;
-   bool contiguous;

    simple_mtx_t mutex;
    struct util_vma_heap heap;

+   /* Base address for contiguous heaps, 0 otherwise */
+   uint64_t base_addr;
+
    uint64_t total_size;
    uint32_t bo_count;
@@ -58,19 +61,11 @@ VkResult nvk_heap_upload(struct nvk_device *dev, struct nvk_heap *heap,
 void nvk_heap_free(struct nvk_device *dev, struct nvk_heap *heap,
                    uint64_t addr, uint64_t size);

-static inline struct nouveau_ws_bo *
-nvk_heap_get_contiguous_bo_ref(struct nvk_heap *heap)
+static inline uint64_t
+nvk_heap_contiguous_base_address(struct nvk_heap *heap)
 {
-   assert(heap->contiguous);
-   assert(heap->bo_count <= 1);
-
-   simple_mtx_lock(&heap->mutex);
-   struct nouveau_ws_bo *bo = heap->bos[0].bo;
-   if (bo)
-      nouveau_ws_bo_ref(bo);
-   simple_mtx_unlock(&heap->mutex);
-
-   return bo;
+   assert(heap->base_addr != 0);
+   return heap->base_addr;
 }

 #endif /* define NVK_HEAP_H */

Changed file 5 of 7:

@@ -35,8 +35,6 @@ nvk_queue_state_finish(struct nvk_device *dev,
       nouveau_ws_bo_destroy(qs->images.bo);
    if (qs->samplers.bo)
       nouveau_ws_bo_destroy(qs->samplers.bo);
-   if (qs->shaders.bo)
-      nouveau_ws_bo_destroy(qs->shaders.bo);
    if (qs->slm.bo)
       nouveau_ws_bo_destroy(qs->slm.bo);
    if (qs->push.bo) {
@@ -90,19 +88,6 @@ nvk_queue_state_update(struct nvk_device *dev,
         nouveau_ws_bo_destroy(bo);
    }

-   if (dev->shader_heap.contiguous) {
-      bo = nvk_heap_get_contiguous_bo_ref(&dev->shader_heap);
-      if (qs->shaders.bo != bo) {
-         if (qs->shaders.bo)
-            nouveau_ws_bo_destroy(qs->shaders.bo);
-         qs->shaders.bo = bo;
-         dirty = true;
-      } else {
-         if (bo)
-            nouveau_ws_bo_destroy(bo);
-      }
-   }
-
    bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_tpc);
    if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
        qs->slm.bytes_per_tpc != bytes_per_tpc) {
@@ -182,20 +167,6 @@ nvk_queue_state_update(struct nvk_device *dev,
       });
    }

-   if (qs->shaders.bo) {
-      /* Compute */
-      assert(dev->pdev->info.cls_compute < VOLTA_COMPUTE_A);
-      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
-      P_NVA0C0_SET_PROGRAM_REGION_A(p, qs->shaders.bo->offset >> 32);
-      P_NVA0C0_SET_PROGRAM_REGION_B(p, qs->shaders.bo->offset);
-
-      /* 3D */
-      assert(dev->pdev->info.cls_eng3d < VOLTA_A);
-      P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
-      P_NV9097_SET_PROGRAM_REGION_A(p, qs->shaders.bo->offset >> 32);
-      P_NV9097_SET_PROGRAM_REGION_B(p, qs->shaders.bo->offset);
-   }
-
    if (qs->slm.bo) {
       const uint64_t slm_addr = qs->slm.bo->offset;
       const uint64_t slm_size = qs->slm.bo->size;

Changed file 6 of 7:

@@ -26,10 +26,6 @@ struct nvk_queue_state {
       uint32_t alloc_count;
    } samplers;

-   struct {
-      struct nouveau_ws_bo *bo;
-   } shaders;
-
    struct {
       struct nouveau_ws_bo *bo;
       uint32_t bytes_per_warp;

Changed file 7 of 7:

@@ -63,6 +63,10 @@ nouveau_ws_alloc_vma(struct nouveau_ws_device *dev,
 {
    assert(dev->has_vm_bind);

+   /* if the caller doesn't care, use the GPU page size */
+   if (align == 0)
+      align = 0x1000;
+
    uint64_t offset;

    simple_mtx_lock(&dev->vma_mutex);
    if (bda_capture_replay) {