diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h
index eb66897a686..c3f057d8ff6 100644
--- a/src/amd/vulkan/radv_constants.h
+++ b/src/amd/vulkan/radv_constants.h
@@ -95,4 +95,11 @@
 
 #define RADV_MAX_HIT_ATTRIB_SIZE 32
 
+#define RADV_SHADER_ALLOC_ALIGNMENT 256
+#define RADV_SHADER_ALLOC_MIN_ARENA_SIZE (256 * 1024)
+#define RADV_SHADER_ALLOC_MIN_SIZE_CLASS 8
+#define RADV_SHADER_ALLOC_MAX_SIZE_CLASS 15
+#define RADV_SHADER_ALLOC_NUM_FREE_LISTS \
+   (RADV_SHADER_ALLOC_MAX_SIZE_CLASS - RADV_SHADER_ALLOC_MIN_SIZE_CLASS + 1)
+
 #endif /* RADV_CONSTANTS_H */
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 8cf934e35db..6cc96e00d9b 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2904,8 +2904,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
 
    device->image_float32_atomics = image_float32_atomics;
 
-   mtx_init(&device->shader_slab_mutex, mtx_plain);
-   list_inithead(&device->shader_slabs);
+   radv_init_shader_arenas(device);
 
    device->overallocation_disallowed = overallocation_disallowed;
    mtx_init(&device->overallocation_mutex, mtx_plain);
@@ -3212,7 +3211,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    radv_trap_handler_finish(device);
    radv_finish_trace(device);
 
-   radv_destroy_shader_slabs(device);
+   radv_destroy_shader_arenas(device);
 
    u_cnd_monotonic_destroy(&device->timeline_cond);
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 1a64a439c5e..c65acb1cd64 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -770,8 +770,12 @@ struct radv_device {
     */
    uint32_t image_mrt_offset_counter;
    uint32_t fmask_mrt_offset_counter;
-   struct list_head shader_slabs;
-   mtx_t shader_slab_mutex;
+
+   struct list_head shader_arenas;
+   uint8_t shader_free_list_mask;
+   struct list_head shader_free_lists[RADV_SHADER_ALLOC_NUM_FREE_LISTS];
+   struct list_head shader_block_obj_pool;
+   mtx_t shader_arena_mutex;
 
    /* For detecting VM faults reported by dmesg. */
    uint64_t dmesg_timestamp;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index d5a8986663b..defa4298a0d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1023,84 +1023,265 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
    }
 }
 
-static void *
-radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
+static unsigned
+get_size_class(unsigned size, bool round_up)
 {
-   mtx_lock(&device->shader_slab_mutex);
-   list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
-   {
-      uint64_t offset = 0;
+   size = round_up ? util_logbase2_ceil(size) : util_logbase2(size);
+   unsigned size_class =
+      MAX2(size, RADV_SHADER_ALLOC_MIN_SIZE_CLASS) - RADV_SHADER_ALLOC_MIN_SIZE_CLASS;
+   return MIN2(size_class, RADV_SHADER_ALLOC_NUM_FREE_LISTS - 1);
+}
 
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshadow"
-#endif
-      list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
+static void
+remove_hole(struct radv_device *device, union radv_shader_arena_block *hole)
+{
+   unsigned size_class = get_size_class(hole->size, false);
+   list_del(&hole->freelist);
+   if (list_is_empty(&device->shader_free_lists[size_class]))
+      device->shader_free_list_mask &= ~(1u << size_class);
+}
+
+static void
+add_hole(struct radv_device *device, union radv_shader_arena_block *hole)
+{
+   unsigned size_class = get_size_class(hole->size, false);
+   list_addtail(&hole->freelist, &device->shader_free_lists[size_class]);
+   device->shader_free_list_mask |= 1u << size_class;
+}
+
+static union radv_shader_arena_block *
+alloc_block_obj(struct radv_device *device)
+{
+   if (!list_is_empty(&device->shader_block_obj_pool)) {
+      union radv_shader_arena_block *block =
+         list_first_entry(&device->shader_block_obj_pool, union radv_shader_arena_block, pool);
+      list_del(&block->pool);
+      return block;
+   }
+
+   return malloc(sizeof(union radv_shader_arena_block));
+}
+
+static void
+free_block_obj(struct radv_device *device, union radv_shader_arena_block *block)
+{
+   list_add(&block->pool, &device->shader_block_obj_pool);
+}
+
+/* Segregated fit allocator, implementing a good-fit allocation policy.
+ *
+ * This is a variation of sequential fit allocation with several lists of free blocks ("holes")
+ * instead of one. Each list of holes only contains holes of a certain range of sizes, so holes that
+ * are too small can easily be ignored while allocating. Because this also ignores holes that are
+ * larger than necessary (approximating best-fit allocation), this could be described as a
+ * "good-fit" allocator.
+ *
+ * Typically, shaders are allocated and only freed when the device is destroyed. For this pattern,
+ * this should allocate blocks for shaders fast and with no fragmentation, while still allowing
+ * freed memory to be re-used.
+ */
+static union radv_shader_arena_block *
+alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
+{
+   size = align(size, RADV_SHADER_ALLOC_ALIGNMENT);
+
+   mtx_lock(&device->shader_arena_mutex);
+
+   /* Try to use an existing hole. Unless the shader is very large, this should only have to look
+    * at the first one available.
+    */
+   unsigned free_list_mask = BITFIELD_MASK(RADV_SHADER_ALLOC_NUM_FREE_LISTS);
+   unsigned size_class =
+      ffs(device->shader_free_list_mask & (free_list_mask << get_size_class(size, true)));
+   if (size_class) {
+      size_class--;
+
+      list_for_each_entry(union radv_shader_arena_block, hole,
+                          &device->shader_free_lists[size_class], freelist)
       {
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-         if (s->bo_offset - offset >= shader->code_size) {
-            shader->bo = slab->bo;
-            shader->bo_offset = offset;
-            list_addtail(&shader->slab_list, &s->slab_list);
-            mtx_unlock(&device->shader_slab_mutex);
-            return slab->ptr + offset;
+         if (hole->size < size)
+            continue;
+
+         assert(hole->offset % RADV_SHADER_ALLOC_ALIGNMENT == 0);
+
+         if (size == hole->size) {
+            remove_hole(device, hole);
+            hole->freelist.next = ptr;
+            mtx_unlock(&device->shader_arena_mutex);
+            return hole;
+         } else {
+            union radv_shader_arena_block *alloc = alloc_block_obj(device);
+            if (!alloc) {
+               mtx_unlock(&device->shader_arena_mutex);
+               return NULL;
+            }
+            list_addtail(&alloc->list, &hole->list);
+            alloc->freelist.prev = NULL;
+            alloc->freelist.next = ptr;
+            alloc->arena = hole->arena;
+            alloc->offset = hole->offset;
+            alloc->size = size;
+
+            remove_hole(device, hole);
+            hole->offset += size;
+            hole->size -= size;
+            add_hole(device, hole);
+
+            mtx_unlock(&device->shader_arena_mutex);
+            return alloc;
          }
-         offset = align_u64(s->bo_offset + s->code_size, 256);
-      }
-      if (offset <= slab->size && slab->size - offset >= shader->code_size) {
-         shader->bo = slab->bo;
-         shader->bo_offset = offset;
-         list_addtail(&shader->slab_list, &slab->shaders);
-         mtx_unlock(&device->shader_slab_mutex);
-         return slab->ptr + offset;
       }
    }
-   mtx_unlock(&device->shader_slab_mutex);
 
-   struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab));
+   /* Allocate a new shader arena. */
+   struct radv_shader_arena *arena = calloc(1, sizeof(struct radv_shader_arena));
+   union radv_shader_arena_block *alloc = NULL, *hole = NULL;
+   if (!arena)
+      goto fail;
 
-   slab->size = MAX2(256 * 1024, shader->code_size);
+   unsigned arena_size = MAX2(RADV_SHADER_ALLOC_MIN_ARENA_SIZE, size);
    VkResult result = device->ws->buffer_create(
-      device->ws, slab->size, 256, RADEON_DOMAIN_VRAM,
+      device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM,
       RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT |
          (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0 : RADEON_FLAG_READ_ONLY),
-      RADV_BO_PRIORITY_SHADER, 0, &slab->bo);
-   if (result != VK_SUCCESS) {
-      free(slab);
-      return NULL;
+      RADV_BO_PRIORITY_SHADER, 0, &arena->bo);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   list_inithead(&arena->entries);
+
+   arena->ptr = (char *)device->ws->buffer_map(arena->bo);
+   if (!arena->ptr)
+      goto fail;
+
+   alloc = alloc_block_obj(device);
+   hole = arena_size - size > 0 ? alloc_block_obj(device) : alloc;
+   if (!alloc || !hole)
+      goto fail;
+   list_addtail(&alloc->list, &arena->entries);
+   alloc->freelist.prev = NULL;
+   alloc->freelist.next = ptr;
+   alloc->arena = arena;
+   alloc->offset = 0;
+   alloc->size = size;
+
+   if (hole != alloc) {
+      hole->arena = arena;
+      hole->offset = size;
+      hole->size = arena_size - size;
+
+      list_addtail(&hole->list, &arena->entries);
+      add_hole(device, hole);
    }
-   slab->ptr = (char *)device->ws->buffer_map(slab->bo);
-   if (!slab->ptr) {
-      device->ws->buffer_destroy(device->ws, slab->bo);
-      free(slab);
+   list_addtail(&arena->list, &device->shader_arenas);
+
+   mtx_unlock(&device->shader_arena_mutex);
+   return alloc;
+
+fail:
+   mtx_unlock(&device->shader_arena_mutex);
+   free(alloc);
+   free(hole);
+   if (arena && arena->bo)
+      device->ws->buffer_destroy(device->ws, arena->bo);
+   free(arena);
+   return NULL;
+}
+
+static union radv_shader_arena_block *
+get_hole(struct radv_shader_arena *arena, struct list_head *head)
+{
+   if (head == &arena->entries)
+      return NULL;
+
+   union radv_shader_arena_block *hole = LIST_ENTRY(union radv_shader_arena_block, head, list);
+   return hole->freelist.prev ? hole : NULL;
+}
+
+static void
+free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc)
+{
+   mtx_lock(&device->shader_arena_mutex);
+
+   union radv_shader_arena_block *hole_prev = get_hole(alloc->arena, alloc->list.prev);
+   union radv_shader_arena_block *hole_next = get_hole(alloc->arena, alloc->list.next);
+
+   union radv_shader_arena_block *hole = alloc;
+
+   /* merge with previous hole */
+   if (hole_prev) {
+      remove_hole(device, hole_prev);
+
+      hole_prev->size += hole->size;
+      list_del(&hole->list);
+      free_block_obj(device, hole);
+
+      hole = hole_prev;
    }
 
-   list_inithead(&slab->shaders);
+   /* merge with next hole */
+   if (hole_next) {
+      remove_hole(device, hole_next);
-   mtx_lock(&device->shader_slab_mutex);
-   list_add(&slab->slabs, &device->shader_slabs);
 
+      hole_next->offset -= hole->size;
+      hole_next->size += hole->size;
+      list_del(&hole->list);
+      free_block_obj(device, hole);
-   shader->bo = slab->bo;
-   shader->bo_offset = 0;
-   list_add(&shader->slab_list, &slab->shaders);
-   mtx_unlock(&device->shader_slab_mutex);
-   return slab->ptr;
+      hole = hole_next;
+   }
+
+   if (list_is_singular(&hole->list)) {
+      struct radv_shader_arena *arena = hole->arena;
+      free_block_obj(device, hole);
+
+      device->ws->buffer_destroy(device->ws, arena->bo);
+      list_del(&arena->list);
+      free(arena);
+   } else {
+      add_hole(device, hole);
+   }
+
+   mtx_unlock(&device->shader_arena_mutex);
+}
+
+static void *
+radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
+{
+   shader->alloc = alloc_shader_memory(device, shader->code_size, shader);
+   if (!shader->alloc)
+      return NULL;
+   shader->bo = shader->alloc->arena->bo;
+   return shader->alloc->arena->ptr + shader->alloc->offset;
 }
 
 void
-radv_destroy_shader_slabs(struct radv_device *device)
+radv_init_shader_arenas(struct radv_device *device)
 {
-   list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
+   mtx_init(&device->shader_arena_mutex, mtx_plain);
+
+   device->shader_free_list_mask = 0;
+
+   list_inithead(&device->shader_arenas);
+   list_inithead(&device->shader_block_obj_pool);
+   for (unsigned i = 0; i < RADV_SHADER_ALLOC_NUM_FREE_LISTS; i++)
+      list_inithead(&device->shader_free_lists[i]);
+}
+
+void
+radv_destroy_shader_arenas(struct radv_device *device)
+{
+   list_for_each_entry_safe(union radv_shader_arena_block, block, &device->shader_block_obj_pool,
+                            pool)
+      free(block);
+
+   list_for_each_entry_safe(struct radv_shader_arena, arena, &device->shader_arenas, list)
    {
-      device->ws->buffer_destroy(device->ws, slab->bo);
-      free(slab);
+      device->ws->buffer_destroy(device->ws, arena->bo);
+      free(arena);
    }
-   mtx_destroy(&device->shader_slab_mutex);
+   mtx_destroy(&device->shader_arena_mutex);
 }
 
 /* For the UMR disassembler. */
@@ -1735,9 +1916,7 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
    if (!p_atomic_dec_zero(&variant->ref_count))
       return;
 
-   mtx_lock(&device->shader_slab_mutex);
-   list_del(&variant->slab_list);
-   mtx_unlock(&device->shader_slab_mutex);
+   free_shader_memory(device, variant->alloc);
 
    free(variant->spirv);
    free(variant->nir_string);
@@ -1750,36 +1929,33 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
 uint64_t
 radv_shader_variant_get_va(const struct radv_shader_variant *variant)
 {
-   return radv_buffer_get_va(variant->bo) + variant->bo_offset;
+   return radv_buffer_get_va(variant->bo) + variant->alloc->offset;
 }
 
 struct radv_shader_variant *
 radv_find_shader_variant(struct radv_device *device, uint64_t pc)
 {
-   mtx_lock(&device->shader_slab_mutex);
-
-   list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
+   mtx_lock(&device->shader_arena_mutex);
+   list_for_each_entry(struct radv_shader_arena, arena, &device->shader_arenas, list)
    {
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wshadow"
 #endif
-      list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
+      list_for_each_entry(union radv_shader_arena_block, block, &arena->entries, list)
       {
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif
-         uint64_t offset = align_u64(s->bo_offset + s->code_size, 256);
-         uint64_t va = radv_buffer_get_va(s->bo);
-
-         if (pc >= va + s->bo_offset && pc < va + offset) {
-            mtx_unlock(&device->shader_slab_mutex);
-            return s;
+         uint64_t start = radv_buffer_get_va(block->arena->bo) + block->offset;
+         if (!block->freelist.prev && pc >= start && pc < start + block->size) {
+            mtx_unlock(&device->shader_arena_mutex);
+            return (struct radv_shader_variant *)block->freelist.next;
          }
       }
    }
-   mtx_unlock(&device->shader_slab_mutex);
+   mtx_unlock(&device->shader_arena_mutex);
 
    return NULL;
 }
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 7fd12108440..c7fc550f49b 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -387,11 +387,33 @@ struct radv_shader_binary_rtld {
    uint8_t data[0];
 };
 
+struct radv_shader_arena {
+   struct list_head list;
+   struct list_head entries;
+   struct radeon_winsys_bo *bo;
+   char *ptr;
+};
+
+union radv_shader_arena_block {
+   struct list_head pool;
+   struct {
+      /* List of blocks in the arena, sorted by address. */
+      struct list_head list;
+      /* For holes, a list_head for the free-list. For allocations, freelist.prev=NULL and
+       * freelist.next is a pointer associated with the allocation.
+       */
+      struct list_head freelist;
+      struct radv_shader_arena *arena;
+      uint32_t offset;
+      uint32_t size;
+   };
+};
+
 struct radv_shader_variant {
    uint32_t ref_count;
    struct radeon_winsys_bo *bo;
-   uint64_t bo_offset;
+   union radv_shader_arena_block *alloc;
    struct ac_shader_config config;
    uint8_t *code_ptr;
    uint32_t code_size;
@@ -405,16 +427,6 @@ struct radv_shader_variant {
    char *disasm_string;
    char *ir_string;
    uint32_t *statistics;
-
-   struct list_head slab_list;
-};
-
-struct radv_shader_slab {
-   struct list_head slabs;
-   struct list_head shaders;
-   struct radeon_winsys_bo *bo;
-   uint64_t size;
-   char *ptr;
 };
 
 void radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
@@ -428,7 +440,8 @@ nir_shader *radv_shader_compile_to_nir(struct radv_device *device, struct vk_sha
                                        const struct radv_pipeline_layout *layout,
                                        const struct radv_pipeline_key *key);
 
-void radv_destroy_shader_slabs(struct radv_device *device);
+void radv_init_shader_arenas(struct radv_device *device);
+void radv_destroy_shader_arenas(struct radv_device *device);
 
 VkResult radv_create_shaders(struct radv_pipeline *pipeline,
                              struct radv_pipeline_layout *pipeline_layout,
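
Editor's note (appendix, not part of the patch): the size-class scheme in get_size_class() and the shader_free_list_mask lookup are the core of the good-fit policy and are easy to misread. The standalone sketch below mirrors them under simplified assumptions: GCC/Clang builtins stand in for Mesa's util_logbase2()/util_logbase2_ceil(), and the file name and all identifiers are hypothetical. Holes are bucketed by rounding their size down, requests by rounding up, so any hole found at or above the request's class is guaranteed to fit; the one exception is the open-ended last class, which is why the loop in alloc_shader_memory() still re-checks hole->size.

/* size_class_sketch.c -- illustrative only; build with: cc -std=c99 size_class_sketch.c */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <strings.h> /* ffs() */

#define MIN_SIZE_CLASS 8  /* mirrors RADV_SHADER_ALLOC_MIN_SIZE_CLASS */
#define MAX_SIZE_CLASS 15 /* mirrors RADV_SHADER_ALLOC_MAX_SIZE_CLASS */
#define NUM_FREE_LISTS (MAX_SIZE_CLASS - MIN_SIZE_CLASS + 1)

static unsigned log2_floor(unsigned v)
{
   assert(v);
   return 31 - __builtin_clz(v);
}

static unsigned log2_ceil(unsigned v)
{
   return v <= 1 ? 0 : log2_floor(v - 1) + 1;
}

/* Same shape as the patch's get_size_class(): holes round down, requests
 * round up, so a hole found in the request's class or above is always large
 * enough, except in the last, open-ended class. */
static unsigned get_size_class(unsigned size, bool round_up)
{
   unsigned l = round_up ? log2_ceil(size) : log2_floor(size);
   unsigned size_class = l <= MIN_SIZE_CLASS ? 0 : l - MIN_SIZE_CLASS;
   return size_class >= NUM_FREE_LISTS ? NUM_FREE_LISTS - 1 : size_class;
}

int main(void)
{
   /* One mask bit per free list that currently holds a hole: pretend we
    * have a 300-byte hole (class 0) and a 5000-byte hole (class 4). */
   unsigned free_list_mask = 1u << get_size_class(300, false);
   free_list_mask |= 1u << get_size_class(5000, false);

   /* A 1000-byte request searches from class get_size_class(1000, true)
    * upward; ffs() skips the too-small 300-byte hole in one step. */
   unsigned first = get_size_class(1000, true);
   int found = ffs(free_list_mask & (~0u << first));
   printf("1000-byte request -> free list %d\n", found ? found - 1 : -1);
   return 0;
}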
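Editor's note (appendix, not part of the patch): free_shader_memory() relies on each arena's block list being sorted by address, so the only merge candidates on free are the two neighboring entries. The standalone model below reproduces the split-on-allocate and coalesce-on-free behaviour with bare prev/next pointers standing in for util/list.h; all names are hypothetical. The merge direction differs slightly from the patch (which grows the following hole backward rather than absorbing it), but the resulting hole is the same; and when the coalesced hole spans the whole arena, the real code additionally takes the list_is_singular() path and destroys the arena's BO.

/* arena_merge_sketch.c -- illustrative only; build with: cc -std=c99 arena_merge_sketch.c */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct block {
   struct block *prev, *next; /* neighbors, sorted by offset */
   unsigned offset, size;
   bool hole;
};

static struct block *new_block(unsigned offset, unsigned size, bool hole)
{
   struct block *b = calloc(1, sizeof(*b));
   b->offset = offset;
   b->size = size;
   b->hole = hole;
   return b;
}

/* Carve 'size' bytes from the front of a hole, like the else-branch of the
 * free-list loop in alloc_shader_memory(). Assumes the hole is strictly
 * larger; the real code reuses the hole object itself on an exact fit. */
static struct block *alloc_from_hole(struct block *hole, unsigned size)
{
   struct block *a = new_block(hole->offset, size, false);
   hole->offset += size;
   hole->size -= size;
   a->prev = hole->prev;
   a->next = hole;
   if (hole->prev)
      hole->prev->next = a;
   hole->prev = a;
   return a;
}

/* Free a block and coalesce with hole neighbors, like free_shader_memory(). */
static struct block *free_block(struct block *b)
{
   b->hole = true;
   if (b->prev && b->prev->hole) { /* merge into the previous hole */
      struct block *p = b->prev;
      p->size += b->size;
      p->next = b->next;
      if (b->next)
         b->next->prev = p;
      free(b);
      b = p;
   }
   if (b->next && b->next->hole) { /* absorb the following hole */
      struct block *n = b->next;
      b->size += n->size;
      b->next = n->next;
      if (n->next)
         n->next->prev = b;
      free(n);
   }
   return b;
}

int main(void)
{
   struct block *arena = new_block(0, 256 * 1024, true); /* one big hole */
   struct block *a = alloc_from_hole(arena, 4096);
   struct block *b = alloc_from_hole(arena, 8192);

   free_block(a);                   /* hole at [0,4096): no hole neighbors yet */
   struct block *h = free_block(b); /* merges with both sides into one hole */
   printf("coalesced hole: offset=%u size=%u\n", h->offset, h->size);
   free(h);
   return 0;
}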
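Editor's note (appendix, not part of the patch): the union trick that radv_find_shader_variant() depends on can also be shown in isolation. A hole's freelist node is always linked into some free list, so freelist.prev is non-NULL; an allocated block overwrites the node with a NULL prev as a tag and stashes the pointer passed to alloc_shader_memory() (the radv_shader_variant) in freelist.next. The sketch below mimics the PC lookup with a string standing in for the variant pointer; the list type is a stand-in for util/list.h and every name is hypothetical.

/* block_tag_sketch.c -- illustrative only; build with: cc -std=c11 block_tag_sketch.c */
#include <stdio.h>

struct list_head {
   struct list_head *prev, *next;
};

union block {
   struct list_head pool; /* overlaps the fields below while on the object pool */
   struct {
      struct list_head freelist; /* hole: live list node; allocation: tag + owner */
      unsigned offset, size;
   };
};

/* Mirrors what alloc_shader_memory() does for an allocation: prev=NULL marks
 * "not a hole", next carries the caller-supplied pointer. */
static void mark_allocated(union block *b, void *owner)
{
   b->freelist.prev = NULL;
   b->freelist.next = owner;
}

int main(void)
{
   struct list_head free_list = {&free_list, &free_list};

   union block hole = {0}; /* linked into a free list, so prev != NULL */
   hole.freelist.prev = hole.freelist.next = &free_list;
   hole.offset = 4096;
   hole.size = 4096;

   union block alloc = {0};
   alloc.offset = 0;
   alloc.size = 4096;
   mark_allocated(&alloc, "main_vs"); /* stands in for a radv_shader_variant * */

   /* The equivalent of radv_find_shader_variant(): skip holes via the tag,
    * then range-check the "pc" against [offset, offset + size). */
   union block *blocks[] = {&alloc, &hole};
   unsigned pc = 128;
   for (int i = 0; i < 2; i++) {
      union block *b = blocks[i];
      if (!b->freelist.prev && pc >= b->offset && pc < b->offset + b->size)
         printf("pc %u -> %s\n", pc, (const char *)b->freelist.next);
   }
   return 0;
}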