diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index d88b521e160..636d9114d0d 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -29,6 +29,7 @@
 #include "anv_private.h"
+#include "common/gen_aux_map.h"
 #include "util/anon_file.h"
 
 #ifdef HAVE_VALGRIND
@@ -1522,12 +1523,11 @@ anv_bo_alloc_flags_to_bo_flags(struct anv_device *device,
 }
 
 static uint32_t
-anv_device_get_bo_align(struct anv_device *device)
+anv_device_get_bo_align(struct anv_device *device,
+                        enum anv_bo_alloc_flags alloc_flags)
 {
-   /* Gen12 CCS surface addresses need to be 64K aligned. We have no way of
-    * telling what this allocation is for so pick the largest alignment.
-    */
-   if (device->info.gen >= 12)
+   /* Gen12 CCS surface addresses need to be 64K aligned. */
+   if (device->info.gen >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS))
       return 64 * 1024;
 
    return 4096;
@@ -1540,6 +1540,9 @@ anv_device_alloc_bo(struct anv_device *device,
                     uint64_t explicit_address,
                     struct anv_bo **bo_out)
 {
+   if (!device->physical->has_implicit_ccs)
+      assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+
    const uint32_t bo_flags =
       anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
    assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
@@ -1547,9 +1550,20 @@ anv_device_alloc_bo(struct anv_device *device,
    /* The kernel is going to give us whole pages anyway */
    size = align_u64(size, 4096);
 
-   const uint32_t align = anv_device_get_bo_align(device);
+   const uint32_t align = anv_device_get_bo_align(device, alloc_flags);
 
-   uint32_t gem_handle = anv_gem_create(device, size);
+   uint64_t ccs_size = 0;
+   if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) {
+      /* Align the size up to the next multiple of 64K so we don't have any
+       * AUX-TT entries pointing from a 64K page to itself.
+       */
+      size = align_u64(size, 64 * 1024);
+
+      /* See anv_bo::_ccs_size */
+      ccs_size = align_u64(DIV_ROUND_UP(size, GEN_AUX_MAP_GEN12_CCS_SCALE), 4096);
+   }
+
+   uint32_t gem_handle = anv_gem_create(device, size + ccs_size);
    if (gem_handle == 0)
       return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
@@ -1558,10 +1572,12 @@ anv_device_alloc_bo(struct anv_device *device,
       .refcount = 1,
       .offset = -1,
       .size = size,
+      ._ccs_size = ccs_size,
       .flags = bo_flags,
       .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL),
       .has_client_visible_address =
          (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
+      .has_implicit_ccs = ccs_size > 0,
    };
 
    if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
@@ -1596,8 +1612,8 @@ anv_device_alloc_bo(struct anv_device *device,
       new_bo.has_fixed_address = true;
       new_bo.offset = explicit_address;
    } else if (new_bo.flags & EXEC_OBJECT_PINNED) {
-      new_bo.offset = anv_vma_alloc(device, new_bo.size, align,
-                                    alloc_flags, explicit_address);
+      new_bo.offset = anv_vma_alloc(device, new_bo.size + new_bo._ccs_size,
+                                    align, alloc_flags, explicit_address);
       if (new_bo.offset == 0) {
          if (new_bo.map)
            anv_gem_munmap(new_bo.map, size);
@@ -1609,6 +1625,14 @@ anv_device_alloc_bo(struct anv_device *device,
       assert(!new_bo.has_client_visible_address);
    }
 
+   if (new_bo._ccs_size > 0) {
+      assert(device->info.has_aux_map);
+      gen_aux_map_add_mapping(device->aux_map_ctx,
+                              gen_canonical_address(new_bo.offset),
+                              gen_canonical_address(new_bo.offset + new_bo.size),
+                              new_bo.size, 0 /* format_bits */);
+   }
+
    assert(new_bo.gem_handle);
 
    /* If we just got this gem_handle from anv_bo_init_new then we know no one
@@ -1633,6 +1657,10 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
                             ANV_BO_ALLOC_SNOOPED |
                             ANV_BO_ALLOC_FIXED_ADDRESS)));
 
+   /* We can't do implicit CCS with an aux table on shared memory */
+   if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
+      assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+
    struct anv_bo_cache *cache = &device->bo_cache;
    const uint32_t bo_flags =
       anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
@@ -1689,15 +1717,10 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
    assert(client_address == gen_48b_address(client_address));
 
    if (new_bo.flags & EXEC_OBJECT_PINNED) {
-      /* Gen12 CCS surface addresses need to be 64K aligned. We have no way
-       * of telling what this allocation is for so pick the largest
-       * alignment.
-       */
-      const uint32_t align = device->info.gen >= 12 ? (64 * 1024) :
-                                                      (4 * 1024);
-
+      assert(new_bo._ccs_size == 0);
       new_bo.offset = anv_vma_alloc(device, new_bo.size,
-                                    anv_device_get_bo_align(device),
+                                    anv_device_get_bo_align(device,
+                                                            alloc_flags),
                                     alloc_flags, client_address);
       if (new_bo.offset == 0) {
          anv_gem_close(device, new_bo.gem_handle);
@@ -1729,6 +1752,10 @@ anv_device_import_bo(struct anv_device *device,
                             ANV_BO_ALLOC_SNOOPED |
                             ANV_BO_ALLOC_FIXED_ADDRESS)));
 
+   /* We can't do implicit CCS with an aux table on shared memory */
+   if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
+      assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+
    struct anv_bo_cache *cache = &device->bo_cache;
    const uint32_t bo_flags =
       anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
@@ -1822,8 +1849,10 @@ anv_device_import_bo(struct anv_device *device,
    assert(client_address == gen_48b_address(client_address));
 
    if (new_bo.flags & EXEC_OBJECT_PINNED) {
+      assert(new_bo._ccs_size == 0);
       new_bo.offset = anv_vma_alloc(device, new_bo.size,
-                                    anv_device_get_bo_align(device),
+                                    anv_device_get_bo_align(device,
+                                                            alloc_flags),
                                     alloc_flags, client_address);
       if (new_bo.offset == 0) {
          anv_gem_close(device, new_bo.gem_handle);
@@ -1914,8 +1943,17 @@ anv_device_release_bo(struct anv_device *device,
    if (bo->map && !bo->from_host_ptr)
       anv_gem_munmap(bo->map, bo->size);
 
+   if (bo->_ccs_size > 0) {
+      assert(device->physical->has_implicit_ccs);
+      assert(device->info.has_aux_map);
+      assert(bo->has_implicit_ccs);
+      gen_aux_map_unmap_range(device->aux_map_ctx,
+                              gen_canonical_address(bo->offset),
+                              bo->size);
+   }
+
    if ((bo->flags & EXEC_OBJECT_PINNED) && !bo->has_fixed_address)
-      anv_vma_free(device, bo->offset, bo->size);
+      anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size);
 
    uint32_t gem_handle = bo->gem_handle;
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index a0d9e9616db..b6f941d669f 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -470,6 +470,8 @@ anv_physical_device_try_create(struct anv_instance *instance,
     */
    device->has_bindless_samplers = device->info.gen >= 8;
 
+   device->has_implicit_ccs = device->info.has_aux_map;
+
    device->has_mem_available = get_available_system_memory() != 0;
 
    device->always_flush_cache =
@@ -3357,9 +3359,27 @@ VkResult anv_AllocateMemory(
       }
    }
 
+   /* By default, we want all VkDeviceMemory objects to support CCS */
+   if (device->physical->has_implicit_ccs)
+      alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS;
+
    if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)
       alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
 
+   if ((export_info && export_info->handleTypes) ||
+       (fd_info && fd_info->handleType) ||
+       (host_ptr_info && host_ptr_info->handleType)) {
+      /* Anything imported or exported is EXTERNAL */
+      alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
+
+      /* We can't have implicit CCS on external memory with an AUX-table.
+       * Doing so would require us to sync the aux tables across processes
+       * which is impractical.
+       */
+      if (device->info.has_aux_map)
+         alloc_flags &= ~ANV_BO_ALLOC_IMPLICIT_CCS;
+   }
+
    /* Check if we need to support Android HW buffer export. If so,
    * create AHardwareBuffer and import memory from it.
    */
@@ -3460,9 +3480,6 @@ VkResult anv_AllocateMemory(
 
    /* Regular allocate (not importing memory). */
-   if (export_info && export_info->handleTypes)
-      alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
-
    result = anv_device_alloc_bo(device, pAllocateInfo->allocationSize,
                                 alloc_flags, client_address, &mem->bo);
    if (result != VK_SUCCESS)
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
index 4be8f5aee6c..28db8951534 100644
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -34,8 +34,6 @@
 #include "vk_util.h"
 #include "util/u_math.h"
 
-#include "common/gen_aux_map.h"
-
 #include "vk_format_info.h"
 
 static isl_surf_usage_flags_t
@@ -503,7 +501,9 @@ make_surface(struct anv_device *dev,
             image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D;
          }
 
-         add_surface(image, &image->planes[plane].aux_surface, plane);
+         if (!dev->physical->has_implicit_ccs)
+            add_surface(image, &image->planes[plane].aux_surface, plane);
+
          add_aux_state_tracking_buffer(image, plane, dev);
       }
    }
@@ -805,12 +805,6 @@ anv_DestroyImage(VkDevice _device, VkImage _image,
       return;
 
    for (uint32_t p = 0; p < image->n_planes; ++p) {
-      if (anv_image_plane_uses_aux_map(device, image, p) &&
-          image->planes[p].address.bo) {
-         gen_aux_map_unmap_range(device->aux_map_ctx,
-                                 image->planes[p].aux_map_surface_address,
-                                 image->planes[p].surface.isl.size_B);
-      }
       if (image->planes[p].bo_is_owned) {
          assert(image->planes[p].address.bo != NULL);
         anv_device_release_bo(device, image->planes[p].address.bo);
@@ -829,12 +823,6 @@ static void anv_image_bind_memory_plane(struct anv_device *device,
    assert(!image->planes[plane].bo_is_owned);
 
    if (!memory) {
-      if (anv_image_plane_uses_aux_map(device, image, plane) &&
-          image->planes[plane].address.bo) {
-         gen_aux_map_unmap_range(device->aux_map_ctx,
-                                 image->planes[plane].aux_map_surface_address,
-                                 image->planes[plane].surface.isl.size_B);
-      }
      image->planes[plane].address = ANV_NULL_ADDRESS;
      return;
    }
@@ -844,19 +832,11 @@ static void anv_image_bind_memory_plane(struct anv_device *device,
       .offset = memory_offset,
    };
 
-   if (anv_image_plane_uses_aux_map(device, image, plane)) {
-      image->planes[plane].aux_map_surface_address =
-         anv_address_physical(
-            anv_address_add(image->planes[plane].address,
-                            image->planes[plane].surface.offset));
-
-      gen_aux_map_add_image(device->aux_map_ctx,
-                            &image->planes[plane].surface.isl,
-                            image->planes[plane].aux_map_surface_address,
-                            anv_address_physical(
-                               anv_address_add(image->planes[plane].address,
-                                               image->planes[plane].aux_surface.offset)));
-   }
+   /* If we're on a platform that uses implicit CCS and our buffer does not
+    * have any implicit CCS data, disable compression on that image.
+    */
+   if (device->physical->has_implicit_ccs && !memory->bo->has_implicit_ccs)
+      image->planes[plane].aux_usage = ISL_AUX_USAGE_NONE;
 }
 
 /* We are binding AHardwareBuffer. Get a description, resolve the
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 8f63882bdac..d7edcc89927 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -645,6 +645,7 @@ struct anv_bo {
     */
    uint64_t offset;
 
+   /** Size of the buffer not including implicit aux */
    uint64_t size;
 
    /* Map for internally mapped BOs.
@@ -653,6 +654,30 @@ struct anv_bo {
    */
    void *map;
 
+   /** Size of the implicit CCS range at the end of the buffer
+    *
+    * On Gen12, CCS data is always a direct 1/256 scale-down. A single 64K
+    * page of main surface data maps to a 256B chunk of CCS data and that
+    * mapping is provided on TGL-LP by the AUX table which maps virtual memory
+    * addresses in the main surface to virtual memory addresses for CCS data.
+    *
+    * Because we can't change these maps around easily and because Vulkan
+    * allows two VkImages to be bound to overlapping memory regions (as long
+    * as the app is careful), it's not feasible to make this mapping part of
+    * the image. (On Gen11 and earlier, the mapping was provided via
+    * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.)
+    * Instead, we attach the CCS data directly to the buffer object and set
+    * up the AUX table mapping at BO creation time.
+    *
+    * This field is for internal tracking use by the BO allocator only and
+    * should not be touched by other parts of the code. If something wants to
+    * know if a BO has implicit CCS data, it should instead look at the
+    * has_implicit_ccs boolean below.
+    *
+    * This data is not included in maps of this buffer.
+    */
+   uint32_t _ccs_size;
+
    /** Flags to pass to the kernel through drm_i915_exec_object2::flags */
    uint32_t flags;
 
@@ -676,6 +701,9 @@ struct anv_bo {
 
    /** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */
    bool has_client_visible_address:1;
+
+   /** True if this BO has implicit CCS data attached to it */
+   bool has_implicit_ccs:1;
 };
 
 static inline struct anv_bo *
@@ -1019,6 +1047,13 @@ struct anv_physical_device {
    /** True if we can use bindless access for samplers */
    bool has_bindless_samplers;
 
+   /** True if this device has implicit AUX
+    *
+    * If true, CCS is handled as an implicit attachment to the BO rather than
+    * as an explicitly bound surface.
+    */
+   bool has_implicit_ccs;
+
    bool always_flush_cache;
 
    struct anv_device_extension_table supported_extensions;
@@ -1380,6 +1415,9 @@ enum anv_bo_alloc_flags {
 
    /** Has an address which is visible to the client */
    ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8),
+
+   /** This buffer has implicit CCS data attached to it */
+   ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9),
 };
 
 VkResult anv_device_alloc_bo(struct anv_device *device, uint64_t size,
@@ -3473,13 +3511,6 @@ struct anv_image {
        */
      struct anv_address address;
 
-      /**
-       * Address of the main surface used to fill the aux map table. This is
-       * used at destruction of the image since the Vulkan spec does not
-       * guarantee that the address.bo field we still be valid at destruction.
-       */
-      uint64_t aux_map_surface_address;
-
      /**
       * When destroying the image, also free the bo.
      * */
    }
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index e2f846a0ab0..1a706b8c065 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -259,10 +259,6 @@ genX(blorp_exec)(struct blorp_batch *batch,
 
    genX(flush_pipeline_select_3d)(cmd_buffer);
 
-#if GEN_GEN >= 12
-   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
-#endif
-
    genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
 
    /* BLORP doesn't do anything fancy with depth such as discards, so we want
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index ff213dc3b8a..ddc5494e79f 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -990,6 +990,102 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
    }
 }
 
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+#if GEN_GEN == 12
+static void
+anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
+                      const struct anv_image *image,
+                      VkImageAspectFlagBits aspect,
+                      uint32_t base_level, uint32_t level_count,
+                      uint32_t base_layer, uint32_t layer_count)
+{
+   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
+   assert(isl_aux_usage_has_ccs(image->planes[plane].aux_usage));
+
+   uint64_t base_address =
+      anv_address_physical(image->planes[plane].address);
+
+   const struct isl_surf *isl_surf = &image->planes[plane].surface.isl;
+   uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf);
+
+   /* We're about to live-update the AUX-TT. We really don't want anyone else
+    * trying to read it while we're doing this. We could probably get away
+    * with not having this stall in some cases if we were really careful but
+    * it's better to play it safe. Full stall the GPU.
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t a = 0; a < layer_count; a++) {
+      const uint32_t layer = base_layer + a;
+
+      uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
+      for (uint32_t l = 0; l < level_count; l++) {
+         const uint32_t level = base_level + l;
+
+         uint32_t logical_array_layer, logical_z_offset_px;
+         if (image->type == VK_IMAGE_TYPE_3D) {
+            logical_array_layer = 0;
+
+            /* If the given miplevel does not have this layer, then any higher
+             * miplevels won't either because miplevels only get smaller the
+             * higher the LOD.
+             */
+            assert(layer < image->extent.depth);
+            if (layer >= anv_minify(image->extent.depth, level))
+               break;
+            logical_z_offset_px = layer;
+         } else {
+            assert(layer < image->array_size);
+            logical_array_layer = layer;
+            logical_z_offset_px = 0;
+         }
+
+         uint32_t slice_start_offset_B, slice_end_offset_B;
+         isl_surf_get_image_range_B_tile(isl_surf, level,
+                                         logical_array_layer,
+                                         logical_z_offset_px,
+                                         &slice_start_offset_B,
+                                         &slice_end_offset_B);
+
+         start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
+         end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
+      }
+
+      /* Aux operates 64K at a time */
+      start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
+      end_offset_B = align_u64(end_offset_B, 64 * 1024);
+
+      for (uint64_t offset = start_offset_B;
+           offset < end_offset_B; offset += 64 * 1024) {
+         uint64_t address = base_address + offset;
+
+         uint64_t aux_entry_address, *aux_entry_map;
+         aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
+                                               address, &aux_entry_address);
+
+         const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
+         uint64_t new_aux_entry =
+            (old_aux_entry & ~GEN_AUX_MAP_FORMAT_BITS_MASK) | format_bits;
+
+         /* We're only going to update the top 32 bits */
+         assert((uint32_t)old_aux_entry == (uint32_t)new_aux_entry);
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
+            sdi.Address = (struct anv_address) {
+               .bo = NULL,
+               .offset = aux_entry_address + 4,
+            };
+            sdi.ImmediateData = new_aux_entry >> 32;
+         }
+      }
+   }
+
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
+}
+#endif /* GEN_GEN == 12 */
+
 /**
  * @brief Transitions a color buffer from one layout to another.
  *
@@ -1010,7 +1106,8 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                         VkImageLayout initial_layout,
                         VkImageLayout final_layout)
 {
-   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
+   struct anv_device *device = cmd_buffer->device;
+   const struct gen_device_info *devinfo = &device->info;
    /* Validate the inputs. */
    assert(cmd_buffer);
    assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
@@ -1059,6 +1156,17 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
 
    if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
+#if GEN_GEN == 12
+      if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage) &&
+          device->physical->has_implicit_ccs && devinfo->has_aux_map) {
+         anv_image_init_aux_tt(cmd_buffer, image, aspect,
+                               base_level, level_count,
+                               base_layer, layer_count);
+      }
+#else
+      assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
+#endif
+
      /* A subresource in the undefined layout may have been aliased and
       * populated with any arrangement of bits. Therefore, we must initialize
       * the related aux buffer and clear buffer entry with desirable values.
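
The 64K-granule loop above relies on one invariant, made explicit by the assert: gen_aux_map_format_bits_for_isl_surf only produces bits in the upper dword of an AUX-TT leaf entry, while the lower dword (the CCS address and valid bit) was already written by gen_aux_map_add_mapping when the BO was allocated. That is what allows each entry to be rewritten with a single 32-bit MI_STORE_DATA_IMM to aux_entry_address + 4. A minimal standalone sketch of that update, with the field split stated as an assumption rather than taken from gen_aux_map.h:

#include <assert.h>
#include <stdint.h>

/* Assumed for illustration: all format bits live in the upper 32 bits of a
 * leaf entry, which is what the use of GEN_AUX_MAP_FORMAT_BITS_MASK above
 * implies.
 */
#define FORMAT_BITS_MASK 0xffffffff00000000ull

static uint32_t
aux_entry_upper_dword(uint64_t old_entry, uint64_t format_bits)
{
   uint64_t new_entry = (old_entry & ~FORMAT_BITS_MASK) |
                        (format_bits & FORMAT_BITS_MASK);

   /* The lower dword (CCS address, valid bit) must come out unchanged;
    * otherwise a single 32-bit store would corrupt the entry.
    */
   assert((uint32_t)old_entry == (uint32_t)new_entry);

   return (uint32_t)(new_entry >> 32);
}
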
@@ -2881,10 +2989,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 
    genX(flush_pipeline_select_3d)(cmd_buffer);
 
-#if GEN_GEN >= 12
-   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
-#endif
-
    if (vb_emit) {
       const uint32_t num_buffers = __builtin_popcount(vb_emit);
       const uint32_t num_dwords = 1 + num_buffers * 4;
@@ -3774,10 +3878,6 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
 
    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
 
-#if GEN_GEN >= 12
-   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
-#endif
-
    if (cmd_buffer->state.compute.pipeline_dirty) {
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
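
To make the anv_device_alloc_bo sizing concrete: GEN_AUX_MAP_GEN12_CCS_SCALE is 256 (the "direct 1/256 scale-down" described in the anv_bo::_ccs_size comment), so a 1,000,000-byte request becomes a 1 MiB, 64K-aligned main surface plus a 4 KiB page-aligned CCS tail, all in a single GEM object. A standalone sketch of the arithmetic, with local helpers standing in for mesa's align_u64 and DIV_ROUND_UP:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define GEN12_CCS_SCALE 256 /* 1 CCS byte per 256 main-surface bytes */

/* Stand-ins for the util macros used in the patch. */
static uint64_t align_u64(uint64_t v, uint64_t a) { return (v + a - 1) & ~(a - 1); }
static uint64_t div_round_up(uint64_t n, uint64_t d) { return (n + d - 1) / d; }

int main(void)
{
   uint64_t size = 1000000; /* requested allocation size in bytes */

   size = align_u64(size, 4096);      /* the kernel hands out whole pages */
   size = align_u64(size, 64 * 1024); /* keep main data and CCS in separate
                                       * 64K pages (one AUX-TT entry each) */

   uint64_t ccs_size = align_u64(div_round_up(size, GEN12_CCS_SCALE), 4096);

   /* Single GEM BO layout: [0, size) main surface,
    * [size, size + ccs_size) implicit CCS; the VMA covers both.
    */
   printf("main %" PRIu64 " B + ccs %" PRIu64 " B = %" PRIu64 " B total\n",
          size, ccs_size, size + ccs_size); /* 1048576 + 4096 = 1052672 */
   return 0;
}
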
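Those pieces pair up across a BO's lifetime. The following outline condenses the anv_device_alloc_bo and anv_device_release_bo hunks; it assumes the tree's anv_private.h and gen_aux_map.h declarations, elides error handling, and the two helper names are invented for illustration, so it is a sketch rather than drop-in driver code:

/* Creation: one GEM object, one VMA covering main + CCS, one aux-map range. */
static uint64_t
implicit_ccs_bo_create(struct anv_device *device, uint64_t size,
                       uint64_t ccs_size, enum anv_bo_alloc_flags alloc_flags,
                       uint32_t *gem_handle_out)
{
   *gem_handle_out = anv_gem_create(device, size + ccs_size);
   uint64_t offset = anv_vma_alloc(device, size + ccs_size, 64 * 1024,
                                   alloc_flags, 0 /* no client address */);
   gen_aux_map_add_mapping(device->aux_map_ctx,
                           gen_canonical_address(offset),        /* main */
                           gen_canonical_address(offset + size), /* CCS */
                           size, 0 /* format_bits */);
   return offset;
}

/* Release: strictly the reverse, before the VMA can be handed out again. */
static void
implicit_ccs_bo_destroy(struct anv_device *device, uint64_t offset,
                        uint64_t size, uint64_t ccs_size, uint32_t gem_handle)
{
   gen_aux_map_unmap_range(device->aux_map_ctx,
                           gen_canonical_address(offset), size);
   anv_vma_free(device, offset, size + ccs_size);
   anv_gem_close(device, gem_handle);
}
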
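Finally, whether a VkDeviceMemory gets implicit CCS at all reduces to a small policy decision. This hypothetical helper condenses the flag logic from the anv_AllocateMemory hunk, again assuming the tree's headers:

/* Hypothetical condensation of the alloc-flag policy in anv_AllocateMemory. */
static enum anv_bo_alloc_flags
memory_alloc_flags(const struct anv_physical_device *pdevice,
                   bool is_external, bool wants_client_address)
{
   enum anv_bo_alloc_flags alloc_flags = 0;

   /* By default, every VkDeviceMemory supports CCS... */
   if (pdevice->has_implicit_ccs)
      alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS;

   if (wants_client_address)
      alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;

   if (is_external) {
      /* ...but anything imported or exported is EXTERNAL... */
      alloc_flags |= ANV_BO_ALLOC_EXTERNAL;

      /* ...and with an AUX table it can't also have implicit CCS, since the
       * table can't practically be synced across processes.
       */
      if (pdevice->info.has_aux_map)
         alloc_flags &= ~ANV_BO_ALLOC_IMPLICIT_CCS;
   }

   return alloc_flags;
}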