anv: re-introduce BO CCS allocations

On Gfx12.0, CCS data has to be allocated per image because the image
format goes into the AUX-TT PTEs. The effect on memory allocations is
limited since the main-surface granularity in the AUX-TT PTE is 64KB.

On Gfx12.5, the granularity of the AUX-TT PTE is 1MB. This creates a
lot of waste in the application's memory allocations. Fortunately, the
HW no longer cares about the format put into the PTEs, so it becomes
possible to have two images share the same PTE.
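For a rough sense of the overhead (an illustrative sketch, not code from
this change): with per-image CCS mappings, every compressed image has to
start on its own 1MB AUX-TT granule, so a small image strands most of its
granule.

/* Illustrative only: waste from per-image AUX-TT mappings at 1MB
 * granularity. A hypothetical 256KB image consumes a whole 1MB-aligned
 * granule when it cannot share its PTE with a neighbor.
 */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
   const uint64_t granule    = 1ull << 20;  /* 1MB AUX-TT PTE granularity */
   const uint64_t image_size = 256 * 1024;  /* hypothetical small image */

   const uint64_t slot  = (image_size + granule - 1) / granule * granule;
   const uint64_t waste = slot - image_size;

   /* Prints: slot 1048576 B, waste 786432 B (75% of the granule) */
   printf("slot %" PRIu64 " B, waste %" PRIu64 " B\n", slot, waste);
   return 0;
}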

To implement this, we bring back an earlier version of the AUX-TT
mappings, where we allocated additional CCS space at the end of the
VkDeviceMemory objects. On Gfx12.5, if the BO has this additional CCS
space, we now map the main surface to that space.
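As a sketch of the resulting BO layout (assumptions: the 256:1
main-to-aux ratio matches what intel_aux_get_main_to_aux_ratio() reports
on these platforms, and align64() stands in for the helper used in the
diff below):

/* Sketch of the padded BO layout, not the driver code itself.
 *
 *   0 .............. ccs_offset ............ bo_size
 *   [ main surface data  ][ CCS for main region ]
 */
#include <stdint.h>

#define AUX_RATIO 256u  /* assumed main-to-aux ratio */

static uint64_t align64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

/* Padded BO size; *ccs_offset receives the start of the trailing CCS
 * region, mirroring the anv_device_alloc_bo hunk below.
 */
static uint64_t bo_size_with_ccs(uint64_t size, uint64_t *ccs_offset)
{
   size = align64(size, 4096);
   *ccs_offset = size;
   size += (size + AUX_RATIO - 1) / AUX_RATIO;  /* DIV_ROUND_UP(size, 256) */
   return align64(size, 4096);
}

/* An image placed at main_offset (hypothetical) finds its CCS at a
 * proportional offset inside the trailing region.
 */
static uint64_t image_ccs_offset(uint64_t ccs_offset, uint64_t main_offset)
{
   return ccs_offset + main_offset / AUX_RATIO;
}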

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Jianxun Zhang <jianxun.zhang@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26822>
Lionel Landwerlin 2023-12-27 11:16:50 +02:00 committed by Marge Bot
parent bd197c6bcf
commit 646a7c864d
5 changed files with 123 additions and 16 deletions

File 1 of 5

@@ -1491,6 +1491,13 @@ anv_device_alloc_bo(struct anv_device *device,
    /* The kernel is going to give us whole pages anyway. */
    size = align64(size, 4096);
 
+   const uint64_t ccs_offset = size;
+   if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
+      assert(device->info->has_aux_map);
+      size += DIV_ROUND_UP(size, intel_aux_get_main_to_aux_ratio(device->aux_map_ctx));
+      size = align64(size, 4096);
+   }
+
    const struct intel_memory_class_instance *regions[2];
    uint32_t nregions = 0;
@@ -1532,6 +1539,7 @@ anv_device_alloc_bo(struct anv_device *device,
       .refcount = 1,
       .offset = -1,
       .size = size,
+      .ccs_offset = ccs_offset,
       .actual_size = actual_size,
       .flags = bo_flags,
       .alloc_flags = alloc_flags,
@@ -1614,6 +1622,7 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
                            ANV_BO_ALLOC_HOST_CACHED |
                            ANV_BO_ALLOC_HOST_COHERENT |
+                           ANV_BO_ALLOC_AUX_CCS |
                            ANV_BO_ALLOC_PROTECTED |
                            ANV_BO_ALLOC_FIXED_ADDRESS)));
    assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);

File 2 of 5

@@ -2298,6 +2298,8 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
       !device->uses_ex_bso ||
       driQueryOptionb(&instance->dri_options, "force_indirect_descriptors");
 
+   device->alloc_aux_tt_mem =
+      device->info.has_aux_map && device->info.verx10 >= 125;
    /* Check if we can read the GPU timestamp register from the CPU */
    uint64_t u64_ignore;
    device->has_reg_timestamp = intel_gem_read_render_timestamp(fd,
@@ -4102,6 +4104,18 @@ VkResult anv_AllocateMemory(
    if (device->info->has_aux_map)
       alloc_flags |= ANV_BO_ALLOC_AUX_TT_ALIGNED;
 
+   /* If the allocation is not dedicated, allocate additional CCS space.
+    *
+    * TODO: If we ever ship VK_EXT_descriptor_buffer (ahahah... :() we could
+    * drop this flag in the descriptor buffer case as we don't need any
+    * compression there.
+    *
+    * TODO: We could also create new memory types for allocations that don't
+    * need any compression.
+    */
+   if (device->physical->alloc_aux_tt_mem && dedicated_info == NULL)
+      alloc_flags |= ANV_BO_ALLOC_AUX_CCS;
+
    /* TODO: Android, ChromeOS and other applications may need another way to
     * allocate buffers that can be scanout to display but it should pretty
     * easy to catch those as Xe KMD driver will print warnings in dmesg when

File 3 of 5

@@ -1873,7 +1873,8 @@ void anv_GetPhysicalDeviceSparseImageFormatProperties2(
       VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT;
 
    isl_surf_usage_flags_t isl_usage =
-      anv_image_choose_isl_surf_usage(vk_create_flags, pFormatInfo->usage,
+      anv_image_choose_isl_surf_usage(physical_device,
+                                      vk_create_flags, pFormatInfo->usage,
                                       0, aspect);
 
    const enum isl_surf_dim isl_surf_dim =

File 4 of 5

@@ -203,13 +203,23 @@ memory_range_merge(struct anv_image_memory_range *a,
 }
 
 isl_surf_usage_flags_t
-anv_image_choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
+anv_image_choose_isl_surf_usage(struct anv_physical_device *device,
+                                VkImageCreateFlags vk_create_flags,
                                 VkImageUsageFlags vk_usage,
                                 isl_surf_usage_flags_t isl_extra_usage,
                                 VkImageAspectFlagBits aspect)
 {
    isl_surf_usage_flags_t isl_usage = isl_extra_usage;
 
+   /* On platforms like MTL, we choose to allocate additional CCS memory at
+    * the back of the VkDeviceMemory objects since different images can
+    * share the AUX-TT PTE because the HW doesn't care about the image
+    * format in the PTE. That means we can always ignore the AUX-TT
+    * alignment requirement from an ISL point of view.
+    */
+   if (device->alloc_aux_tt_mem)
+      isl_usage |= ISL_SURF_USAGE_NO_AUX_TT_ALIGNMENT_BIT;
+
    if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT)
       isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
@@ -1312,7 +1322,8 @@ add_all_surfaces_implicit_layout(
 
       VkImageUsageFlags vk_usage = vk_image_usage(&image->vk, aspect);
       isl_surf_usage_flags_t isl_usage =
-         anv_image_choose_isl_surf_usage(image->vk.create_flags, vk_usage,
+         anv_image_choose_isl_surf_usage(device->physical,
+                                         image->vk.create_flags, vk_usage,
                                          isl_extra_usage_flags, aspect);
 
       result = add_primary_surface(device, image, plane, plane_format,
@@ -1710,8 +1721,8 @@ anv_image_init(struct anv_device *device, struct anv_image *image,
          devinfo, image->emu_plane_format, 0, image->vk.tiling);
 
       isl_surf_usage_flags_t isl_usage = anv_image_choose_isl_surf_usage(
-         image->vk.create_flags, image->vk.usage, isl_extra_usage_flags,
-         VK_IMAGE_ASPECT_COLOR_BIT);
+         device->physical, image->vk.create_flags, image->vk.usage,
+         isl_extra_usage_flags, VK_IMAGE_ASPECT_COLOR_BIT);
 
       r = add_primary_surface(device, image, plane, plane_format,
                               ANV_OFFSET_IMPLICIT, 0,
@@ -2261,23 +2272,65 @@ anv_image_map_aux_tt(struct anv_device *device,
    struct anv_bo *bo = main_addr.bo;
    assert(bo != NULL);
 
-   if (anv_address_allows_aux_map(device, main_addr)) {
-      const struct anv_address aux_addr =
-         anv_image_address(image,
-                           &image->planes[plane].compr_ctrl_memory_range);
-      const struct isl_surf *surf =
-         &image->planes[plane].primary_surface.isl;
-      const uint64_t format_bits =
-         intel_aux_map_format_bits_for_isl_surf(surf);
-      if (intel_aux_map_add_mapping(device->aux_map_ctx,
-                                    anv_address_physical(main_addr),
-                                    anv_address_physical(aux_addr),
-                                    surf->size_B, format_bits)) {
-         image->planes[plane].aux_tt.mapped = true;
-         image->planes[plane].aux_tt.addr = anv_address_physical(main_addr);
-         image->planes[plane].aux_tt.size = surf->size_B;
-         return true;
-      }
-   }
+   /* If the additional memory padding was added at the end of the BO for CCS
+    * data, map this region at the granularity of the main/CCS pages.
+    *
+    * Otherwise the image should have additional CCS data at the computed
+    * offset.
+    */
+   if (device->physical->alloc_aux_tt_mem &&
+       (bo->alloc_flags & ANV_BO_ALLOC_AUX_CCS)) {
+      uint64_t main_aux_alignment =
+         intel_aux_map_get_alignment(device->aux_map_ctx);
+      assert(bo->offset % main_aux_alignment == 0);
+      const struct anv_address start_addr = (struct anv_address) {
+         .bo = bo,
+         .offset = ROUND_DOWN_TO(main_addr.offset, main_aux_alignment),
+      };
+      const struct anv_address aux_addr = (struct anv_address) {
+         .bo = bo,
+         .offset = bo->ccs_offset +
+                   intel_aux_main_to_aux_offset(device->aux_map_ctx,
+                                                start_addr.offset),
+      };
+      const struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+      const uint64_t format_bits =
+         intel_aux_map_format_bits_for_isl_surf(surf);
+      /* Make sure to have the mapping cover the entire image from the aux
+       * aligned start.
+       */
+      const uint64_t main_size = align(
+         (main_addr.offset - start_addr.offset) + surf->size_B,
+         main_aux_alignment);
+      if (intel_aux_map_add_mapping(device->aux_map_ctx,
+                                    anv_address_physical(start_addr),
+                                    anv_address_physical(aux_addr),
+                                    main_size, format_bits)) {
+         image->planes[plane].aux_tt.mapped = true;
+         image->planes[plane].aux_tt.addr = anv_address_physical(start_addr);
+         image->planes[plane].aux_tt.size = main_size;
+         return true;
+      }
+   } else {
+      if (anv_address_allows_aux_map(device, main_addr)) {
+         const struct anv_address aux_addr =
+            anv_image_address(image,
+                              &image->planes[plane].compr_ctrl_memory_range);
+         const struct isl_surf *surf =
+            &image->planes[plane].primary_surface.isl;
+         const uint64_t format_bits =
+            intel_aux_map_format_bits_for_isl_surf(surf);
+         if (intel_aux_map_add_mapping(device->aux_map_ctx,
+                                       anv_address_physical(main_addr),
+                                       anv_address_physical(aux_addr),
+                                       surf->size_B, format_bits)) {
+            image->planes[plane].aux_tt.mapped = true;
+            image->planes[plane].aux_tt.addr = anv_address_physical(main_addr);
+            image->planes[plane].aux_tt.size = surf->size_B;
+            return true;
+         }
+      }
+   }
 
    return false;
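
Worked example of the rounding in the Gfx12.5 path above (hypothetical
offsets; 1MB stands in for the value intel_aux_map_get_alignment() would
return):

/* An image at offset 0x250000 inside the BO, with 0x180000 bytes of
 * surface data, mapped at 1MB main/aux granularity.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
   const uint64_t granule     = 1ull << 20;  /* main/aux granularity */
   const uint64_t main_offset = 0x250000;    /* image start in the BO */
   const uint64_t surf_size_B = 0x180000;    /* surface size */

   /* ROUND_DOWN_TO(main_offset, granule) */
   const uint64_t start = main_offset & ~(granule - 1);      /* 0x200000 */
   /* align((main_offset - start) + surf_size_B, granule) */
   const uint64_t size = ((main_offset - start) + surf_size_B + granule - 1)
                         & ~(granule - 1);                   /* 0x200000 */

   /* The AUX-TT mapping spans [0x200000, 0x400000), covering the image
    * plus anything else sharing those two 1MB granules.
    */
   assert(start == 0x200000 && size == 0x200000);
   return 0;
}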

File 5 of 5

@@ -447,6 +447,13 @@ enum anv_bo_alloc_flags {
 
    /** Specify whether this BO is internal to the driver */
    ANV_BO_ALLOC_INTERNAL = (1 << 19),
+
+   /** Allocate with CCS AUX requirements
+    *
+    * This pads the BO to include CCS data mappable through the AUX-TT and
+    * aligned to the AUX-TT requirements.
+    */
+   ANV_BO_ALLOC_AUX_CCS = (1 << 20),
 };
 
 /** Specifies that the BO should be cached and coherent. */
@@ -486,6 +493,9 @@ struct anv_bo {
    /** Size of the buffer */
    uint64_t size;
 
+   /** Offset at which the CCS data is stored */
+   uint64_t ccs_offset;
+
    /* Map for internally mapped BOs.
     *
     * If ANV_BO_ALLOC_MAPPED is set in flags, this is the map for the whole
@@ -1018,6 +1028,25 @@ struct anv_physical_device {
    bool uses_ex_bso;
    bool always_flush_cache;
 
+   /** True if application memory is allocated with extra AUX memory
+    *
+    * Applications quite often pool image allocations together in a single
+    * VkDeviceMemory object. On platforms like MTL, the alignment of images
+    * with compression mapped through the AUX translation tables is large:
+    * 1MB. This can create a lot of wasted space in the application memory
+    * objects.
+    *
+    * To work around this problem, we allocate CCS data at the end of
+    * VkDeviceMemory objects. This would not work well for TGL-like
+    * platforms because the AUX translation tables also contain the format
+    * of the images, but on MTL the HW ignores those values. So we can share
+    * the AUX-TT entries between different images without problems.
+    *
+    * This should only be true for platforms with AUX-TT.
+    */
+   bool alloc_aux_tt_mem;
+
    /**
     * True if the descriptors buffers are holding one of the following :
     * - anv_sampled_image_descriptor
@@ -5232,7 +5261,8 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
                  bool predicate);
 
 isl_surf_usage_flags_t
-anv_image_choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
+anv_image_choose_isl_surf_usage(struct anv_physical_device *device,
+                                VkImageCreateFlags vk_create_flags,
                                 VkImageUsageFlags vk_usage,
                                 isl_surf_usage_flags_t isl_extra_usage,
                                 VkImageAspectFlagBits aspect);