diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c index 32907f31dbc..16a97a3c2d5 100644 --- a/src/amd/common/ac_surface.c +++ b/src/amd/common/ac_surface.c @@ -4713,6 +4713,16 @@ gfx10_surface_copy_mem_surface(struct ac_addrlib *addrlib, const struct radeon_i input.pbXor = surf->tile_swizzle; input.pMappedSurface = (char *)surf_copy_region->surf_ptr + (surf_copy_region->is_stencil_only ? surf->u.gfx9.zs.stencil_offset : 0); + if (surf_copy_region->memcpy) { + if (surf->blk_w == 4 && surf->blk_h == 4) { + /* The hybrid memcpy seems to perform better with block compressed + * formats due to the 256B alignment. + */ + input.copyFlags.hybridMemcpy = true; + } else { + input.copyFlags.blockMemcpy = true; + } + } ADDR_E_RETURNCODE res; ADDR2_COPY_MEMSURFACE_REGION region = {0}; @@ -4786,6 +4796,16 @@ gfx12_surface_copy_mem_surface(struct ac_addrlib *addrlib, const struct radeon_i input.pbXor = surf->tile_swizzle; input.pMappedSurface = (char *)surf_copy_region->surf_ptr + (surf_copy_region->is_stencil_only ? surf->u.gfx9.zs.stencil_offset : 0); + if (surf_copy_region->memcpy) { + if (surf->blk_w == 4 && surf->blk_h == 4) { + /* The hybrid memcpy seems to perform better with block compressed + * formats due to the 256B alignment. + */ + input.copyFlags.hybridMemcpy = true; + } else { + input.copyFlags.blockMemcpy = true; + } + } ADDR_E_RETURNCODE res; ADDR3_COPY_MEMSURFACE_REGION region = {0}; diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h index 3f6705ef10a..5e52c1df1bd 100644 --- a/src/amd/common/ac_surface.h +++ b/src/amd/common/ac_surface.h @@ -560,6 +560,7 @@ struct ac_surface_copy_region { uint64_t mem_slice_pitch; bool is_stencil_only; + bool memcpy; /* When set, the gfx10/gfx12 copy helpers set copyFlags.hybridMemcpy for 4x4-block surfaces and copyFlags.blockMemcpy otherwise; when clear, neither flag is set. */ }; bool ac_surface_copy_mem_to_surface(struct ac_addrlib *addrlib, const struct radeon_info *info,