From bc55d150a915d5b2e91cd6ee11af4992d18fcf4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-Francis=20Ratt=C3=A9-Boulianne?=
Date: Thu, 31 Aug 2023 23:33:45 -0400
Subject: [PATCH] panfrost: Add support for AFBC packing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the GPU is converting a texture from linear/tiled to compressed
AFBC, it uses a sparse memory layout. That means that the superblocks
are stored starting at intervals equal to the size of an uncompressed
superblock.

When memory usage needs to be optimized, it is possible to pack the
resource by trimming each superblock as much as possible. The GPU
will still be able to read from these packed textures, but won't
be able to write directly to them.

If the layout is AFBC-tiled, the packing process will also de-tile
as tiled+packed is not supported by Mali GPUs.

No new modifier flag has been added as the absence of the
`AFBC_FORMAT_MOD_SPARSE` flag means the resource will be packed.

Signed-off-by: Louis-Francis Ratté-Boulianne
Part-of:
---
 src/gallium/drivers/panfrost/pan_afbc_cso.c  | 145 +++++++++++++++++++
 src/gallium/drivers/panfrost/pan_afbc_cso.h  |  13 ++
 src/gallium/drivers/panfrost/pan_cmdstream.c |  29 ++++
 src/gallium/drivers/panfrost/pan_resource.c  | 131 +++++++++++++++++
 src/gallium/drivers/panfrost/pan_resource.h  |   3 +
 src/gallium/drivers/panfrost/pan_screen.h    |   7 +
 6 files changed, 328 insertions(+)

diff --git a/src/gallium/drivers/panfrost/pan_afbc_cso.c b/src/gallium/drivers/panfrost/pan_afbc_cso.c
index 62597b31dcc..7ea74252b98 100644
--- a/src/gallium/drivers/panfrost/pan_afbc_cso.c
+++ b/src/gallium/drivers/panfrost/pan_afbc_cso.c
@@ -50,6 +50,40 @@ read_afbc_header(nir_builder *b, nir_def *buf, nir_def *idx)
                           AFBC_HEADER_BYTES_PER_TILE / 4, 32);
 }
 
+/* Store the four-component header vector `hdr` at entry `idx` of the AFBC
+ * header array located at global address `buf`. */
+static void
+write_afbc_header(nir_builder *b, nir_def *buf, nir_def *idx, nir_def *hdr)
+{
+   nir_def *offset = nir_imul_imm(b, idx, AFBC_HEADER_BYTES_PER_TILE);
+   nir_store_global(b, nir_iadd(b, buf, nir_u2u64(b, offset)), 16, hdr, 0xF);
+}
+
+/* Map the row-major superblock index `idx` (rows of `dst_stride` blocks) to
+ * the index of the same superblock when headers are laid out in Z-order
+ * inside 8x8 tiles, `src_stride` being the tiled row stride in blocks. */
+static nir_def *
+get_morton_index(nir_builder *b, nir_def *idx, nir_def *src_stride,
+                 nir_def *dst_stride)
+{
+   nir_def *x = nir_umod(b, idx, dst_stride);
+   nir_def *y = nir_udiv(b, idx, dst_stride);
+
+   nir_def *offset = nir_imul(b, nir_iand_imm(b, y, ~0x7), src_stride);
+   offset = nir_iadd(b, offset, nir_ishl_imm(b, nir_ushr_imm(b, x, 3), 6));
+
+   /* Spread the low 3 bits of x and y to bits 0/2/4, then interleave them */
+   x = nir_iand_imm(b, x, 0x7);
+   x = nir_iand_imm(b, nir_ior(b, x, nir_ishl_imm(b, x, 2)), 0x13);
+   x = nir_iand_imm(b, nir_ior(b, x, nir_ishl_imm(b, x, 1)), 0x15);
+   y = nir_iand_imm(b, y, 0x7);
+   y = nir_iand_imm(b, nir_ior(b, y, nir_ishl_imm(b, y, 2)), 0x13);
+   y = nir_iand_imm(b, nir_ior(b, y, nir_ishl_imm(b, y, 1)), 0x15);
+   nir_def *tile_idx = nir_ior(b, x, nir_ishl_imm(b, y, 1));
+
+   return nir_iadd(b, offset, tile_idx);
+}
+
 static nir_def *
 get_superblock_size(nir_builder *b, unsigned arch, nir_def *hdr,
                     nir_def *uncompressed_size)
@@ -99,6 +133,82 @@ get_superblock_size(nir_builder *b, unsigned arch, nir_def *hdr,
                      : size;
 }
 
+/* Load entry `idx` of the `struct pan_afbc_block_info` array at global
+ * address `metadata` and return its `offset` field widened to 64 bits. When
+ * `out_size` is non-NULL, the entry's `size` field is returned through it. */
+static nir_def *
+get_packed_offset(nir_builder *b, nir_def *metadata, nir_def *idx,
+                  nir_def **out_size)
+{
+   nir_def *metadata_offset =
+      nir_u2u64(b, nir_imul_imm(b, idx, sizeof(struct pan_afbc_block_info)));
+   nir_def *range_ptr = nir_iadd(b, metadata, metadata_offset);
+   nir_def *entry = nir_load_global(b, range_ptr, 4,
+                                    sizeof(struct pan_afbc_block_info) / 4, 32);
+   nir_def *offset =
+      nir_channel(b, entry, offsetof(struct pan_afbc_block_info, offset) / 4);
+
+   if (out_size)
+      *out_size =
+         nir_channel(b, entry, offsetof(struct pan_afbc_block_info, size) / 4);
+
+   return nir_u2u64(b, offset);
+}
+
+/* Largest number of bytes moved by a single load/store pair */
+#define MAX_LINE_SIZE 16
+
+/* Copy the superblock whose header sits at `src_idx` in `src` to header slot
+ * `dst_idx` in `dst`.
+ *
+ * The payload goes to `hdr_sz` plus the offset recorded at `meta_idx` in
+ * `metadata`, and the first header word is rewritten to that location unless
+ * the source value was zero. The payload is copied `align` bytes per loop
+ * iteration until the size recorded in the metadata is covered. */
+static void
+copy_superblock(nir_builder *b, nir_def *dst, nir_def *dst_idx, nir_def *hdr_sz,
+                nir_def *src, nir_def *src_idx, nir_def *metadata,
+                nir_def *meta_idx, unsigned align)
+{
+   nir_def *hdr = read_afbc_header(b, src, src_idx);
+   nir_def *src_body_base_ptr = nir_u2u64(b, nir_channel(b, hdr, 0));
+   nir_def *src_bodyptr = nir_iadd(b, src, src_body_base_ptr);
+
+   nir_def *size;
+   nir_def *dst_offset = get_packed_offset(b, metadata, meta_idx, &size);
+   nir_def *dst_body_base_ptr = nir_iadd(b, dst_offset, hdr_sz);
+   nir_def *dst_bodyptr = nir_iadd(b, dst, dst_body_base_ptr);
+
+   /* Replace the `base_body_ptr` field if not zero (solid color) */
+   nir_def *hdr2 =
+      nir_vector_insert_imm(b, hdr, nir_u2u32(b, dst_body_base_ptr), 0);
+   hdr = nir_bcsel(b, nir_ieq_imm(b, src_body_base_ptr, 0), hdr, hdr2);
+   write_afbc_header(b, dst, dst_idx, hdr);
+
+   nir_variable *offset_var =
+      nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
+   nir_store_var(b, offset_var, nir_imm_int(b, 0), 1);
+   nir_loop *loop = nir_push_loop(b);
+   {
+      nir_def *offset = nir_load_var(b, offset_var);
+      nir_if *loop_check = nir_push_if(b, nir_uge(b, offset, size));
+      nir_jump(b, nir_jump_break);
+      nir_push_else(b, loop_check);
+      unsigned line_sz = align <= MAX_LINE_SIZE ? align : MAX_LINE_SIZE;
+      for (unsigned i = 0; i < align / line_sz; ++i) {
+         nir_def *src_line = nir_iadd(b, src_bodyptr, nir_u2u64(b, offset));
+         nir_def *dst_line = nir_iadd(b, dst_bodyptr, nir_u2u64(b, offset));
+         nir_store_global(
+            b, dst_line, line_sz,
+            nir_load_global(b, src_line, line_sz, line_sz / 4, 32), ~0);
+         offset = nir_iadd_imm(b, offset, line_sz);
+      }
+      nir_store_var(b, offset_var, offset, 0x1);
+      nir_pop_if(b, loop_check);
+   }
+   nir_pop_loop(b, loop);
+}
+
 #define panfrost_afbc_size_get_info_field(b, field)                          \
    panfrost_afbc_get_info_field(size, b, field)
 
@@ -135,6 +245,40 @@ panfrost_afbc_create_size_shader(struct panfrost_screen *screen, unsigned bpp,
    return b.shader;
 }
 
+#define panfrost_afbc_pack_get_info_field(b, field)                          \
+   panfrost_afbc_get_info_field(pack, b, field)
+
+/* Build the compute shader packing an AFBC resource: each invocation copies
+ * the superblock selected by the X component of its global invocation ID.
+ * `tiled` enables Z-order addressing of the source superblocks. */
+static nir_shader *
+panfrost_afbc_create_pack_shader(struct panfrost_screen *screen, unsigned align,
+                                 bool tiled)
+{
+   nir_builder b = nir_builder_init_simple_shader(
+      MESA_SHADER_COMPUTE, screen->vtbl.get_compiler_options(),
+      "panfrost_afbc_pack");
+
+   panfrost_afbc_add_info_ubo(pack, b);
+
+   nir_def *coord = nir_load_global_invocation_id(&b, 32);
+   nir_def *src_stride = panfrost_afbc_pack_get_info_field(&b, src_stride);
+   nir_def *dst_stride = panfrost_afbc_pack_get_info_field(&b, dst_stride);
+   nir_def *dst_idx = nir_channel(&b, coord, 0);
+   nir_def *src_idx =
+      tiled ? get_morton_index(&b, dst_idx, src_stride, dst_stride) : dst_idx;
+   nir_def *src = panfrost_afbc_pack_get_info_field(&b, src);
+   nir_def *dst = panfrost_afbc_pack_get_info_field(&b, dst);
+   nir_def *header_size =
+      nir_u2u64(&b, panfrost_afbc_pack_get_info_field(&b, header_size));
+   nir_def *metadata = panfrost_afbc_pack_get_info_field(&b, metadata);
+
+   copy_superblock(&b, dst, dst_idx, header_size, src, src_idx, metadata,
+                   src_idx, align);
+
+   return b.shader;
+}
+
 struct pan_afbc_shader_data *
 panfrost_afbc_get_shaders(struct panfrost_context *ctx,
                           struct panfrost_resource *rsrc, unsigned align)
@@ -171,6 +315,7 @@ panfrost_afbc_get_shaders(struct panfrost_context *ctx,
    }
 
    COMPILE_SHADER(size, key.bpp, key.align);
+   COMPILE_SHADER(pack, key.align, key.tiled);
 
 #undef COMPILE_SHADER
 
diff --git a/src/gallium/drivers/panfrost/pan_afbc_cso.h b/src/gallium/drivers/panfrost/pan_afbc_cso.h
index 86e09c57735..4b9f324ac2d 100644
--- a/src/gallium/drivers/panfrost/pan_afbc_cso.h
+++ b/src/gallium/drivers/panfrost/pan_afbc_cso.h
@@ -42,6 +42,7 @@ struct pan_afbc_shader_key {
 struct pan_afbc_shader_data {
    struct pan_afbc_shader_key key;
    void *size_cso;
+   void *pack_cso;
 };
 
 struct pan_afbc_shaders {
@@ -59,6 +60,18 @@ struct panfrost_afbc_size_info {
    mali_ptr metadata;
 } PACKED;
 
+/* Parameters of the pack shader: filled in by panfrost_afbc_pack() and read
+ * in the shader through panfrost_afbc_pack_get_info_field(). */
+struct panfrost_afbc_pack_info {
+   mali_ptr src;
+   mali_ptr dst;
+   mali_ptr metadata;
+   uint32_t header_size;
+   uint32_t src_stride;
+   uint32_t dst_stride;
+   uint32_t padding[3]; // FIXME
+} PACKED;
+
 void panfrost_afbc_context_init(struct panfrost_context *ctx);
 void panfrost_afbc_context_destroy(struct panfrost_context *ctx);
 
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index c2b1a69d4ca..a5c4ed2c6e8 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -3953,6 +3953,34 @@ panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
    LAUNCH_AFBC_SHADER(size, batch, src, consts, slice->afbc.nr_blocks);
 }
 
+/* Launch the pack shader on `batch` to copy mip level `level` of `src` into
+ * `dst`, laid out as described by `dst_slice`. The per-superblock size/offset
+ * table is read from `metadata` at byte offset `metadata_offset`. */
+static void
+panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
+                   struct panfrost_bo *dst,
+                   struct pan_image_slice_layout *dst_slice,
+                   struct panfrost_bo *metadata, unsigned metadata_offset,
+                   unsigned level)
+{
+   struct pan_image_slice_layout *src_slice = &src->image.layout.slices[level];
+   struct panfrost_afbc_pack_info consts = {
+      .src = src->image.data.bo->ptr.gpu + src->image.data.offset +
+             src_slice->offset,
+      .dst = dst->ptr.gpu + dst_slice->offset,
+      .metadata = metadata->ptr.gpu + metadata_offset,
+      .header_size = dst_slice->afbc.header_size,
+      .src_stride = src_slice->afbc.stride,
+      .dst_stride = dst_slice->afbc.stride,
+   };
+
+   panfrost_batch_write_rsrc(batch, src, PIPE_SHADER_COMPUTE);
+   panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
+   panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);
+
+   LAUNCH_AFBC_SHADER(pack, batch, src, consts, dst_slice->afbc.nr_blocks);
+}
+
 static void *
 panfrost_create_rasterizer_state(struct pipe_context *pctx,
                                  const struct pipe_rasterizer_state *cso)
@@ -4570,6 +4598,7 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
    screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
    screen->vtbl.compile_shader = GENX(pan_shader_compile);
    screen->vtbl.afbc_size = panfrost_afbc_size;
+   screen->vtbl.afbc_pack = panfrost_afbc_pack;
 
    GENX(pan_blitter_init)
    (dev, &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);
diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c
index b628670835f..cb493a4a462 100644
--- a/src/gallium/drivers/panfrost/pan_resource.c
+++ b/src/gallium/drivers/panfrost/pan_resource.c
@@ -907,6 +907,16 @@ panfrost_load_tiled_images(struct panfrost_transfer *transfer,
    }
 }
 
+/* Get scan-order index from (x, y) position when blocks are
+ * arranged in z-order in 8x8 tiles */
+static unsigned
+get_morton_index(unsigned x, unsigned y, unsigned stride)
+{
+   unsigned i = ((x << 0) & 1) | ((y << 1) & 2) | ((x << 1) & 4) |
+                ((y << 2) & 8) | ((x << 2) & 16) | ((y << 3) & 32);
+   return (((y & ~7) * stride) + ((x & ~7) << 3)) + i;
+}
+
 static void
 panfrost_store_tiled_images(struct panfrost_transfer *transfer,
                             struct panfrost_resource *rsrc)
@@ -1303,6 +1313,127 @@ panfrost_get_afbc_superblock_sizes(struct panfrost_context *ctx,
    return bo;
 }
 
+/* Repack all mip levels of a sparse AFBC resource into a newly allocated,
+ * packed and de-tiled BO, and make `prsrc` use that BO from then on.
+ *
+ * The resource is left untouched if any level lacks valid data, if the packed
+ * size equals the size of the current BO, or if the new BO cannot be created.
+ */
+void
+panfrost_pack_afbc(struct panfrost_context *ctx,
+                   struct panfrost_resource *prsrc)
+{
+   struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+   struct panfrost_device *dev = pan_device(ctx->base.screen);
+   struct panfrost_bo *metadata_bo;
+   unsigned metadata_offsets[PIPE_MAX_TEXTURE_LEVELS];
+
+   uint64_t src_modifier = prsrc->image.layout.modifier;
+   uint64_t dst_modifier =
+      src_modifier & ~(AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE);
+   bool is_tiled = src_modifier & AFBC_FORMAT_MOD_TILED;
+   unsigned last_level = prsrc->base.last_level;
+   struct pan_image_slice_layout slice_infos[PIPE_MAX_TEXTURE_LEVELS] = {0};
+   unsigned total_size = 0;
+
+   /* It doesn't make sense to pack everything if we need to unpack right
+    * away to upload data to another level */
+   for (unsigned i = 0; i <= last_level; i++) {
+      if (!BITSET_TEST(prsrc->valid.data, i))
+         return;
+   }
+
+   metadata_bo = panfrost_get_afbc_superblock_sizes(ctx, prsrc, 0, last_level,
+                                                    metadata_offsets);
+   panfrost_bo_wait(metadata_bo, INT64_MAX, false);
+
+   for (unsigned level = 0; level <= last_level; ++level) {
+      struct pan_image_slice_layout *src_slice =
+         &prsrc->image.layout.slices[level];
+      struct pan_image_slice_layout *dst_slice = &slice_infos[level];
+
+      unsigned width = u_minify(prsrc->base.width0, level);
+      unsigned height = u_minify(prsrc->base.height0, level);
+      unsigned src_stride =
+         pan_afbc_stride_blocks(src_modifier, src_slice->row_stride);
+      unsigned dst_stride =
+         DIV_ROUND_UP(width, panfrost_afbc_superblock_width(dst_modifier));
+      unsigned dst_height =
+         DIV_ROUND_UP(height, panfrost_afbc_superblock_height(dst_modifier));
+
+      uint32_t offset = 0;
+      struct pan_afbc_block_info *meta =
+         metadata_bo->ptr.cpu + metadata_offsets[level];
+
+      for (unsigned y = 0, i = 0; y < dst_height; ++y) {
+         for (unsigned x = 0; x < dst_stride; ++x, ++i) {
+            unsigned idx = is_tiled ? get_morton_index(x, y, src_stride) : i;
+            uint32_t size = meta[idx].size;
+            meta[idx].offset = offset; /* write the start offset */
+            offset += size;
+         }
+      }
+
+      total_size = ALIGN_POT(total_size, pan_slice_align(dst_modifier));
+      {
+         dst_slice->afbc.stride = dst_stride;
+         dst_slice->afbc.nr_blocks = dst_stride * dst_height;
+         dst_slice->afbc.header_size =
+            ALIGN_POT(dst_stride * dst_height * AFBC_HEADER_BYTES_PER_TILE,
+                      pan_afbc_body_align(dst_modifier));
+         dst_slice->afbc.body_size = offset;
+         dst_slice->afbc.surface_stride = dst_slice->afbc.header_size + offset;
+
+         dst_slice->offset = total_size;
+         dst_slice->row_stride = dst_stride * AFBC_HEADER_BYTES_PER_TILE;
+         dst_slice->surface_stride = dst_slice->afbc.surface_stride;
+         dst_slice->size = dst_slice->afbc.surface_stride;
+      }
+      total_size += dst_slice->afbc.surface_stride;
+   }
+
+   unsigned new_size = ALIGN_POT(total_size, 4096); // FIXME
+   unsigned old_size = prsrc->image.data.bo->size;
+
+   if (new_size == old_size) {
+      /* Nothing to gain: release the size metadata before bailing out */
+      panfrost_bo_unreference(metadata_bo);
+      return;
+   }
+
+   if (dev->debug & PAN_DBG_PERF) {
+      /* 64-bit intermediate: 100 * new_size would wrap for sizes > ~40 MB */
+      printf("%u%%: %u KB -> %u KB\n",
+             (unsigned)(100ull * new_size / old_size), old_size / 1024,
+             new_size / 1024);
+   }
+
+   struct panfrost_bo *dst =
+      panfrost_bo_create(dev, new_size, 0, "AFBC compact texture");
+   if (!dst) {
+      /* Keep the sparse layout if the packed BO cannot be allocated */
+      panfrost_bo_unreference(metadata_bo);
+      return;
+   }
+
+   struct panfrost_batch *batch =
+      panfrost_get_fresh_batch_for_fbo(ctx, "AFBC compaction");
+
+   for (unsigned level = 0; level <= last_level; ++level) {
+      struct pan_image_slice_layout *slice = &slice_infos[level];
+      screen->vtbl.afbc_pack(batch, prsrc, dst, slice, metadata_bo,
+                             metadata_offsets[level], level);
+      prsrc->image.layout.slices[level] = *slice;
+   }
+
+   panfrost_flush_batches_accessing_rsrc(ctx, prsrc, "AFBC compaction flush");
+
+   prsrc->image.layout.modifier = dst_modifier;
+   panfrost_bo_unreference(prsrc->image.data.bo);
+   prsrc->image.data.bo = dst;
+   panfrost_bo_unreference(metadata_bo);
+}
+
 static void
 panfrost_ptr_unmap(struct pipe_context *pctx, struct pipe_transfer *transfer)
 {
diff --git a/src/gallium/drivers/panfrost/pan_resource.h b/src/gallium/drivers/panfrost/pan_resource.h
index e2bdd202823..d6d1593e587 100644
--- a/src/gallium/drivers/panfrost/pan_resource.h
+++ b/src/gallium/drivers/panfrost/pan_resource.h
@@ -181,6 +181,9 @@ struct panfrost_bo *panfrost_get_afbc_superblock_sizes(
    struct panfrost_context *ctx, struct panfrost_resource *rsrc,
    unsigned first_level, unsigned last_level, unsigned *out_offsets);
 
+void panfrost_pack_afbc(struct panfrost_context *ctx,
+                        struct panfrost_resource *prsrc);
+
 void pan_resource_modifier_convert(struct panfrost_context *ctx,
                                    struct panfrost_resource *rsrc,
                                    uint64_t modifier, const char *reason);
diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h
index 3f5c690d4b0..9c378d6a4a3 100644
--- a/src/gallium/drivers/panfrost/pan_screen.h
+++ b/src/gallium/drivers/panfrost/pan_screen.h
@@ -105,6 +105,13 @@ struct panfrost_vtable {
                      struct panfrost_resource *src,
                      struct panfrost_bo *metadata, unsigned offset,
                      unsigned level);
+
+   /* Run a compute shader to compact a sparse layout afbc resource */
+   void (*afbc_pack)(struct panfrost_batch *batch,
+                     struct panfrost_resource *src, struct panfrost_bo *dst,
+                     struct pan_image_slice_layout *slice,
+                     struct panfrost_bo *metadata, unsigned metadata_offset,
+                     unsigned level);
 };
 
 struct panfrost_screen {