mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-18 23:48:22 +02:00
libagx: add in-place decompression kernel
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30981>
This commit is contained in:
parent
3497cd2ed2
commit
ed08fe94d2
6 changed files with 180 additions and 0 deletions
|
|
@ -1628,3 +1628,16 @@ agx_nir_predicate_indirect(nir_builder *b, const void *data)
|
|||
|
||||
libagx_predicate_indirect(b, params, thread, indexed);
|
||||
}
|
||||
|
||||
void
|
||||
agx_nir_decompress(nir_builder *b, const void *data)
|
||||
{
|
||||
const struct agx_decompress_key *key = data;
|
||||
|
||||
nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
|
||||
nir_def *tile = nir_load_workgroup_id(b);
|
||||
nir_def *local = nir_channel(b, nir_load_local_invocation_id(b), 0);
|
||||
nir_def *samples = nir_imm_int(b, key->nr_samples);
|
||||
|
||||
libagx_decompress(b, params, tile, local, samples);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -97,3 +97,10 @@ struct agx_predicate_indirect_key {
|
|||
static_assert(sizeof(struct agx_predicate_indirect_key) == 1, "padded");
|
||||
|
||||
void agx_nir_predicate_indirect(struct nir_builder *b, const void *data);
|
||||
|
||||
struct agx_decompress_key {
|
||||
uint8_t nr_samples;
|
||||
};
|
||||
static_assert(sizeof(struct agx_decompress_key) == 1, "padded");
|
||||
|
||||
void agx_nir_decompress(struct nir_builder *b, const void *data);
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ libasahi_decode_files = files(
|
|||
|
||||
libagx_shader_files = files(
|
||||
'shaders/libagx.h',
|
||||
'shaders/compression.cl',
|
||||
'shaders/draws.cl',
|
||||
'shaders/draws.h',
|
||||
'shaders/geometry.cl',
|
||||
|
|
|
|||
126
src/asahi/lib/shaders/compression.cl
Normal file
126
src/asahi/lib/shaders/compression.cl
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
/*
|
||||
* Copyright 2024 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#include "agx_pack.h"
|
||||
#include "compression.h"
|
||||
#include "libagx.h"
|
||||
|
||||
/*
|
||||
* Decompress in place. The metadata is updated, so other processes can read the
|
||||
* image with a compressed texture descriptor.
|
||||
*
|
||||
* Each workgroup processes one 16x16 tile, avoiding races. We use 32x1
|
||||
* workgroups, matching the warp size, meaning each work-item must process
|
||||
* (16*16)/(32*1) = 8 sampels. Matching the warp size eliminates cross-warp
|
||||
* barriers. It also minimizes launched threads, accelerating the early exit.
|
||||
*/
|
||||
|
||||
/* Our compiler represents a bindless handle as a uint2 of a uniform base and an
|
||||
* offset in bytes. Since the descriptors are all in the u0_u1 push, the former
|
||||
* is hardcoded and the latter is an offsetof.
|
||||
*/
|
||||
#define HANDLE(field) (uint2)(0, offsetof(struct libagx_decompress_push, field))
|
||||
|
||||
/*
|
||||
* The metadata buffer is fully twiddled, so interleave the X/Y coordinate bits.
|
||||
* While dimensions are padded to powers-of-two, they are not padded to a
|
||||
* square. If the width is more than 2x the height or vice versa, the additional
|
||||
* bits are linear. So we interleave as much as possible, and then add what's
|
||||
* remaining. Finally, layers are strided linear and added at the end.
|
||||
*/
|
||||
uint
|
||||
index_metadata(uint3 c, uint width, uint height, uint layer_stride)
|
||||
{
|
||||
uint major_coord = width > height ? c.x : c.y;
|
||||
uint minor_dim = min(width, height);
|
||||
|
||||
uint intl_bits = libagx_logbase2_ceil(minor_dim);
|
||||
uint intl_mask = (1 << intl_bits) - 1;
|
||||
uint2 intl_coords = c.xy & intl_mask;
|
||||
|
||||
return nir_interleave_agx(intl_coords.x, intl_coords.y) +
|
||||
((major_coord & ~intl_mask) << intl_bits) + (layer_stride * c.z);
|
||||
}
|
||||
|
||||
/*
|
||||
* For multisampled images, a 2x2 or 1x2 group of samples form a single pixel.
|
||||
* The following two helpers convert a coordinate in samples into a coordinate
|
||||
* in pixels and a sample ID, respectively. They each assume that samples > 1.
|
||||
*/
|
||||
int4
|
||||
decompose_px(int4 c, uint samples)
|
||||
{
|
||||
if (samples == 4)
|
||||
c.xy >>= 1;
|
||||
else
|
||||
c.y >>= 1;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
uint
|
||||
sample_id(int4 c, uint samples)
|
||||
{
|
||||
if (samples == 4)
|
||||
return (c.x & 1) | ((c.y & 1) << 1);
|
||||
else
|
||||
return c.y & 1;
|
||||
}
|
||||
|
||||
void
|
||||
libagx_decompress(constant struct libagx_decompress_push *push, uint3 coord_tl,
|
||||
uint local_id, uint samples)
|
||||
{
|
||||
/* Index into the metadata buffer */
|
||||
uint index_tl =
|
||||
index_metadata(coord_tl, push->metadata_width_tl,
|
||||
push->metadata_height_tl, push->metadata_layer_stride_tl);
|
||||
|
||||
/* If the tile is already uncompressed, there's nothing to do. */
|
||||
if (push->metadata[index_tl] == push->tile_uncompressed)
|
||||
return;
|
||||
|
||||
/* Tiles are 16x16 */
|
||||
uint2 coord_sa = (coord_tl.xy * 16);
|
||||
uint layer = coord_tl.z;
|
||||
|
||||
/* Since we use a 32x1 workgroup, each work-item handles half of a row. */
|
||||
uint offs_y_sa = local_id >> 1;
|
||||
uint offs_x_sa = (local_id & 1) ? 8 : 0;
|
||||
|
||||
int2 img_coord_sa_2d = convert_int2(coord_sa) + (int2)(offs_x_sa, offs_y_sa);
|
||||
int4 img_coord_sa = (int4)(img_coord_sa_2d.x, img_coord_sa_2d.y, layer, 0);
|
||||
|
||||
/* Read our half-row into registers. */
|
||||
uint4 texels[8];
|
||||
for (uint i = 0; i < 8; ++i) {
|
||||
int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0);
|
||||
if (samples == 1) {
|
||||
texels[i] = nir_bindless_image_load_array(HANDLE(compressed), c_sa);
|
||||
} else {
|
||||
int4 dec_px = decompose_px(c_sa, samples);
|
||||
texels[i] = nir_bindless_image_load_ms_array(
|
||||
HANDLE(compressed), dec_px, sample_id(c_sa, samples));
|
||||
}
|
||||
}
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* Now that the whole tile is read, we write without racing. */
|
||||
for (uint i = 0; i < 8; ++i) {
|
||||
int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0);
|
||||
if (samples == 1) {
|
||||
nir_bindless_image_store_array(HANDLE(uncompressed), c_sa, texels[i]);
|
||||
} else {
|
||||
int4 dec_px = decompose_px(c_sa, samples);
|
||||
nir_bindless_image_store_ms_array(HANDLE(uncompressed), dec_px,
|
||||
sample_id(c_sa, samples), texels[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/* We've replaced the body buffer. Mark the tile as uncompressed. */
|
||||
if (local_id == 0) {
|
||||
push->metadata[index_tl] = push->tile_uncompressed;
|
||||
}
|
||||
}
|
||||
19
src/asahi/lib/shaders/compression.h
Normal file
19
src/asahi/lib/shaders/compression.h
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Copyright 2024 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#include "agx_pack.h"
|
||||
#include "libagx.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
struct libagx_decompress_push {
|
||||
struct agx_texture_packed compressed;
|
||||
struct agx_pbe_packed uncompressed;
|
||||
GLOBAL(uint64_t) metadata;
|
||||
uint64_t tile_uncompressed;
|
||||
uint32_t metadata_layer_stride_tl;
|
||||
uint16_t metadata_width_tl;
|
||||
uint16_t metadata_height_tl;
|
||||
};
|
||||
AGX_STATIC_ASSERT(sizeof(struct libagx_decompress_push) == 72);
|
||||
|
|
@ -40,6 +40,12 @@ uint32_t nir_load_helper_arg_lo_agx(void);
|
|||
uint32_t nir_load_helper_arg_hi_agx(void);
|
||||
uint32_t nir_fence_helper_exit_agx(void);
|
||||
|
||||
uint4 nir_bindless_image_load_array(uint2 handle, int4 coord);
|
||||
void nir_bindless_image_store_array(uint2 handle, int4 coord, uint4 datum);
|
||||
uint4 nir_bindless_image_load_ms_array(uint2 handle, int4 coord, uint sample);
|
||||
void nir_bindless_image_store_ms_array(uint2 handle, int4 coord, uint sample,
|
||||
uint4 datum);
|
||||
|
||||
uint libagx_load_index_buffer_internal(uintptr_t index_buffer,
|
||||
uint32_t index_buffer_range_el, uint id,
|
||||
uint index_size);
|
||||
|
|
@ -59,6 +65,14 @@ align(uint x, uint y)
|
|||
return (x + y - 1) & ~(y - 1);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
libagx_logbase2_ceil(uint32_t n)
|
||||
{
|
||||
return (n <= 1) ? 0 : 32 - clz(n - 1);
|
||||
}
|
||||
|
||||
#define offsetof(x, y) __builtin_offsetof(x, y)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue