panvk: Implement vkCmdCopyImageToBuffer()

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12095>
This commit is contained in:
Boris Brezillon 2021-09-08 10:14:21 +02:00
parent 4ce85cd25a
commit f73ae1a6b5
2 changed files with 512 additions and 1 deletions

View file

@ -130,6 +130,7 @@ panvk_logi_v(const char *format, va_list va);
#define panvk_stub() assert(!"stub")
#define PANVK_META_COPY_BUF2IMG_NUM_FORMATS 12
#define PANVK_META_COPY_IMG2BUF_NUM_FORMATS 12
#define PANVK_META_COPY_IMG2IMG_NUM_FORMATS 14
#define PANVK_META_COPY_NUM_TEX_TYPES 5
@ -164,6 +165,10 @@ struct panvk_meta {
mali_ptr rsd;
struct panfrost_ubo_push pushmap;
} buf2img[PANVK_META_COPY_BUF2IMG_NUM_FORMATS];
struct {
mali_ptr rsd;
struct panfrost_ubo_push pushmap;
} img2buf[PANVK_META_COPY_NUM_TEX_TYPES][PANVK_META_COPY_IMG2BUF_NUM_FORMATS];
struct {
mali_ptr rsd;
} img2img[PANVK_META_COPY_NUM_TEX_TYPES][PANVK_META_COPY_IMG2IMG_NUM_FORMATS];

View file

@ -191,6 +191,41 @@ panvk_meta_copy_emit_tiler_job(struct pan_pool *desc_pool,
return job;
}
/*
 * Emit a Mali COMPUTE_JOB running a meta-copy shader over a work-group grid
 * and queue it on the batch scoreboard.
 *
 * desc_pool:      pool the job/DCD descriptors are allocated from
 * scoreboard:     batch scoreboard the job is added to
 * num_wg/wg_sz:   work-group count and work-group size, per dimension
 * texture/sampler: source-image descriptors (0 when unused)
 * ubo/push_constants: shader parameter buffers
 * rsd:            renderer state descriptor of the copy shader
 * tsd:            thread/local storage descriptor
 *
 * Returns the CPU/GPU pointer pair of the emitted job descriptor.
 */
static struct panfrost_ptr
panvk_meta_copy_emit_compute_job(struct pan_pool *desc_pool,
                                 struct pan_scoreboard *scoreboard,
                                 const struct pan_compute_dim *num_wg,
                                 const struct pan_compute_dim *wg_sz,
                                 mali_ptr texture, mali_ptr sampler,
                                 mali_ptr ubo, mali_ptr push_constants,
                                 mali_ptr rsd, mali_ptr tsd)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(desc_pool, COMPUTE_JOB);

   /* Encode the invocation counts: num_wg work-groups of wg_sz threads. */
   void *invoc = pan_section_ptr(job.cpu,
                                 COMPUTE_JOB,
                                 INVOCATION);
   panfrost_pack_work_groups_compute(invoc, num_wg->x, num_wg->y, num_wg->z,
                                     wg_sz->x, wg_sz->y, wg_sz->z,
                                     false, false);

   pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      /* NOTE(review): 8 mirrors the value used by other meta jobs in this
       * file family — confirm against the HW docs if touched. */
      cfg.job_task_split = 8;
   }

   /* Fill the DRAW section with the shared meta-copy draw-call descriptor. */
   panvk_meta_copy_emit_dcd(desc_pool, 0, 0, texture, sampler,
                            0, tsd, rsd, ubo, push_constants,
                            pan_section_ptr(job.cpu, COMPUTE_JOB, DRAW));

   pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);

   panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                    false, false, 0, 0, &job, false);
   return job;
}
#if PAN_ARCH >= 6
static uint32_t
panvk_meta_copy_img_bifrost_raw_format(unsigned texelsize)
@ -339,6 +374,28 @@ panvk_meta_copy_emit_push_constants(struct panfrost_device *pdev,
return pan_pool_upload_aligned(pool, pushvals, size, 16);
}
/*
 * Emit the RENDERER_STATE descriptor for a copy-to-buffer shader.
 *
 * When from_img is set, the shader samples a source image, so one texture
 * and one sampler slot are declared in the shader state.
 *
 * Returns the GPU address of the descriptor.  (pdev is currently unused but
 * kept for symmetry with the other *_emit_rsd helpers.)
 */
static mali_ptr
panvk_meta_copy_to_buf_emit_rsd(struct panfrost_device *pdev,
                                struct pan_pool *desc_pool,
                                mali_ptr shader,
                                const struct pan_shader_info *shader_info,
                                bool from_img)
{
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc_aggregate(desc_pool,
                                    PAN_DESC(RENDERER_STATE));

   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(shader_info, shader, &cfg);
      if (from_img) {
         /* Image sources are read through exactly one texture/sampler. */
         cfg.shader.texture_count = 1;
         cfg.shader.sampler_count = 1;
      }
   }

   return rsd_ptr.gpu;
}
static mali_ptr
panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
struct pan_pool *bin_pool,
@ -1235,6 +1292,448 @@ panvk_per_arch(CmdCopyBufferToImage)(VkCommandBuffer commandBuffer,
}
}
/*
 * (raw format, component mask) pairs the img2buf path compiles shaders for.
 * The mask selects which components of the raw texel are written to the
 * buffer (see the key.mask handling in panvk_meta_copy_img2buf_shader()).
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_img2buf_fmts[] = {
   { PIPE_FORMAT_R8_UINT, 0x1 },
   { PIPE_FORMAT_R8G8_UINT, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UINT, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
/*
 * Pick the raw texture format used to read the source image when copying
 * to a buffer: a UINT format with the same texel size as imgfmt, so texels
 * are transferred bit-for-bit regardless of the image's real format.
 */
static enum pipe_format
panvk_meta_copy_img2buf_format(enum pipe_format imgfmt)
{
   switch (util_format_get_blocksize(imgfmt)) {
   case 1: return PIPE_FORMAT_R8_UINT;
   /* AFBC stores things differently for RGB565,
    * we can't simply map to R8G8 in that case */
   case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
                   imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
                  PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UINT;
   case 4: return PIPE_FORMAT_R8G8B8A8_UINT;
   case 6: return PIPE_FORMAT_R16G16B16_UINT;
   case 8: return PIPE_FORMAT_R32G32_UINT;
   case 12: return PIPE_FORMAT_R32G32B32_UINT;
   case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
   default: unreachable("Invalid format\n");
   }
}
/*
 * Push-constant/UBO payload consumed by the img2buf compute shader.
 * Field offsets are read with panvk_meta_copy_img2buf_get_info_field(),
 * so the layout here must match what the shader loads.
 */
struct panvk_meta_copy_img2buf_info {
   struct {
      /* GPU address of the destination buffer (region start). */
      mali_ptr ptr;
      struct {
         /* Bytes per buffer row. */
         unsigned line;
         /* Bytes per buffer image/slice (line stride * image height). */
         unsigned surf;
      } stride;
   } buf;
   struct {
      /* Texel offset of the copied region inside the image. */
      struct {
         unsigned x, y, z;
      } offset;
      /* Inclusive texel bounds of the copied region; used by the shader
       * to discard out-of-region invocations of the padded dispatch. */
      struct {
         unsigned minx, miny, maxx, maxy;
      } extent;
   } img;
};
/* Load one field of struct panvk_meta_copy_img2buf_info from UBO 0 in the
 * shader being built.  The load width is derived from the C field's size,
 * and the offset from offsetof(), so C struct and shader stay in sync. */
#define panvk_meta_copy_img2buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_img2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_img2buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
/*
 * Build and compile the compute shader implementing vkCmdCopyImageToBuffer
 * for one (format, mask, texture-type) combination.
 *
 * The shader: computes the image coordinate of its invocation, discards
 * invocations outside the copy region (the dispatch is padded to 16x16
 * tiles), fetches the texel with txf, extracts/packs the components
 * selected by key.mask, and stores the result to the destination buffer.
 *
 * Returns the GPU address of the uploaded shader binary; shader_info
 * receives the compiler metadata (push-constant map, etc.).
 *
 * Bug fix vs. original: the 3-coordinate case added img.offset.y to the z
 * channel (copy-paste from the y channel); it must be img.offset.z,
 * otherwise 2D-array/3D copies with a non-zero y offset read the wrong
 * layer/slice.
 */
static mali_ptr
panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
                               struct pan_pool *bin_pool,
                               struct panvk_meta_copy_format_info key,
                               unsigned texdim, unsigned texisarray,
                               struct pan_shader_info *shader_info)
{
   unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);

   /* FIXME: Won't work on compute queues, but we can't do that with
    * a compute shader if the destination is an AFBC surface.
    */
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_copy_img2buf(dim=%dD%s,imgfmt=%s,mask=%x)",
                                     texdim, texisarray ? "[]" : "",
                                     util_format_name(key.imgfmt),
                                     key.mask);

   b.shader->info.internal = true;
   b.shader->info.num_ubos = 1;

   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);

   nir_ssa_def *bufptr =
      panvk_meta_copy_img2buf_get_info_field(&b, buf.ptr);
   nir_ssa_def *buflinestride =
      panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.line);
   nir_ssa_def *bufsurfstride =
      panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.surf);

   nir_ssa_def *imgminx =
      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.minx);
   nir_ssa_def *imgminy =
      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.miny);
   nir_ssa_def *imgmaxx =
      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxx);
   nir_ssa_def *imgmaxy =
      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxy);

   nir_ssa_def *imgcoords, *inbounds;

   /* Translate the global invocation id into image coordinates, and test
    * it against the x/y region bounds (z is never padded, so it needs no
    * bounds check). */
   switch (texdim + texisarray) {
   case 1:
      imgcoords =
         nir_iadd(&b,
                  nir_channel(&b, coord, 0),
                  panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x));
      inbounds =
         nir_iand(&b,
                  nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
                  nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx));
      break;
   case 2:
      imgcoords =
         nir_vec2(&b,
                  nir_iadd(&b,
                           nir_channel(&b, coord, 0),
                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
                  nir_iadd(&b,
                           nir_channel(&b, coord, 1),
                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
      inbounds =
         nir_iand(&b,
                  nir_iand(&b,
                           nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
                           nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
                  nir_iand(&b,
                           nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
                           nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
      break;
   case 3:
      imgcoords =
         nir_vec3(&b,
                  nir_iadd(&b,
                           nir_channel(&b, coord, 0),
                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
                  nir_iadd(&b,
                           nir_channel(&b, coord, 1),
                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)),
                  /* Fixed: was img.offset.y, which shifted the layer/slice
                   * index by the y offset. */
                  nir_iadd(&b,
                           nir_channel(&b, coord, 2),
                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.z)));
      inbounds =
         nir_iand(&b,
                  nir_iand(&b,
                           nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
                           nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
                  nir_iand(&b,
                           nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
                           nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
      break;
   default:
      unreachable("Invalid texture dimension\n");
   }

   nir_push_if(&b, inbounds);

   /* FIXME: doesn't work for tiled+compressed formats since blocks are 4x4
    * blocks instead of 16x16 texels in that case, and there's nothing we can
    * do to force the tile size to 4x4 in the render path.
    * This being said, compressed textures are not compatible with AFBC, so we
    * could use a compute shader arranging the blocks properly.
    */
   nir_ssa_def *offset =
      nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
   bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));

   /* Component size/count of the raw image format (components are at most
    * 32-bit wide in the raw formats we pick). */
   unsigned imgcompsz = imgtexelsz <= 4 ?
                        1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);
   unsigned nimgcomps = imgtexelsz / imgcompsz;
   assert(nimgcomps <= 4);

   /* Fetch the source texel with an explicit texel fetch (txf). */
   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
   tex->op = nir_texop_txf;
   tex->texture_index = 0;
   tex->is_array = texisarray;
   tex->dest_type = util_format_is_unorm(key.imgfmt) ?
                    nir_type_float32 : nir_type_uint32;

   switch (texdim) {
   case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
   case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
   case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
   default: unreachable("Invalid texture dimension");
   }

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(imgcoords);
   tex->coord_components = texdim + texisarray;
   nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
                     nir_alu_type_get_type_size(tex->dest_type), NULL);
   nir_builder_instr_insert(&b, &tex->instr);

   nir_ssa_def *texel = &tex->dest.ssa;

   unsigned fullmask = (1 << util_format_get_nr_components(key.imgfmt)) - 1;
   unsigned nbufcomps = util_bitcount(fullmask);
   if (key.mask != fullmask) {
      /* Only keep the components selected by the copy mask. */
      nir_ssa_def *bufcomps[4];
      nbufcomps = 0;
      for (unsigned i = 0; i < nimgcomps; i++) {
         if (key.mask & BITFIELD_BIT(i))
            bufcomps[nbufcomps++] = nir_channel(&b, texel, i);
      }

      texel = nir_vec(&b, bufcomps, nbufcomps);
   }

   unsigned bufcompsz = buftexelsz / nbufcomps;

   if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
      /* Re-pack the unorm RGB components into a single 16-bit 565 value. */
      texel = nir_fmul(&b, texel,
                       nir_vec3(&b,
                                nir_imm_float(&b, 31),
                                nir_imm_float(&b, 63),
                                nir_imm_float(&b, 31)));
      texel = nir_f2u16(&b, texel);
      texel = nir_ior(&b, nir_channel(&b, texel, 0),
                      nir_ior(&b,
                              nir_ishl(&b, nir_channel(&b, texel, 1), nir_imm_int(&b, 5)),
                              nir_ishl(&b, nir_channel(&b, texel, 2), nir_imm_int(&b, 11))));
      imgcompsz = 2;
      bufcompsz = 2;
      nbufcomps = 1;
      nimgcomps = 1;
   } else if (imgcompsz == 1) {
      /* Pack 8-bit components into one scalar so the store is a single
       * 1/2/4-byte write. */
      nir_ssa_def *packed = nir_channel(&b, texel, 0);
      for (unsigned i = 1; i < nbufcomps; i++) {
         packed = nir_ior(&b, packed,
                          nir_ishl(&b, nir_iand_imm(&b, nir_channel(&b, texel, i), 0xff),
                                   nir_imm_int(&b, i * 8)));
      }
      texel = packed;

      bufcompsz = nbufcomps == 3 ? 4 : nbufcomps;
      nbufcomps = 1;
   }

   assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
   assert(nbufcomps <= 4 && nimgcomps <= 4);
   texel = nir_u2uN(&b, texel, bufcompsz * 8);

   nir_store_global(&b, bufptr, bufcompsz, texel, (1 << nbufcomps) - 1);
   nir_pop_if(&b, NULL);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
   };

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);

   /* Make sure UBO words have been upgraded to push constants and everything
    * is at the right place.
    */
   assert(shader_info->ubo_count == 1);
   assert(shader_info->push.count <= (sizeof(struct panvk_meta_copy_img2buf_info) / 4));

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
/*
 * Find the index of the given (format, mask) key in
 * panvk_meta_copy_img2buf_fmts.  The key must come from
 * panvk_meta_copy_img2buf_format()/panvk_meta_copy_img_mask(), so a miss
 * is a programming error.
 */
static unsigned
panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)
{
   const unsigned nfmts = ARRAY_SIZE(panvk_meta_copy_img2buf_fmts);

   /* Linear scan: the table only has a handful of entries. */
   unsigned idx = 0;
   while (idx < nfmts) {
      if (memcmp(&panvk_meta_copy_img2buf_fmts[idx], &key, sizeof(key)) == 0)
         return idx;
      idx++;
   }

   unreachable("Invalid texel size\n");
}
/*
 * Record one VkBufferImageCopy region of vkCmdCopyImageToBuffer: emit the
 * copy-shader descriptors (UBO, push constants, texture, sampler) and
 * dispatch one compute job covering the region, in a batch of its own.
 */
static void
panvk_meta_copy_img2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   /* Raw format + component mask pick the pre-compiled shader variant. */
   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_img2buf_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned texdimidx =
      panvk_meta_copy_tex_type(img->pimage.layout.dim,
                               img->pimage.layout.array_size > 1);
   unsigned fmtidx = panvk_meta_copy_img2buf_format_idx(key);

   /* RSD and push-constant map were pre-baked at device init. */
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].pushmap;

   /* The dispatch is padded to 16x16 tiles: offsets are aligned down to 16
    * and the shader clips against the inclusive extent bounds. */
   struct panvk_meta_copy_img2buf_info info = {
      .buf.ptr = buf->bo->ptr.gpu + buf->bo_offset + region->bufferOffset,
      .buf.stride.line = (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz,
      .img.offset.x = MAX2(region->imageOffset.x & ~15, 0),
      .img.offset.y = MAX2(region->imageOffset.y & ~15, 0),
      .img.offset.z = MAX2(region->imageOffset.z, 0),
      .img.extent.minx = MAX2(region->imageOffset.x, 0),
      .img.extent.miny = MAX2(region->imageOffset.y, 0),
      .img.extent.maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0),
      .img.extent.maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0),
   };

   info.buf.stride.surf = (region->bufferImageHeight ? : region->imageExtent.height) *
                          info.buf.stride.line;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Cube maps are sampled as 2D arrays by the copy shader. */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
             MALI_TEXTURE_DIMENSION_2D : img->pimage.layout.dim,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .first_layer = region->imageSubresource.baseArrayLayer,
      .last_layer = region->imageSubresource.baseArrayLayer + region->imageSubresource.layerCount - 1,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   mali_ptr texture =
      panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &view);
   mali_ptr sampler =
      panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);

   /* The copy runs in its own batch: close any in-flight batch first. */
   if (cmdbuf->state.batch)
      panvk_per_arch(cmd_close_batch)(cmdbuf);

   panvk_cmd_open_batch(cmdbuf);

   struct panvk_batch *batch = cmdbuf->state.batch;
   struct pan_tls_info tlsinfo = { 0 };

   /* Track src/dst BOs so the batch takes the right references. */
   batch->blit.src = img->pimage.data.bo;
   batch->blit.dst = buf->bo;
   batch->tls =
      pan_pool_alloc_desc(&cmdbuf->desc_pool.base, LOCAL_STORAGE);
   GENX(pan_emit_tls)(&tlsinfo, batch->tls.cpu);

   mali_ptr tsd = batch->tls.gpu;

   /* 16x16 work-groups (16x1 for 1D images); one z slice per layer/depth. */
   struct pan_compute_dim wg_sz = {
      16,
      img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ? 1 : 16,
      1,
   };

   struct pan_compute_dim num_wg = {
     (ALIGN_POT(info.img.extent.maxx + 1, 16) - info.img.offset.x) / 16,
     img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ?
     1 : (ALIGN_POT(info.img.extent.maxy + 1, 16) - info.img.offset.y) / 16,
     MAX2(region->imageSubresource.layerCount, region->imageExtent.depth),
   };

   struct panfrost_ptr job =
      panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                       &batch->scoreboard, &num_wg, &wg_sz,
                                       texture, sampler,
                                       ubo, pushconsts,
                                       rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* Close the copy batch right away so subsequent commands start fresh. */
   if (cmdbuf->state.batch)
      panvk_per_arch(cmd_close_batch)(cmdbuf);
}
static void
panvk_meta_copy_img2buf_init(struct panvk_physical_device *dev)
{
STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2buf_fmts) == PANVK_META_COPY_IMG2BUF_NUM_FORMATS);
for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
for (unsigned texdim = 1; texdim <= 3; texdim++) {
unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
struct pan_shader_info shader_info;
mali_ptr shader =
panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
panvk_meta_copy_img2buf_fmts[i],
texdim, false, &shader_info);
dev->meta.copy.img2buf[texdimidx][i].pushmap = shader_info.push;
dev->meta.copy.img2buf[texdimidx][i].rsd =
panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
&dev->meta.desc_pool.base,
shader, &shader_info, true);
if (texdim == 3)
continue;
memset(&shader_info, 0, sizeof(shader_info));
texdimidx = panvk_meta_copy_tex_type(texdim, true);
assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
shader =
panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
panvk_meta_copy_img2buf_fmts[i],
texdim, true, &shader_info);
dev->meta.copy.img2buf[texdimidx][i].pushmap = shader_info.push;
dev->meta.copy.img2buf[texdimidx][i].rsd =
panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
&dev->meta.desc_pool.base,
shader, &shader_info, true);
}
}
}
void
panvk_per_arch(CmdCopyImageToBuffer)(VkCommandBuffer commandBuffer,
VkImage srcImage,
@ -1243,7 +1742,13 @@ panvk_per_arch(CmdCopyImageToBuffer)(VkCommandBuffer commandBuffer,
uint32_t regionCount,
const VkBufferImageCopy *pRegions)
{
panvk_stub();
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
VK_FROM_HANDLE(panvk_buffer, buf, destBuffer);
VK_FROM_HANDLE(panvk_image, img, srcImage);
for (unsigned i = 0; i < regionCount; i++) {
panvk_meta_copy_img2buf(cmdbuf, buf, img, &pRegions[i]);
}
}
void
@ -1281,4 +1786,5 @@ panvk_per_arch(meta_copy_init)(struct panvk_physical_device *dev)
{
panvk_meta_copy_img2img_init(dev);
panvk_meta_copy_buf2img_init(dev);
panvk_meta_copy_img2buf_init(dev);
}