diff --git a/src/asahi/vulkan/hk_cmd_meta.c b/src/asahi/vulkan/hk_cmd_meta.c
index 53b9058da75..9d06c6cb0f9 100644
--- a/src/asahi/vulkan/hk_cmd_meta.c
+++ b/src/asahi/vulkan/hk_cmd_meta.c
@@ -17,12 +17,19 @@
 #include "layout.h"
 #include "nir_builder.h"
+#include "nir_builder_opcodes.h"
 #include "nir_format_convert.h"
 #include "shader_enums.h"
 #include "vk_format.h"
 #include "vk_meta.h"
 #include "vk_pipeline.h"
 
+/* For block based blit kernels, we hardcode the maximum tile size which we can
+ * always achieve. This simplifies our life.
+ */
+#define TILE_WIDTH 32
+#define TILE_HEIGHT 32
+
 static VkResult
 hk_cmd_bind_map_buffer(struct vk_command_buffer *vk_cmd,
                        struct vk_meta_device *meta, VkBuffer _buffer,
@@ -250,9 +257,9 @@ aspect_format(VkFormat fmt, VkImageAspectFlags aspect)
  * the unfortunate exception).
  */
 static enum pipe_format
-canonical_format_pipe(enum pipe_format fmt)
+canonical_format_pipe(enum pipe_format fmt, bool canonicalize_zs)
 {
-   if (util_format_is_depth_or_stencil(fmt))
+   if (!canonicalize_zs && util_format_is_depth_or_stencil(fmt))
       return fmt;
 
    assert(ail_is_valid_pixel_format(fmt));
@@ -269,7 +276,7 @@ canonical_format_pipe(enum pipe_format fmt)
    /* clang-format off */
    static enum pipe_format map[] = {
       CASE(R8, R8_UINT),
-      CASE(R16, R16_UINT),
+      CASE(R16, R16_UNORM /* XXX: Hack for Z16 copies */),
       CASE(R8G8, R8G8_UINT),
       CASE(R5G6B5, R5G6B5_UNORM),
       CASE(R4G4B4A4, R4G4B4A4_UNORM),
@@ -298,7 +305,7 @@ static VkFormat
 canonical_format(VkFormat fmt)
 {
    return vk_format_from_pipe_format(
-      canonical_format_pipe(vk_format_to_pipe_format(fmt)));
+      canonical_format_pipe(vk_format_to_pipe_format(fmt), false));
 }
 
 enum copy_type {
@@ -328,6 +335,7 @@ struct vk_meta_image_copy_key {
    enum pipe_format src_format, dst_format;
    unsigned block_size;
    unsigned nr_samples;
+   bool block_based;
 };
 
 static nir_def *
@@ -358,8 +366,8 @@ build_image_copy_shader(const struct vk_meta_image_copy_key *key)
       nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "vk-meta-copy");
    nir_builder *b = &build;
 
-   b->shader->info.workgroup_size[0] = 32;
-   b->shader->info.workgroup_size[1] = 32;
+   b->shader->info.workgroup_size[0] = TILE_WIDTH;
+   b->shader->info.workgroup_size[1] = TILE_HEIGHT;
 
    bool src_is_buf = key->type == BUF2IMG;
    bool dst_is_buf = key->type == IMG2BUF;
@@ -397,46 +405,119 @@ build_image_copy_shader(const struct vk_meta_image_copy_key *key)
       b, 3, 32, nir_imm_int(b, offsetof(struct vk_meta_push_data, dst_offset_el)));
 
-   nir_def *grid_el = nir_load_push_constant(
-      b, 3, 32, nir_imm_int(b, offsetof(struct vk_meta_push_data, grid_el)));
+   nir_def *grid_2d_el = nir_load_push_constant(
+      b, 2, 32, nir_imm_int(b, offsetof(struct vk_meta_push_data, grid_el)));
 
    /* We're done setting up variables, do the copy */
    nir_def *coord = nir_load_global_invocation_id(b, 32);
 
-   nir_push_if(b,
-               nir_ball(b, nir_trim_vector(b, nir_ult(b, coord, grid_el), 2)));
-   {
-      nir_def *src_coord = nir_iadd(b, coord, src_offset_el);
-      nir_def *dst_coord = nir_iadd(b, coord, dst_offset_el);
+   /* The destination format is already canonical, convert to an ISA format */
+   enum pipe_format isa_format;
+   if (key->block_based) {
+      isa_format =
+         ail_pixel_format[canonical_format_pipe(key->dst_format, true)]
+            .renderable;
+      assert(isa_format != PIPE_FORMAT_NONE);
+   }
 
-      /* Special case handle buffer indexing */
-      if (dst_is_buf) {
-         dst_coord = linearize_coords(b, coord, key);
-      } else if (src_is_buf) {
-         src_coord = linearize_coords(b, coord, key);
-      }
+   nir_def *local_offset = nir_imm_intN_t(b, 0, 16);
+   nir_def *lid = nir_trim_vector(b, nir_load_local_invocation_id(b), 2);
+   lid = nir_u2u16(b, lid);
 
-      /* Copy formatted texel from texture to storage image */
-      for (unsigned s = 0; s < key->nr_samples; ++s) {
+   nir_def *src_coord = src_is_buf ? coord : nir_iadd(b, coord, src_offset_el);
+   nir_def *dst_coord = dst_is_buf ? coord : nir_iadd(b, coord, dst_offset_el);
+
+   nir_def *image_deref = &nir_build_deref_var(b, image)->def;
+
+   nir_def *coord_2d_el = nir_trim_vector(b, coord, 2);
+   nir_def *in_bounds;
+   if (key->block_based) {
+      nir_def *offset_in_block_el =
+         nir_umod_imm(b, nir_trim_vector(b, dst_offset_el, 2), TILE_WIDTH);
+
+      dst_coord =
+         nir_vector_insert_imm(b, nir_isub(b, dst_coord, offset_in_block_el),
+                               nir_channel(b, dst_coord, 2), 2);
+
+      src_coord =
+         nir_vector_insert_imm(b, nir_isub(b, src_coord, offset_in_block_el),
+                               nir_channel(b, src_coord, 2), 2);
+
+      in_bounds = nir_uge(b, coord_2d_el, offset_in_block_el);
+      in_bounds = nir_iand(
+         b, in_bounds,
+         nir_ult(b, coord_2d_el, nir_iadd(b, offset_in_block_el, grid_2d_el)));
+   } else {
+      in_bounds = nir_ult(b, coord_2d_el, grid_2d_el);
+   }
+
+   /* Special case handle buffer indexing */
+   if (dst_is_buf) {
+      assert(!key->block_based);
+      dst_coord = linearize_coords(b, dst_coord, key);
+   } else if (src_is_buf) {
+      src_coord = linearize_coords(b, src_coord, key);
+   }
+
+   for (unsigned s = 0; s < key->nr_samples; ++s) {
+      nir_def *ms_index = nir_imm_int(b, s);
+      nir_def *value1, *value2;
+
+      nir_push_if(b, nir_ball(b, in_bounds));
+      {
+         /* Copy formatted texel from texture to storage image */
          nir_deref_instr *deref = nir_build_deref_var(b, texture);
-         nir_def *ms_index = nir_imm_int(b, s);
-         nir_def *value = msaa ? nir_txf_ms_deref(b, deref, src_coord, ms_index)
-                               : nir_txf_deref(b, deref, src_coord, NULL);
+         value1 = msaa ? nir_txf_ms_deref(b, deref, src_coord, ms_index)
+                       : nir_txf_deref(b, deref, src_coord, NULL);
 
          /* Munge according to the implicit conversions so we get a bit copy */
          if (key->src_format != key->dst_format) {
-            nir_def *packed = nir_format_pack_rgba(b, key->src_format, value);
-            value = nir_format_unpack_rgba(b, packed, key->dst_format);
+            nir_def *packed = nir_format_pack_rgba(b, key->src_format, value1);
+            value1 = nir_format_unpack_rgba(b, packed, key->dst_format);
          }
 
-         nir_image_deref_store(b, &nir_build_deref_var(b, image)->def,
-                               nir_pad_vec4(b, dst_coord), ms_index, value,
-                               nir_imm_int(b, 0), .image_dim = dim_dst,
-                               .image_array = !dst_is_buf);
+         if (!key->block_based) {
+            nir_image_deref_store(b, image_deref, nir_pad_vec4(b, dst_coord),
+                                  ms_index, value1, nir_imm_int(b, 0),
+                                  .image_dim = dim_dst,
+                                  .image_array = !dst_is_buf);
+         }
+      }
+      nir_push_else(b, NULL);
+      if (key->block_based) {
+         /* Copy back the existing destination content */
+         value2 = nir_image_deref_load(b, 4, 32, image_deref,
+                                       nir_pad_vec4(b, dst_coord), ms_index,
+                                       nir_imm_int(b, 0), .image_dim = dim_dst,
+                                       .image_array = !dst_is_buf);
+      }
+      nir_pop_if(b, NULL);
+
+      if (key->block_based) {
+         nir_store_local_pixel_agx(b, nir_if_phi(b, value1, value2),
+                                   nir_imm_int(b, 1 << s), lid, .base = 0,
+                                   .write_mask = 0xf, .format = isa_format,
+                                   .explicit_coord = true);
       }
    }
-   nir_pop_if(b, NULL);
+
+   if (key->block_based) {
+      assert(!dst_is_buf);
+
+      nir_barrier(b, .execution_scope = SCOPE_WORKGROUP);
+
+      nir_push_if(b, nir_ball(b, nir_ieq_imm(b, lid, 0)));
+      {
+         nir_image_deref_store_block_agx(
+            b, image_deref, local_offset, dst_coord, .format = isa_format,
+            .image_dim = dim_2d, .image_array = true, .explicit_coord = true);
+      }
+      nir_pop_if(b, NULL);
+      b->shader->info.cs.image_block_size_per_thread_agx =
+         util_format_get_blocksize(key->dst_format);
+   }
+
    return b->shader;
 }
 
@@ -765,6 +846,42 @@ hk_meta_copy_image_to_buffer2(struct vk_command_buffer *cmd,
    }
 }
 
+static void
+hk_meta_dispatch_to_image(struct vk_command_buffer *cmd,
+                          const struct vk_device_dispatch_table *disp,
+                          VkPipelineLayout pipeline_layout,
+                          struct vk_meta_push_data *push, VkOffset3D offset,
+                          VkExtent3D extent, bool per_layer, unsigned layers,
+                          enum pipe_format p_dst_fmt, enum pipe_format p_format)
+{
+   push->dst_offset_el[0] = util_format_get_nblocksx(p_dst_fmt, offset.x);
+   push->dst_offset_el[1] = util_format_get_nblocksy(p_dst_fmt, offset.y);
+   push->dst_offset_el[2] = 0;
+
+   push->grid_el[0] = util_format_get_nblocksx(p_format, extent.width);
+   push->grid_el[1] = util_format_get_nblocksy(p_format, extent.height);
+   push->grid_el[2] = per_layer ? 1 : layers;
+
+   unsigned w_el = util_format_get_nblocksx(p_format, extent.width);
+   unsigned h_el = util_format_get_nblocksy(p_format, extent.height);
+
+   /* Expand the grid so destinations are in tiles */
+   unsigned expanded_x0 = push->dst_offset_el[0] & ~(TILE_WIDTH - 1);
+   unsigned expanded_y0 = push->dst_offset_el[1] & ~(TILE_HEIGHT - 1);
+   unsigned expanded_x1 = align(push->dst_offset_el[0] + w_el, TILE_WIDTH);
+   unsigned expanded_y1 = align(push->dst_offset_el[1] + h_el, TILE_HEIGHT);
+
+   /* TODO: clamp to the destination size to save some redundant threads? */
+
+   disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), pipeline_layout,
+                          VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(*push), push);
+
+   disp->CmdDispatch(vk_command_buffer_to_handle(cmd),
+                     (expanded_x1 - expanded_x0) / TILE_WIDTH,
+                     (expanded_y1 - expanded_y0) / TILE_HEIGHT,
+                     push->grid_el[2]);
+}
+
 static void
 hk_meta_copy_buffer_to_image2(struct vk_command_buffer *cmd,
                               struct vk_meta_device *meta,
@@ -808,7 +925,13 @@ hk_meta_copy_buffer_to_image2(struct vk_command_buffer *cmd,
       .block_size = blocksize_B,
       .nr_samples = image->samples,
       .src_format = vk_format_to_pipe_format(canonical),
-      .dst_format = vk_format_to_pipe_format(canonical),
+      .dst_format = canonical_format_pipe(
+         vk_format_to_pipe_format(aspect_format(image->format, aspect)),
+         false),
+
+      /* TODO: MSAA path */
+      .block_based =
+         (image->image_type != VK_IMAGE_TYPE_1D) && image->samples == 1,
    };
 
    VkPipelineLayout pipeline_layout;
@@ -933,28 +1056,13 @@ hk_meta_copy_buffer_to_image2(struct vk_command_buffer *cmd,
             .buffer_offset = region->bufferOffset,
             .row_extent = row_extent,
             .slice_or_layer_extent = is_3d ? slice_extent : layer_extent,
-
-            .dst_offset_el[0] =
-               util_format_get_nblocksx(p_format, region->imageOffset.x),
-            .dst_offset_el[1] =
-               util_format_get_nblocksy(p_format, region->imageOffset.y),
-
-            .grid_el[0] =
-               util_format_get_nblocksx(p_format, region->imageExtent.width),
-            .grid_el[1] =
-               util_format_get_nblocksy(p_format, region->imageExtent.height),
-            .grid_el[2] = per_layer ? 1 : layers,
          };
 
          push.buffer_offset += push.slice_or_layer_extent * layer_offs;
 
-         disp->CmdPushConstants(vk_command_buffer_to_handle(cmd),
-                                pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
-                                sizeof(push), &push);
-
-         disp->CmdDispatch(vk_command_buffer_to_handle(cmd),
-                           DIV_ROUND_UP(push.grid_el[0], 32),
-                           DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]);
+         hk_meta_dispatch_to_image(cmd, disp, pipeline_layout, &push,
                                    region->imageOffset, region->imageExtent,
                                    per_layer, layers, p_format, p_format);
       }
    }
 }
@@ -1012,8 +1120,14 @@ hk_meta_copy_image2(struct vk_command_buffer *cmd, struct vk_meta_device *meta,
       .block_size = blocksize_B,
       .nr_samples = dst_image->samples,
       .src_format = vk_format_to_pipe_format(canonical),
-      .dst_format = canonical_format_pipe(vk_format_to_pipe_format(
-         aspect_format(dst_image->format, dst_aspect_mask))),
+      .dst_format =
+         canonical_format_pipe(vk_format_to_pipe_format(aspect_format(
+                                  dst_image->format, dst_aspect_mask)),
+                               false),
+
+      /* TODO: MSAA path */
+      .block_based = (dst_image->image_type != VK_IMAGE_TYPE_1D) &&
+                     dst_image->samples == 1,
    };
 
    assert(key.nr_samples == src_image->samples);
@@ -1147,27 +1261,11 @@ hk_meta_copy_image2(struct vk_command_buffer *cmd, struct vk_meta_device *meta,
                util_format_get_nblocksx(p_src_fmt, region->srcOffset.x),
             .src_offset_el[1] =
                util_format_get_nblocksy(p_src_fmt, region->srcOffset.y),
-
-            .dst_offset_el[0] =
-               util_format_get_nblocksx(p_dst_fmt, region->dstOffset.x),
-            .dst_offset_el[1] =
-               util_format_get_nblocksy(p_dst_fmt, region->dstOffset.y),
-
-            .grid_el[0] =
-               util_format_get_nblocksx(p_format, region->extent.width),
-            .grid_el[1] =
-               util_format_get_nblocksy(p_format, region->extent.height),
-            .grid_el[2] = per_layer ? 1 : layers,
          };
 
-         disp->CmdPushConstants(vk_command_buffer_to_handle(cmd),
-                                pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT,
-                                0, sizeof(push), &push);
-
-         disp->CmdDispatch(vk_command_buffer_to_handle(cmd),
-                           DIV_ROUND_UP(push.grid_el[0], 32),
-                           DIV_ROUND_UP(push.grid_el[1], 32),
-                           push.grid_el[2]);
+         hk_meta_dispatch_to_image(cmd, disp, pipeline_layout, &push,
                                    region->dstOffset, region->extent,
                                    per_layer, layers, p_dst_fmt, p_format);
       }
    }
 }
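
Reviewer note (not part of the patch): below is a small standalone sketch of the grid expansion hk_meta_dispatch_to_image performs. The destination rectangle [dst_offset_el, dst_offset_el + extent_el) is widened to TILE_WIDTH x TILE_HEIGHT alignment so that each 32x32 workgroup maps to exactly one destination tile. The align_pot helper and the example numbers are invented here as stand-ins for Mesa's align() and a real region.

#include <stdio.h>

#define TILE_WIDTH  32
#define TILE_HEIGHT 32

/* Local stand-in for Mesa's align(): round v up to a power-of-two multiple. */
static unsigned
align_pot(unsigned v, unsigned pot)
{
   return (v + pot - 1) & ~(pot - 1);
}

int
main(void)
{
   /* Example: copy a 100x70 element region to destination offset (37, 5). */
   unsigned dst_x = 37, dst_y = 5;
   unsigned w_el = 100, h_el = 70;

   /* Expand to tile-aligned bounds, as hk_meta_dispatch_to_image does. */
   unsigned x0 = dst_x & ~(TILE_WIDTH - 1);             /* 32  */
   unsigned y0 = dst_y & ~(TILE_HEIGHT - 1);            /*  0  */
   unsigned x1 = align_pot(dst_x + w_el, TILE_WIDTH);   /* 160 */
   unsigned y1 = align_pot(dst_y + h_el, TILE_HEIGHT);  /* 96  */

   /* One 32x32 workgroup per destination tile. */
   printf("dispatch %u x %u workgroups\n",
          (x1 - x0) / TILE_WIDTH, (y1 - y0) / TILE_HEIGHT);  /* 4 x 3 */
   return 0;
}

The push constants still carry the unexpanded offset and grid, which is why the shader derives offset_in_block_el itself and masks out invocations that fall outside the real copy region rather than relying on the dispatch size.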
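
A second illustrative model (again not part of the patch; the helper name and parameters are invented): the scalar equivalent of the in_bounds predicate built in build_image_copy_shader for the block-based path, shown for one axis. Threads that fail this test take the else branch and re-read the existing destination texel, so the whole-tile write via nir_store_local_pixel_agx and nir_image_deref_store_block_agx puts back unchanged data for pixels outside the copy region.

#include <stdbool.h>

#define TILE_WIDTH 32

/* Scalar model of the shader's bounds check: dst_offset is the destination
 * offset in elements, grid the copy extent in elements, and x the thread's
 * global invocation id relative to the tile-aligned dispatch origin.
 */
static bool
copies_from_source(unsigned x, unsigned dst_offset, unsigned grid)
{
   /* offset_in_block_el in the shader: how far the copy origin sits into its
    * 32-wide tile.
    */
   unsigned offset_in_block = dst_offset % TILE_WIDTH;

   /* Mirrors the nir_uge / nir_ult pair on coord_2d_el. */
   return x >= offset_in_block && x < offset_in_block + grid;
}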