diff --git a/src/asahi/lib/agx_device.c b/src/asahi/lib/agx_device.c index b4b6ecbe66e..09752128364 100644 --- a/src/asahi/lib/agx_device.c +++ b/src/asahi/lib/agx_device.c @@ -50,7 +50,6 @@ static const struct debug_named_value agx_debug_options[] = { #ifndef NDEBUG {"dirty", AGX_DBG_DIRTY, "Disable dirty tracking"}, #endif - {"compblit", AGX_DBG_COMPBLIT, "Enable compute blitter"}, {"precompile",AGX_DBG_PRECOMPILE,"Precompile shaders for shader-db"}, {"nocompress",AGX_DBG_NOCOMPRESS,"Disable lossless compression"}, {"nocluster", AGX_DBG_NOCLUSTER,"Disable vertex clustering"}, diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h index bc489b108cf..db166c6124a 100644 --- a/src/asahi/lib/agx_device.h +++ b/src/asahi/lib/agx_device.h @@ -41,7 +41,7 @@ enum agx_dbg { AGX_DBG_NOSHADOW = BITFIELD_BIT(16), /* bit 17 unused */ AGX_DBG_SCRATCH = BITFIELD_BIT(18), - AGX_DBG_COMPBLIT = BITFIELD_BIT(19), + /* bit 19 unused */ AGX_DBG_FEEDBACK = BITFIELD_BIT(20), AGX_DBG_1QUEUE = BITFIELD_BIT(21), }; diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index 1015d4bf241..a95b307c8d7 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -20,18 +20,43 @@ #include "pipe/p_state.h" #include "util/format/u_format.h" #include "util/format/u_formats.h" +#include "util/hash_table.h" #include "util/macros.h" +#include "util/ralloc.h" #include "util/u_sampler.h" #include "util/u_surface.h" #include "agx_formats.h" +#include "agx_internal_formats.h" #include "agx_state.h" +#include "glsl_types.h" +#include "nir.h" +#include "nir_builder_opcodes.h" #include "shader_enums.h" -#define BLIT_WG_SIZE 32 +/* For block based blit kernels, we hardcode the maximum tile size which we can + * always achieve. This simplifies our life. 
+ */ +#define TILE_WIDTH 32 +#define TILE_HEIGHT 32 + +static enum pipe_format +effective_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z24X8_UNORM: + return PIPE_FORMAT_R32_FLOAT; + case PIPE_FORMAT_Z16_UNORM: + return PIPE_FORMAT_R16_UNORM; + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_R8_UINT; + default: + return format; + } +} static void * -asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp, - bool array) +asahi_blit_compute_shader(struct pipe_context *ctx, struct asahi_blit_key *key) { const nir_shader_compiler_options *options = ctx->screen->get_compiler_options(ctx->screen, PIPE_SHADER_IR_NIR, @@ -40,8 +65,8 @@ asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp, nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "blit_cs"); nir_builder *b = &b_; - b->shader->info.workgroup_size[0] = BLIT_WG_SIZE; - b->shader->info.workgroup_size[1] = BLIT_WG_SIZE; + b->shader->info.workgroup_size[0] = TILE_WIDTH; + b->shader->info.workgroup_size[1] = TILE_HEIGHT; b->shader->info.num_ubos = 1; BITSET_SET(b->shader->info.textures_used, 0); @@ -50,51 +75,153 @@ asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp, nir_def *zero = nir_imm_int(b, 0); - nir_def *params[3]; + nir_def *params[4]; b->shader->num_uniforms = ARRAY_SIZE(params); for (unsigned i = 0; i < b->shader->num_uniforms; ++i) { params[i] = nir_load_ubo(b, 2, 32, zero, nir_imm_int(b, i * 8), .align_mul = 4, .range = ~0); } - nir_def *ids = - nir_trim_vector(b, nir_load_global_invocation_id(b, 32), array ? 
3 : 2); + nir_def *trans_offs = params[0]; + nir_def *trans_scale = params[1]; + nir_def *dst_offs_2d = params[2]; + nir_def *dimensions_el_2d = params[3]; - nir_def *tex_pos = nir_u2f32(b, ids); - nir_def *pos2 = - nir_ffma(b, nir_trim_vector(b, tex_pos, 2), params[1], params[0]); - if (array) { - tex_pos = nir_vector_insert_imm(b, nir_pad_vector(b, pos2, 3), - nir_channel(b, tex_pos, 2), 2); - } else { - tex_pos = pos2; + nir_def *phys_id_el_nd = nir_trim_vector( + b, nir_load_global_invocation_id(b, 32), key->array ? 3 : 2); + nir_def *phys_id_el_2d = nir_trim_vector(b, phys_id_el_nd, 2); + nir_def *layer = key->array ? nir_channel(b, phys_id_el_nd, 2) : NULL; + + /* Offset within the tile. We're dispatched for the entire tile but the + * beginning might be out-of-bounds, so fix up. + */ + nir_def *offs_in_tile_el_2d = nir_iand_imm(b, dst_offs_2d, 31); + nir_def *logical_id_el_2d = nir_isub(b, phys_id_el_2d, offs_in_tile_el_2d); + + nir_def *image_pos_2d = nir_iadd(b, logical_id_el_2d, dst_offs_2d); + nir_def *image_pos_nd = image_pos_2d; + if (layer) { + image_pos_nd = + nir_vector_insert_imm(b, nir_pad_vector(b, image_pos_nd, 3), layer, 2); } - nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); - tex->dest_type = nir_type_uint32; /* irrelevant */ - tex->sampler_dim = GLSL_SAMPLER_DIM_2D; - tex->is_array = array; - tex->op = nir_texop_tex; - tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); - tex->backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP; - tex->coord_components = array ? 
3 : 2; - tex->texture_index = 0; - tex->sampler_index = 0; - nir_def_init(&tex->instr, &tex->def, 4, 32); - nir_builder_instr_insert(b, &tex->instr); - nir_def *color = &tex->def; + nir_def *in_bounds = nir_ige(b, logical_id_el_2d, nir_imm_ivec2(b, 0, 0)); + in_bounds = + nir_iand(b, in_bounds, nir_ilt(b, logical_id_el_2d, dimensions_el_2d)); + + nir_def *colour0, *colour1; + nir_push_if(b, nir_ball(b, in_bounds)); + { + /* For pixels within the copy area, texture from the source */ + nir_def *coords_el_2d = + nir_ffma(b, nir_u2f32(b, logical_id_el_2d), trans_scale, trans_offs); + + nir_def *coords_el_nd = coords_el_2d; + if (layer) { + coords_el_nd = nir_vector_insert_imm( + b, nir_pad_vector(b, coords_el_nd, 3), nir_u2f32(b, layer), 2); + } + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); + tex->dest_type = nir_type_uint32; /* irrelevant */ + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + tex->is_array = key->array; + tex->op = nir_texop_tex; + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coords_el_nd); + tex->backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP; + tex->coord_components = coords_el_nd->num_components; + tex->texture_index = 0; + tex->sampler_index = 0; + nir_def_init(&tex->instr, &tex->def, 4, 32); + nir_builder_instr_insert(b, &tex->instr); + colour0 = &tex->def; + } + nir_push_else(b, NULL); + { + /* For out-of-bounds pixels, copy in the destination */ + colour1 = nir_image_load( + b, 4, 32, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos_nd), zero, zero, + .image_array = key->array, .image_dim = GLSL_SAMPLER_DIM_2D, + .access = ACCESS_IN_BOUNDS_AGX, .dest_type = nir_type_uint32); + } + nir_pop_if(b, NULL); + nir_def *color = nir_if_phi(b, colour0, colour1); + + enum asahi_blit_clamp clamp = ASAHI_BLIT_CLAMP_NONE; + bool src_sint = util_format_is_pure_sint(key->src_format); + bool dst_sint = util_format_is_pure_sint(key->dst_format); + if (util_format_is_pure_integer(key->src_format) && + util_format_is_pure_integer(key->dst_format)) { + + if 
(src_sint && !dst_sint) + clamp = ASAHI_BLIT_CLAMP_SINT_TO_UINT; + else if (!src_sint && dst_sint) + clamp = ASAHI_BLIT_CLAMP_UINT_TO_SINT; + } if (clamp == ASAHI_BLIT_CLAMP_SINT_TO_UINT) color = nir_imax(b, color, nir_imm_int(b, 0)); else if (clamp == ASAHI_BLIT_CLAMP_UINT_TO_SINT) color = nir_umin(b, color, nir_imm_int(b, INT32_MAX)); - nir_def *image_pos = - nir_iadd(b, ids, nir_pad_vector_imm_int(b, params[2], 0, array ? 3 : 2)); + nir_def *local_offset = nir_imm_intN_t(b, 0, 16); + nir_def *lid = nir_trim_vector(b, nir_load_local_invocation_id(b), 2); + lid = nir_u2u16(b, lid); - nir_image_store(b, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos), zero, - color, zero, .image_dim = GLSL_SAMPLER_DIM_2D, - .access = ACCESS_NON_READABLE, .image_array = array); + /* Pure integer formats need to be clamped in software, at least in some + * cases. We do so on store. Piglit gl-3.0-render-integer checks this, as + * does KHR-GL33.packed_pixels.*. + * + * TODO: Make this common code somehow. + */ + const struct util_format_description *desc = + util_format_description(key->dst_format); + unsigned c = util_format_get_first_non_void_channel(key->dst_format); + + if (desc->channel[c].size <= 16 && + util_format_is_pure_integer(key->dst_format)) { + + unsigned bits[4] = { + desc->channel[0].size ?: desc->channel[0].size, + desc->channel[1].size ?: desc->channel[0].size, + desc->channel[2].size ?: desc->channel[0].size, + desc->channel[3].size ?: desc->channel[0].size, + }; + + if (util_format_is_pure_sint(key->dst_format)) + color = nir_format_clamp_sint(b, color, bits); + else + color = nir_format_clamp_uint(b, color, bits); + + color = nir_u2u16(b, color); + } + + /* The source texel has been converted into a 32-bit value. We need to + * convert it to a tilebuffer format that can then be converted to the + * destination format in the PBE hardware. That's the renderable format for + * the destination format, which must exist along this path.
This mirrors the + * flow of fragment and end-of-tile shaders. + */ + enum pipe_format tib_format = + agx_pixel_format[effective_format(key->dst_format)].renderable; + + nir_store_local_pixel_agx(b, color, nir_imm_int(b, 1), lid, .base = 0, + .write_mask = 0xf, .format = tib_format, + .explicit_coord = true); + + nir_barrier(b, .execution_scope = SCOPE_WORKGROUP); + + nir_push_if(b, nir_ball(b, nir_ieq_imm(b, lid, 0))); + { + nir_def *pbe_index = nir_imm_int(b, 2); + nir_block_image_store_agx( + b, pbe_index, local_offset, image_pos_nd, .format = tib_format, + .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = key->array, + .explicit_coord = true); + } + nir_pop_if(b, NULL); + b->shader->info.cs.image_block_size_per_thread_agx = + util_format_get_blocksize(key->dst_format); return pipe_shader_from_nir(ctx, b->shader); } @@ -219,6 +346,8 @@ asahi_compute_blit(struct pipe_context *ctx, const struct pipe_blit_info *info, fui(y_scale), info->dst.box.x, info->dst.box.y, + info->dst.box.width, + info->dst.box.height, }; struct pipe_constant_buffer cb = { @@ -273,32 +402,51 @@ asahi_compute_blit(struct pipe_context *ctx, const struct pipe_blit_info *info, src_view = ctx->create_sampler_view(ctx, src, &src_templ); ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, true, &src_view); - enum asahi_blit_clamp clamp = ASAHI_BLIT_CLAMP_NONE; - bool src_sint = util_format_is_pure_sint(info->src.format); - bool dst_sint = util_format_is_pure_sint(info->dst.format); - if (util_format_is_pure_integer(info->src.format) && - util_format_is_pure_integer(info->dst.format)) { + struct asahi_blit_key key = { + .src_format = info->src.format, + .dst_format = info->dst.format, + .array = array, + }; + struct hash_entry *ent = _mesa_hash_table_search(blitter->blit_cs, &key); + void *cs = NULL; - if (src_sint && !dst_sint) - clamp = ASAHI_BLIT_CLAMP_SINT_TO_UINT; - else if (!src_sint && dst_sint) - clamp = ASAHI_BLIT_CLAMP_UINT_TO_SINT; + if (ent) { + cs = ent->data; + } else { + cs = 
asahi_blit_compute_shader(ctx, &key); + _mesa_hash_table_insert( + blitter->blit_cs, ralloc_memdup(blitter->blit_cs, &key, sizeof(key)), + cs); } - if (!blitter->blit_cs[clamp][array]) { - blitter->blit_cs[clamp][array] = - asahi_blit_compute_shader(ctx, clamp, array); - } + assert(cs != NULL); + ctx->bind_compute_state(ctx, cs); - ctx->bind_compute_state(ctx, blitter->blit_cs[clamp][array]); + /* Expand the grid so destinations are in tiles */ + unsigned expanded_x0 = info->dst.box.x & ~(TILE_WIDTH - 1); + unsigned expanded_y0 = info->dst.box.y & ~(TILE_HEIGHT - 1); + unsigned expanded_x1 = + align(info->dst.box.x + info->dst.box.width, TILE_WIDTH); + unsigned expanded_y1 = + align(info->dst.box.y + info->dst.box.height, TILE_HEIGHT); + + /* But clamp to the destination size to save some redundant threads */ + expanded_x1 = + MIN2(expanded_x1, u_minify(info->dst.resource->width0, info->dst.level)); + expanded_y1 = + MIN2(expanded_y1, u_minify(info->dst.resource->height0, info->dst.level)); + + /* Recalculate the width/height based on the expanded grid */ + width = expanded_x1 - expanded_x0; + height = expanded_y1 - expanded_y0; struct pipe_grid_info grid_info = { - .block = {BLIT_WG_SIZE, BLIT_WG_SIZE, 1}, - .last_block = {width % BLIT_WG_SIZE, height % BLIT_WG_SIZE, 1}, + .block = {TILE_WIDTH, TILE_HEIGHT, 1}, + .last_block = {width % TILE_WIDTH, height % TILE_HEIGHT, 1}, .grid = { - DIV_ROUND_UP(width, BLIT_WG_SIZE), - DIV_ROUND_UP(height, BLIT_WG_SIZE), + DIV_ROUND_UP(width, TILE_WIDTH), + DIV_ROUND_UP(height, TILE_HEIGHT), depth, }, }; @@ -378,11 +526,7 @@ agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) agx_legalize_compression(ctx, agx_resource(info->src.resource), info->src.format); - if (asahi_compute_blit_supported(info) && - (agx_device(pipe->screen)->debug & AGX_DBG_COMPBLIT) && - !(ail_is_compressed(&agx_resource(info->dst.resource)->layout) && - util_format_get_blocksize(info->dst.format) == 16)) { - + if 
(asahi_compute_blit_supported(info)) { asahi_compute_blit(pipe, info, &ctx->compute_blitter); return; } diff --git a/src/gallium/drivers/asahi/agx_disk_cache.c b/src/gallium/drivers/asahi/agx_disk_cache.c index 906daa01ce6..8efa8e6a7fa 100644 --- a/src/gallium/drivers/asahi/agx_disk_cache.c +++ b/src/gallium/drivers/asahi/agx_disk_cache.c @@ -20,7 +20,7 @@ #include "agx_state.h" /* Flags that are allowed and do not disable the disk cache */ -#define ALLOWED_FLAGS (AGX_DBG_NO16 | AGX_DBG_COMPBLIT) +#define ALLOWED_FLAGS (AGX_DBG_NO16) /** * Compute a disk cache key for the given uncompiled shader and shader key. diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index a6ce72b0f81..0713ecf2db0 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -1919,6 +1919,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags) agx_init_meta_shaders(ctx); ctx->blitter = util_blitter_create(pctx); + ctx->compute_blitter.blit_cs = asahi_blit_key_table_create(ctx); ctx->result_buf = agx_bo_create(agx_device(screen), diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 939828cd172..b9799ac4a4f 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -2767,9 +2767,13 @@ agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs, struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view); /* For the texture descriptor, lower cubes to 2D arrays. This matches the - * transform done in the compiler. + * transform done in the compiler. Also, force 2D arrays for internal + * blitter images, this helps reduce shader variants. 
*/ - if (target_is_cube(sampler_view.target)) + bool internal = (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL); + + if (target_is_cube(sampler_view.target) || + (sampler_view.target == PIPE_TEXTURE_3D && internal)) sampler_view.target = PIPE_TEXTURE_2D_ARRAY; agx_pack_texture(texture, agx_resource(view->resource), view->format, diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 496a50b1b2b..403ba72273b 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -575,11 +575,16 @@ enum asahi_blit_clamp { ASAHI_BLIT_CLAMP_COUNT, }; +struct asahi_blit_key { + enum pipe_format src_format, dst_format; + bool array; +}; + +DERIVE_HASH_TABLE(asahi_blit_key); + struct asahi_blitter { bool active; - - /* [clamp_type][is_array] */ - void *blit_cs[ASAHI_BLIT_CLAMP_COUNT][2]; + struct hash_table *blit_cs; /* [filter] */ void *sampler[2];