asahi: switch to block based compute blitter

This fixes the compute blitter with compression in the general case, and then
flips the switch since the compute blitter is faster / less buggy than the
traditional path.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30633>
commit 6405153a07 (parent 94cf7de88f)
Author: Alyssa Rosenzweig
Date:   2024-08-07 13:11:37 -04:00

7 changed files with 218 additions and 65 deletions


@@ -50,7 +50,6 @@ static const struct debug_named_value agx_debug_options[] = {
 #ifndef NDEBUG
    {"dirty", AGX_DBG_DIRTY, "Disable dirty tracking"},
 #endif
-   {"compblit", AGX_DBG_COMPBLIT, "Enable compute blitter"},
    {"precompile",AGX_DBG_PRECOMPILE,"Precompile shaders for shader-db"},
    {"nocompress",AGX_DBG_NOCOMPRESS,"Disable lossless compression"},
    {"nocluster", AGX_DBG_NOCLUSTER,"Disable vertex clustering"},


@@ -41,7 +41,7 @@ enum agx_dbg {
    AGX_DBG_NOSHADOW = BITFIELD_BIT(16),
    /* bit 17 unused */
    AGX_DBG_SCRATCH = BITFIELD_BIT(18),
-   AGX_DBG_COMPBLIT = BITFIELD_BIT(19),
+   /* bit 19 unused */
    AGX_DBG_FEEDBACK = BITFIELD_BIT(20),
    AGX_DBG_1QUEUE = BITFIELD_BIT(21),
 };


@@ -20,18 +20,43 @@
 #include "pipe/p_state.h"
 #include "util/format/u_format.h"
 #include "util/format/u_formats.h"
+#include "util/hash_table.h"
 #include "util/macros.h"
+#include "util/ralloc.h"
 #include "util/u_sampler.h"
 #include "util/u_surface.h"
 #include "agx_formats.h"
 #include "agx_internal_formats.h"
 #include "agx_state.h"
 #include "glsl_types.h"
 #include "nir.h"
 #include "nir_builder_opcodes.h"
 #include "shader_enums.h"
 
-#define BLIT_WG_SIZE 32
+/* For block based blit kernels, we hardcode the maximum tile size which we can
+ * always achieve. This simplifies our life.
+ */
+#define TILE_WIDTH 32
+#define TILE_HEIGHT 32
+
+static enum pipe_format
+effective_format(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_Z32_FLOAT:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return PIPE_FORMAT_R32_FLOAT;
+   case PIPE_FORMAT_Z16_UNORM:
+      return PIPE_FORMAT_R16_UNORM;
+   case PIPE_FORMAT_S8_UINT:
+      return PIPE_FORMAT_R8_UINT;
+   default:
+      return format;
+   }
+}
+
 static void *
-asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp,
-                          bool array)
+asahi_blit_compute_shader(struct pipe_context *ctx, struct asahi_blit_key *key)
 {
    const nir_shader_compiler_options *options =
       ctx->screen->get_compiler_options(ctx->screen, PIPE_SHADER_IR_NIR,
@@ -40,8 +65,8 @@ asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp,
    nir_builder b_ =
       nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "blit_cs");
    nir_builder *b = &b_;
 
-   b->shader->info.workgroup_size[0] = BLIT_WG_SIZE;
-   b->shader->info.workgroup_size[1] = BLIT_WG_SIZE;
+   b->shader->info.workgroup_size[0] = TILE_WIDTH;
+   b->shader->info.workgroup_size[1] = TILE_HEIGHT;
    b->shader->info.num_ubos = 1;
 
    BITSET_SET(b->shader->info.textures_used, 0);
@@ -50,51 +75,153 @@ asahi_blit_compute_shader(struct pipe_context *ctx, enum asahi_blit_clamp clamp,
    nir_def *zero = nir_imm_int(b, 0);
 
-   nir_def *params[3];
+   nir_def *params[4];
    b->shader->num_uniforms = ARRAY_SIZE(params);
 
    for (unsigned i = 0; i < b->shader->num_uniforms; ++i) {
      params[i] = nir_load_ubo(b, 2, 32, zero, nir_imm_int(b, i * 8),
                               .align_mul = 4, .range = ~0);
    }
 
-   nir_def *ids =
-      nir_trim_vector(b, nir_load_global_invocation_id(b, 32), array ? 3 : 2);
+   nir_def *trans_offs = params[0];
+   nir_def *trans_scale = params[1];
+   nir_def *dst_offs_2d = params[2];
+   nir_def *dimensions_el_2d = params[3];
 
-   nir_def *tex_pos = nir_u2f32(b, ids);
-   nir_def *pos2 =
-      nir_ffma(b, nir_trim_vector(b, tex_pos, 2), params[1], params[0]);
-
-   if (array) {
-      tex_pos = nir_vector_insert_imm(b, nir_pad_vector(b, pos2, 3),
-                                      nir_channel(b, tex_pos, 2), 2);
-   } else {
-      tex_pos = pos2;
+   nir_def *phys_id_el_nd = nir_trim_vector(
+      b, nir_load_global_invocation_id(b, 32), key->array ? 3 : 2);
+   nir_def *phys_id_el_2d = nir_trim_vector(b, phys_id_el_nd, 2);
+   nir_def *layer = key->array ? nir_channel(b, phys_id_el_nd, 2) : NULL;
+
+   /* Offset within the tile. We're dispatched for the entire tile but the
+    * beginning might be out-of-bounds, so fix up.
+    */
+   nir_def *offs_in_tile_el_2d = nir_iand_imm(b, dst_offs_2d, 31);
+   nir_def *logical_id_el_2d = nir_isub(b, phys_id_el_2d, offs_in_tile_el_2d);
+   nir_def *image_pos_2d = nir_iadd(b, logical_id_el_2d, dst_offs_2d);
+   nir_def *image_pos_nd = image_pos_2d;
+
+   if (layer) {
+      image_pos_nd =
+         nir_vector_insert_imm(b, nir_pad_vector(b, image_pos_nd, 3), layer, 2);
    }
 
-   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
-   tex->dest_type = nir_type_uint32; /* irrelevant */
-   tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
-   tex->is_array = array;
-   tex->op = nir_texop_tex;
-   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
-   tex->backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP;
-   tex->coord_components = array ? 3 : 2;
-   tex->texture_index = 0;
-   tex->sampler_index = 0;
-   nir_def_init(&tex->instr, &tex->def, 4, 32);
-   nir_builder_instr_insert(b, &tex->instr);
-   nir_def *color = &tex->def;
+   nir_def *in_bounds = nir_ige(b, logical_id_el_2d, nir_imm_ivec2(b, 0, 0));
+   in_bounds =
+      nir_iand(b, in_bounds, nir_ilt(b, logical_id_el_2d, dimensions_el_2d));
+
+   nir_def *colour0, *colour1;
+   nir_push_if(b, nir_ball(b, in_bounds));
+   {
+      /* For pixels within the copy area, texture from the source */
+      nir_def *coords_el_2d =
+         nir_ffma(b, nir_u2f32(b, logical_id_el_2d), trans_scale, trans_offs);
+      nir_def *coords_el_nd = coords_el_2d;
+
+      if (layer) {
+         coords_el_nd = nir_vector_insert_imm(
+            b, nir_pad_vector(b, coords_el_nd, 3), nir_u2f32(b, layer), 2);
+      }
+
+      nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
+      tex->dest_type = nir_type_uint32; /* irrelevant */
+      tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
+      tex->is_array = key->array;
+      tex->op = nir_texop_tex;
+      tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coords_el_nd);
+      tex->backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP;
+      tex->coord_components = coords_el_nd->num_components;
+      tex->texture_index = 0;
+      tex->sampler_index = 0;
+      nir_def_init(&tex->instr, &tex->def, 4, 32);
+      nir_builder_instr_insert(b, &tex->instr);
+      colour0 = &tex->def;
+   }
+   nir_push_else(b, NULL);
+   {
+      /* For out-of-bounds pixels, copy in the destination */
+      colour1 = nir_image_load(
+         b, 4, 32, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos_nd), zero, zero,
+         .image_array = key->array, .image_dim = GLSL_SAMPLER_DIM_2D,
+         .access = ACCESS_IN_BOUNDS_AGX, .dest_type = nir_type_uint32);
+   }
+   nir_pop_if(b, NULL);
+
+   nir_def *color = nir_if_phi(b, colour0, colour1);
+
+   enum asahi_blit_clamp clamp = ASAHI_BLIT_CLAMP_NONE;
+   bool src_sint = util_format_is_pure_sint(key->src_format);
+   bool dst_sint = util_format_is_pure_sint(key->dst_format);
+   if (util_format_is_pure_integer(key->src_format) &&
+       util_format_is_pure_integer(key->dst_format)) {
+
+      if (src_sint && !dst_sint)
+         clamp = ASAHI_BLIT_CLAMP_SINT_TO_UINT;
+      else if (!src_sint && dst_sint)
+         clamp = ASAHI_BLIT_CLAMP_UINT_TO_SINT;
+   }
 
    if (clamp == ASAHI_BLIT_CLAMP_SINT_TO_UINT)
       color = nir_imax(b, color, nir_imm_int(b, 0));
    else if (clamp == ASAHI_BLIT_CLAMP_UINT_TO_SINT)
      color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));
 
-   nir_def *image_pos =
-      nir_iadd(b, ids, nir_pad_vector_imm_int(b, params[2], 0, array ? 3 : 2));
+   nir_def *local_offset = nir_imm_intN_t(b, 0, 16);
+   nir_def *lid = nir_trim_vector(b, nir_load_local_invocation_id(b), 2);
+   lid = nir_u2u16(b, lid);
 
-   nir_image_store(b, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos), zero,
-                   color, zero, .image_dim = GLSL_SAMPLER_DIM_2D,
-                   .access = ACCESS_NON_READABLE, .image_array = array);
+   /* Pure integer formats need to be clamped in software, at least in some
+    * cases. We do so on store. Piglit gl-3.0-render-integer checks this, as
+    * does KHR-GL33.packed_pixels.*.
+    *
+    * TODO: Make this common code somehow.
+    */
+   const struct util_format_description *desc =
+      util_format_description(key->dst_format);
+   unsigned c = util_format_get_first_non_void_channel(key->dst_format);
+
+   if (desc->channel[c].size <= 16 &&
+       util_format_is_pure_integer(key->dst_format)) {
+
+      unsigned bits[4] = {
+         desc->channel[0].size ?: desc->channel[0].size,
+         desc->channel[1].size ?: desc->channel[0].size,
+         desc->channel[2].size ?: desc->channel[0].size,
+         desc->channel[3].size ?: desc->channel[0].size,
+      };
+
+      if (util_format_is_pure_sint(key->dst_format))
+         color = nir_format_clamp_sint(b, color, bits);
+      else
+         color = nir_format_clamp_uint(b, color, bits);
+
+      color = nir_u2u16(b, color);
+   }
+
+   /* The source texel has been converted into a 32-bit value. We need to
+    * convert it to a tilebuffer format that can then be converted to the
+    * destination format in the PBE hardware. That's the renderable format for
+    * the destination format, which must exist along this path. This mirrors
+    * the flow of fragment and end-of-tile shaders.
+    */
+   enum pipe_format tib_format =
+      agx_pixel_format[effective_format(key->dst_format)].renderable;
+
+   nir_store_local_pixel_agx(b, color, nir_imm_int(b, 1), lid, .base = 0,
+                             .write_mask = 0xf, .format = tib_format,
+                             .explicit_coord = true);
+
+   nir_barrier(b, .execution_scope = SCOPE_WORKGROUP);
+
+   nir_push_if(b, nir_ball(b, nir_ieq_imm(b, lid, 0)));
+   {
+      nir_def *pbe_index = nir_imm_int(b, 2);
+
+      nir_block_image_store_agx(
+         b, pbe_index, local_offset, image_pos_nd, .format = tib_format,
+         .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = key->array,
+         .explicit_coord = true);
+   }
+   nir_pop_if(b, NULL);
+
+   b->shader->info.cs.image_block_size_per_thread_agx =
+      util_format_get_blocksize(key->dst_format);
 
    return pipe_shader_from_nir(ctx, b->shader);
 }
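
Editor's note: the tile-offset fixup in the kernel above is the heart of the
block-based scheme and is easiest to follow with concrete numbers. Below is a
minimal scalar C sketch of the same arithmetic; it is not part of the commit,
and the destination offset 37 is an invented example:

#include <stdio.h>

#define TILE_WIDTH 32

int main(void)
{
   int dst_offs = 37;                              /* info->dst.box.x */
   int offs_in_tile = dst_offs & (TILE_WIDTH - 1); /* 37 & 31 = 5 */

   /* The grid is dispatched from the tile-aligned coordinate 32, so the
    * first 5 threads of the tile fall left of the copy area. In the shader
    * they fail the in_bounds test, reload the destination pixel via
    * image_load, and write it back unchanged; thread 5 is the first one
    * that actually samples the source.
    */
   for (int phys_id = 0; phys_id < 8; ++phys_id) {
      int logical_id = phys_id - offs_in_tile;
      int image_pos = logical_id + dst_offs;
      printf("phys %d -> logical %d -> dst x = %d (%s)\n", phys_id,
             logical_id, image_pos,
             logical_id < 0 ? "out-of-bounds: reload dst" : "copy from src");
   }

   return 0;
}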
@@ -219,6 +346,8 @@ asahi_compute_blit(struct pipe_context *ctx, const struct pipe_blit_info *info,
       fui(y_scale),
       info->dst.box.x,
       info->dst.box.y,
+      info->dst.box.width,
+      info->dst.box.height,
    };
 
    struct pipe_constant_buffer cb = {
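
Editor's note: the two added words feed the shader's new params[3]
(dimensions_el_2d). Each params[i] is a vec2 of 32-bit words read at byte
offset i * 8, giving the layout sketched below. The texcoord scale/offset
words precede this hunk, so their exact order is an assumption here:

/*
 * words 0-3: texcoord transform           -> params[0] trans_offs,
 *                                            params[1] trans_scale
 * words 4-5: dst.box.x, dst.box.y         -> params[2] dst_offs_2d
 * words 6-7: dst.box.width/height (added) -> params[3] dimensions_el_2d
 */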
@@ -273,32 +402,51 @@ asahi_compute_blit(struct pipe_context *ctx, const struct pipe_blit_info *info,
    src_view = ctx->create_sampler_view(ctx, src, &src_templ);
    ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, true, &src_view);
 
-   enum asahi_blit_clamp clamp = ASAHI_BLIT_CLAMP_NONE;
-   bool src_sint = util_format_is_pure_sint(info->src.format);
-   bool dst_sint = util_format_is_pure_sint(info->dst.format);
-   if (util_format_is_pure_integer(info->src.format) &&
-       util_format_is_pure_integer(info->dst.format)) {
+   struct asahi_blit_key key = {
+      .src_format = info->src.format,
+      .dst_format = info->dst.format,
+      .array = array,
+   };
+   struct hash_entry *ent = _mesa_hash_table_search(blitter->blit_cs, &key);
+   void *cs = NULL;
 
-      if (src_sint && !dst_sint)
-         clamp = ASAHI_BLIT_CLAMP_SINT_TO_UINT;
-      else if (!src_sint && dst_sint)
-         clamp = ASAHI_BLIT_CLAMP_UINT_TO_SINT;
+   if (ent) {
+      cs = ent->data;
+   } else {
+      cs = asahi_blit_compute_shader(ctx, &key);
+      _mesa_hash_table_insert(
+         blitter->blit_cs, ralloc_memdup(blitter->blit_cs, &key, sizeof(key)),
+         cs);
    }
 
-   if (!blitter->blit_cs[clamp][array]) {
-      blitter->blit_cs[clamp][array] =
-         asahi_blit_compute_shader(ctx, clamp, array);
-   }
+   assert(cs != NULL);
+   ctx->bind_compute_state(ctx, cs);
 
-   ctx->bind_compute_state(ctx, blitter->blit_cs[clamp][array]);
+   /* Expand the grid so destinations are in tiles */
+   unsigned expanded_x0 = info->dst.box.x & ~(TILE_WIDTH - 1);
+   unsigned expanded_y0 = info->dst.box.y & ~(TILE_HEIGHT - 1);
+   unsigned expanded_x1 =
+      align(info->dst.box.x + info->dst.box.width, TILE_WIDTH);
+   unsigned expanded_y1 =
+      align(info->dst.box.y + info->dst.box.height, TILE_HEIGHT);
+
+   /* But clamp to the destination size to save some redundant threads */
+   expanded_x1 =
+      MIN2(expanded_x1, u_minify(info->dst.resource->width0, info->dst.level));
+   expanded_y1 =
+      MIN2(expanded_y1, u_minify(info->dst.resource->height0, info->dst.level));
+
+   /* Recalculate the width/height based on the expanded grid */
+   width = expanded_x1 - expanded_x0;
+   height = expanded_y1 - expanded_y0;
 
    struct pipe_grid_info grid_info = {
-      .block = {BLIT_WG_SIZE, BLIT_WG_SIZE, 1},
-      .last_block = {width % BLIT_WG_SIZE, height % BLIT_WG_SIZE, 1},
+      .block = {TILE_WIDTH, TILE_HEIGHT, 1},
+      .last_block = {width % TILE_WIDTH, height % TILE_HEIGHT, 1},
       .grid =
         {
-            DIV_ROUND_UP(width, BLIT_WG_SIZE),
-            DIV_ROUND_UP(height, BLIT_WG_SIZE),
+            DIV_ROUND_UP(width, TILE_WIDTH),
+            DIV_ROUND_UP(height, TILE_HEIGHT),
            depth,
         },
    };
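
Editor's note: a worked example of the grid expansion above, runnable as
plain C. The box values are invented and align_pot() is a local stand-in for
Mesa's align() helper:

#include <stdio.h>

#define TILE_WIDTH 32
#define TILE_HEIGHT 32

static unsigned align_pot(unsigned x, unsigned a)
{
   return (x + a - 1) & ~(a - 1); /* power-of-two alignment */
}

int main(void)
{
   /* Example destination: dst.box = {x=37, y=5, width=100, height=60} */
   unsigned x = 37, y = 5, w = 100, h = 60;

   unsigned x0 = x & ~(TILE_WIDTH - 1);         /* 32          */
   unsigned y0 = y & ~(TILE_HEIGHT - 1);        /* 0           */
   unsigned x1 = align_pot(x + w, TILE_WIDTH);  /* 137 -> 160  */
   unsigned y1 = align_pot(y + h, TILE_HEIGHT); /* 65  -> 96   */

   /* Every 32x32 workgroup now maps onto exactly one destination tile,
    * which is what the block image store in the kernel requires.
    */
   printf("grid = %u x %u workgroups\n", (x1 - x0) / TILE_WIDTH,
          (y1 - y0) / TILE_HEIGHT); /* 4 x 3 */

   return 0;
}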
@@ -378,11 +526,7 @@ agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
    agx_legalize_compression(ctx, agx_resource(info->src.resource),
                             info->src.format);
 
-   if (asahi_compute_blit_supported(info) &&
-       (agx_device(pipe->screen)->debug & AGX_DBG_COMPBLIT) &&
-       !(ail_is_compressed(&agx_resource(info->dst.resource)->layout) &&
-         util_format_get_blocksize(info->dst.format) == 16)) {
+   if (asahi_compute_blit_supported(info)) {
       asahi_compute_blit(pipe, info, &ctx->compute_blitter);
       return;
    }


@@ -20,7 +20,7 @@
 #include "agx_state.h"
 
 /* Flags that are allowed and do not disable the disk cache */
-#define ALLOWED_FLAGS (AGX_DBG_NO16 | AGX_DBG_COMPBLIT)
+#define ALLOWED_FLAGS (AGX_DBG_NO16)
 
 /**
  * Compute a disk cache key for the given uncompiled shader and shader key.


@@ -1919,6 +1919,7 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
    agx_init_meta_shaders(ctx);
 
    ctx->blitter = util_blitter_create(pctx);
+   ctx->compute_blitter.blit_cs = asahi_blit_key_table_create(ctx);
 
    ctx->result_buf =
       agx_bo_create(agx_device(screen),


@@ -2767,9 +2767,13 @@ agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs,
       struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view);
 
       /* For the texture descriptor, lower cubes to 2D arrays. This matches the
-       * transform done in the compiler.
+       * transform done in the compiler. Also, force 2D arrays for internal
+       * blitter images; this helps reduce shader variants.
        */
-      if (target_is_cube(sampler_view.target))
+      bool internal = (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
+
+      if (target_is_cube(sampler_view.target) ||
+          (sampler_view.target == PIPE_TEXTURE_3D && internal))
          sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
 
       agx_pack_texture(texture, agx_resource(view->resource), view->format,


@@ -575,11 +575,16 @@ enum asahi_blit_clamp {
    ASAHI_BLIT_CLAMP_COUNT,
 };
 
+struct asahi_blit_key {
+   enum pipe_format src_format, dst_format;
+   bool array;
+};
+DERIVE_HASH_TABLE(asahi_blit_key);
+
 struct asahi_blitter {
    bool active;
 
-   /* [clamp_type][is_array] */
-   void *blit_cs[ASAHI_BLIT_CLAMP_COUNT][2];
+   struct hash_table *blit_cs;
 
    /* [filter] */
    void *sampler[2];
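
Editor's note: DERIVE_HASH_TABLE is the util/hash_table.h helper that stamps
out a byte-wise hash, a byte-wise equality check, and a table constructor for
a plain-old-data key type. Roughly, the expansion looks like the sketch below
(within the Mesa tree; not the verbatim macro):

#include <string.h>
#include "util/hash_table.h"

static uint32_t
asahi_blit_key_hash(const void *key)
{
   return _mesa_hash_data(key, sizeof(struct asahi_blit_key));
}

static bool
asahi_blit_key_equal(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct asahi_blit_key)) == 0;
}

static struct hash_table *
asahi_blit_key_table_create(void *memctx)
{
   return _mesa_hash_table_create(memctx, asahi_blit_key_hash,
                                  asahi_blit_key_equal);
}

Because hashing and comparison run over the raw bytes of the struct, keys
should be fully zeroed before use (the designated initializer in
asahi_compute_blit zeroes the unnamed members), and an inserted key must
outlive the table itself; hence the ralloc_memdup of the stack-allocated key
before _mesa_hash_table_insert.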