diff --git a/src/amd/common/ac_surface_dcc_address_test.c b/src/amd/common/ac_surface_dcc_address_test.c
index ecd4b63ef97..72d1694bb8d 100644
--- a/src/amd/common/ac_surface_dcc_address_test.c
+++ b/src/amd/common/ac_surface_dcc_address_test.c
@@ -255,6 +255,13 @@ static bool one_dcc_address_test(const char *name, const char *test, ADDR_HANDLE
          addr = gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
                                          dout.metaBlkDepth, dout.pitch, dout.height,
                                          in.x, in.y, in.slice, in.sample, in.pipeXor);
+         if (in.sample == 1) {
+            /* Sample 0 should be one byte before sample 1. The DCC MSAA clear relies on it. */
+            assert(addr - 1 ==
+                   gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
+                                            dout.metaBlkDepth, dout.pitch, dout.height,
+                                            in.x, in.y, in.slice, 0, in.pipeXor));
+         }
       } else {
          addr = gfx10_dcc_addr_from_coord(info, dout.equation.gfx10_bits, in.bpp,
                                           dout.metaBlkWidth, dout.metaBlkHeight,
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index adeff6f4b73..82e7d2d428f 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -43,6 +43,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
    info->size = size;
    info->clear_value = clear_value;
    info->writemask = 0xffffffff;
+   info->is_dcc_msaa = false;
 }
 
 static void si_init_buffer_clear_rmw(struct si_clear_info *info,
@@ -75,6 +76,12 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
 
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
+      if (info[i].is_dcc_msaa) {
+         gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
+                             SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
+         continue;
+      }
+
       assert(info[i].size > 0);
 
       if (info[i].writemask != 0xffffffff) {
@@ -328,10 +335,13 @@ bool vi_dcc_get_clear_info(struct si_context *sctx, struct si_texture *tex, unsi
       if (tex->buffer.b.b.last_level > 0)
          return false;
 
-      /* 4x and 8x MSAA needs a sophisticated compute shader for
-       * the clear. See AMDVLK. */
-      if (tex->buffer.b.b.nr_storage_samples >= 4)
-         return false;
+      /* 4x and 8x MSAA need to clear only samples 0 and 1 in a compute shader and leave
+       * the other samples untouched (only the first 2 samples are compressed). */
+      if (tex->buffer.b.b.nr_storage_samples >= 4) {
+         si_init_buffer_clear(out, dcc_buffer, 0, 0, clear_value);
+         out->is_dcc_msaa = true;
+         return true;
+      }
 
       clear_size = tex->surface.meta_size;
    } else {
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 7ea7893b5bd..534b474a199 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -725,6 +725,78 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
    pipe_resource_reference(&saved_sb.buffer, NULL);
 }
 
+void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
+                         unsigned flags, enum si_coherency coher)
+{
+   struct pipe_context *ctx = &sctx->b;
+   struct si_texture *tex = (struct si_texture*)res;
+
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+   /* Save the compute shader and shader buffer slot 0, which are overwritten below. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_shader_buffer saved_sb = {};
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+   unsigned saved_writable_mask = 0;
+   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+       (1u << si_get_shaderbuf_slot(0)))
+      saved_writable_mask |= 1 << 0;
+
+   /* Bind the DCC metadata, starting at meta_offset, as writable shader buffer 0. */
+   assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
+   assert(tex->buffer.bo_size <= UINT_MAX);
+
+   struct pipe_shader_buffer sb = {};
+   sb.buffer = &tex->buffer.b.b;
+   sb.buffer_offset = tex->surface.meta_offset;
+   sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset;
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+
+   sctx->cs_user_data[0] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) |
+                           (tex->surface.u.gfx9.color.dcc_height << 16);
+   sctx->cs_user_data[1] = (clear_value & 0xffff) |
+                           ((uint32_t)tex->surface.tile_swizzle << 16);
+
+   /* These variables identify the shader variant. */
+   unsigned swizzle_mode = tex->surface.u.gfx9.swizzle_mode;
+   unsigned bpe_log2 = util_logbase2(tex->surface.bpe);
+   bool samples8 = tex->buffer.b.b.nr_storage_samples == 8;
+   bool is_array = tex->buffer.b.b.array_size > 1;
+   void **shader = &sctx->cs_clear_dcc_msaa[swizzle_mode][bpe_log2][samples8][is_array];
+
+   if (!*shader)
+      *shader = gfx9_create_clear_dcc_msaa_cs(sctx, tex);
+   ctx->bind_compute_state(ctx, *shader);
+
+   /* Dispatch compute: one invocation per DCC block, in 8x8x1 workgroups. */
+   unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width);
+   unsigned height = DIV_ROUND_UP(tex->buffer.b.b.height0, tex->surface.u.gfx9.color.dcc_block_height);
+   unsigned depth = DIV_ROUND_UP(tex->buffer.b.b.array_size, tex->surface.u.gfx9.color.dcc_block_depth);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = 8;
+   info.block[1] = 8;
+   info.block[2] = 1;
+   info.last_block[0] = width % info.block[0];
+   info.last_block[1] = height % info.block[1];
+   info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
+   info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
+   info.grid[2] = depth;
+
+   si_launch_grid_internal(sctx, &info, saved_cs, flags);
+
+   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, tex->surface.meta_size);
+
+   if (flags & SI_OP_SYNC_AFTER)
+      sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;
+
+   /* Restore shader buffer slot 0 and release saved_sb's buffer reference. */
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+   pipe_resource_reference(&saved_sb.buffer, NULL);
+}
+
 /* Expand FMASK to make it identity, so that image stores can ignore it. */
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
 {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ede75601f4a..6037558c213 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -268,6 +268,17 @@ static void si_destroy_context(struct pipe_context *context)
       }
    }
 
+   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
+         for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {
+            for (unsigned l = 0; l < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j][k]); l++) {
+               if (sctx->cs_clear_dcc_msaa[i][j][k][l])
+                  sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_dcc_msaa[i][j][k][l]);
+            }
+         }
+      }
+   }
+
    if (sctx->blitter)
       util_blitter_destroy(sctx->blitter);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 12d806093b2..555535c13db 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1316,6 +1316,10 @@ struct si_context {
    bool thread_trace_enabled;
 
    unsigned context_flags;
+
+   /* Shaders. */
+   /* TODO: move other shaders here too */
+   void *cs_clear_dcc_msaa[32][5][2][2]; /* [swizzle_mode][log2(bpe)][samples == 8][is_array] */
 };
 
 /* si_blit.c */
@@ -1368,6 +1372,7 @@ struct si_clear_info {
    uint32_t size;
    uint32_t clear_value;
    uint32_t writemask;
+   bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
 };
 
 enum pipe_format si_simplify_cb_format(enum pipe_format format);
@@ -1423,6 +1428,8 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
                                     unsigned dsty, unsigned width, unsigned height,
                                     bool render_condition_enabled);
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
+void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
+                         unsigned flags, enum si_coherency coher);
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
 void si_init_compute_blit_functions(struct si_context *sctx);
 
@@ -1539,6 +1546,7 @@ void si_resume_queries(struct si_context *sctx);
 
 /* si_shaderlib_nir.c */
 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
+void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
 
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index 49d07fa3f11..06387cf6188 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -100,3 +100,49 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
 
    return create_nir_cs(sctx, &b);
 }
+
+void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
+{
+   const nir_shader_compiler_options *options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
+
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_dcc_msaa");
+   b.shader->info.cs.local_size[0] = 8;
+   b.shader->info.cs.local_size[1] = 8;
+   b.shader->info.cs.local_size[2] = 1;
+   b.shader->info.cs.user_data_components_amd = 2;
+   b.shader->info.num_ssbos = 1;
+
+   /* Get user data SGPRs: [0] = pitch | height << 16, [1] = clear value | pipe_xor << 16. */
+   nir_ssa_def *user_sgprs = nir_load_user_data_amd(&b);
+   nir_ssa_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
+   unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
+   unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
+   clear_value = nir_u2u16(&b, clear_value);
+
+   /* Get the 3D coordinates (x, y, slice) in DCC block units. */
+   nir_ssa_def *coord = get_global_ids(&b, 3);
+   nir_ssa_def *zero = nir_imm_int(&b, 0);
+
+   /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
+   coord = nir_imul(&b, coord,
+                    nir_channels(&b, nir_imm_ivec4(&b, tex->surface.u.gfx9.color.dcc_block_width,
+                                                   tex->surface.u.gfx9.color.dcc_block_height,
+                                                   tex->surface.u.gfx9.color.dcc_block_depth, 0), 0x7));
+
+   nir_ssa_def *offset =
+      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
+                                 &tex->surface.u.gfx9.color.dcc_equation,
+                                 dcc_pitch, dcc_height, zero, /* DCC slice size */
+                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
+                                 tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
+                                 zero, pipe_xor); /* sample, pipe_xor */
+
+   /* The trick here is that DCC elements for an even and the next odd sample are next to each other
+    * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
+    * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
+    */
+   nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);
+
+   return create_nir_cs(sctx, &b);
+}