radeonsi: use compute shaders for clear_buffer & copy_buffer
Fast color clears should be much faster. Also, fast color clears on evicted buffers should be 200x faster on GFX8 and older.
This commit is contained in:
parent 5030adcbe0
commit 9b331e462e
8 changed files with 350 additions and 203 deletions
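For orientation before reading the hunks: si_clear_buffer previously took the clear value as a single dword by value; it now takes a pointer plus an explicit value size in bytes, so the compute path can clear with wide stores. A minimal sketch of the call-site migration, using a hypothetical buffer buf:

	/* Before: dword value passed directly. */
	si_clear_buffer(sctx, buf, offset, size, 0, SI_COHERENCY_SHADER);

	/* After: value passed by pointer, with its size in bytes. */
	uint32_t clear_value = 0;
	si_clear_buffer(sctx, buf, offset, size, &clear_value, 4,
			SI_COHERENCY_SHADER);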
src/gallium/drivers/radeonsi/Makefile.sources

@@ -11,6 +11,7 @@ C_SOURCES := \
 	si_clear.c \
 	si_compute.c \
 	si_compute.h \
+	si_compute_blit.c \
 	si_cp_dma.c \
 	si_debug.c \
 	si_descriptors.c \
src/gallium/drivers/radeonsi/meson.build

@@ -27,6 +27,7 @@ files_libradeonsi = files(
 	'si_clear.c',
 	'si_compute.c',
 	'si_compute.h',
+	'si_compute_blit.c',
 	'si_cp_dma.c',
 	'si_debug.c',
 	'si_descriptors.c',
src/gallium/drivers/radeonsi/si_clear.c

@@ -256,7 +256,7 @@ void vi_dcc_clear_level(struct si_context *sctx,
 	}

 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			clear_value, SI_COHERENCY_CB_META);
+			&clear_value, 4, SI_COHERENCY_CB_META);
 }

 /* Set the same micro tile mode as the destination of the last MSAA resolve.

@@ -487,9 +487,10 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 		if (eliminate_needed)
 			continue;

+		uint32_t clear_value = 0xCCCCCCCC;
 		si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 				tex->cmask_offset, tex->surface.cmask_size,
-				0xCCCCCCCC, SI_COHERENCY_CB_META);
+				&clear_value, 4, SI_COHERENCY_CB_META);
 		need_decompress_pass = true;
 	}

@@ -518,9 +519,10 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 			continue;

 		/* Do the fast clear. */
+		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-				tex->cmask_offset, tex->surface.cmask_size, 0,
-				SI_COHERENCY_CB_META);
+				tex->cmask_offset, tex->surface.cmask_size,
+				&clear_value, 4, SI_COHERENCY_CB_META);
 		need_decompress_pass = true;
 	}
|
|||
285
src/gallium/drivers/radeonsi/si_compute_blit.c
Normal file
285
src/gallium/drivers/radeonsi/si_compute_blit.c
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
static enum si_cache_policy get_cache_policy(struct si_context *sctx,
					     enum si_coherency coher,
					     uint64_t size)
{
	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
					  coher == SI_COHERENCY_CP)) ||
	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

	return L2_BYPASS;
}

unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
			    enum si_cache_policy cache_policy)
{
	switch (coher) {
	default:
	case SI_COHERENCY_NONE:
	case SI_COHERENCY_CP:
		return 0;
	case SI_COHERENCY_SHADER:
		return SI_CONTEXT_INV_SMEM_L1 |
		       SI_CONTEXT_INV_VMEM_L1 |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
	case SI_COHERENCY_CB_META:
		return SI_CONTEXT_FLUSH_AND_INV_CB;
	}
}

static void si_compute_do_clear_or_copy(struct si_context *sctx,
					struct pipe_resource *dst,
					unsigned dst_offset,
					struct pipe_resource *src,
					unsigned src_offset,
					unsigned size,
					const uint32_t *clear_value,
					unsigned clear_value_size,
					enum si_coherency coher)
{
	struct pipe_context *ctx = &sctx->b;

	assert(src_offset % 4 == 0);
	assert(dst_offset % 4 == 0);
	assert(size % 4 == 0);

	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
	assert(!src || src_offset + size <= src->width0);

	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
	si_emit_cache_flush(sctx);

	/* Save states. */
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_shader_buffer saved_sb[2] = {};
	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

	/* The memory accesses are coalesced, meaning that the 1st instruction writes
	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
	 * writes the 2nd contiguous block of data, etc.
	 */
	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = dwords_per_thread * 64;

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_offset = dst_offset;
	sb[0].buffer_size = size;

	if (src) {
		sb[1].buffer = src;
		sb[1].buffer_offset = src_offset;
		sb[1].buffer_size = size;

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
	} else {
		assert(clear_value_size >= 4 &&
		       clear_value_size <= 16 &&
		       util_is_power_of_two_or_zero(clear_value_size));

		for (unsigned i = 0; i < 4; i++)
			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
	}

	ctx->launch_grid(ctx, &info);

	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

	if (cache_policy != L2_BYPASS)
		r600_resource(dst)->TC_L2_dirty = true;

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
}
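/* Worked example of the dispatch math above: clearing 1 MiB with
 * SI_COMPUTE_CLEAR_DW_PER_THREAD = 4 gives instructions_per_thread = 1,
 * dwords_per_instruction = 4 and dwords_per_wave = 4 * 64 = 256.
 * num_dwords = 1048576 / 4 = 262144 and num_instructions = 65536, so
 * info.block[0] = MIN2(64, 65536) = 64 and
 * info.grid[0] = DIV_ROUND_UP(262144, 256) = 1024: 1024 workgroups of
 * 64 threads, each thread issuing one 16-byte (4-dword) store.
 */
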
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
		     uint64_t offset, uint64_t size, uint32_t *clear_value,
		     uint32_t clear_value_size, enum si_coherency coher)
{
	if (!size)
		return;

	unsigned clear_alignment = MIN2(clear_value_size, 4);

	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
	assert(offset % clear_alignment == 0);
	assert(size % clear_alignment == 0);
	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

	/* Reduce a large clear value size if possible. */
	if (clear_value_size > 4) {
		bool clear_dword_duplicated = true;

		/* See if we can lower large fills to dword fills. */
		for (unsigned i = 1; i < clear_value_size / 4; i++) {
			if (clear_value[0] != clear_value[i]) {
				clear_dword_duplicated = false;
				break;
			}
		}
		if (clear_dword_duplicated)
			clear_value_size = 4;
	}

	/* Expand a small clear value size. */
	uint32_t tmp_clear_value;
	if (clear_value_size <= 2) {
		if (clear_value_size == 1) {
			tmp_clear_value = *(uint8_t*)clear_value;
			tmp_clear_value |= (tmp_clear_value << 8) |
					   (tmp_clear_value << 16) |
					   (tmp_clear_value << 24);
		} else {
			tmp_clear_value = *(uint16_t*)clear_value;
			tmp_clear_value |= tmp_clear_value << 16;
		}
		clear_value = &tmp_clear_value;
		clear_value_size = 4;
	}

	/* Use transform feedback for 12-byte clears. */
	/* TODO: Use compute. */
	if (clear_value_size == 12) {
		union pipe_color_union streamout_clear_value;

		memcpy(&streamout_clear_value, clear_value, clear_value_size);
		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(sctx->blitter, dst, offset,
					  size, clear_value_size / 4,
					  &streamout_clear_value);
		si_blitter_end(sctx);
		return;
	}

	uint64_t aligned_size = size & ~3ull;
	if (aligned_size >= 4) {
		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
		 * use CP DMA clears on those chips, because we can't be certain
		 * about buffer placements.
		 */
		if (clear_value_size > 4 ||
		    (clear_value_size == 4 &&
		     offset % 4 == 0 &&
		     (size > 32*1024 || sctx->chip_class <= VI))) {
			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
						    aligned_size, clear_value,
						    clear_value_size, coher);
		} else {
			assert(clear_value_size == 4);
			si_cp_dma_clear_buffer(sctx, dst, offset,
					       aligned_size, *clear_value, coher,
					       get_cache_policy(sctx, coher, size));
		}

		offset += aligned_size;
		size -= aligned_size;
	}

	/* Handle non-dword alignment. */
	if (size) {
		assert(dst);
		assert(dst->target == PIPE_BUFFER);
		assert(size < 4);

		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
	}
}
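/* Example of the non-dword tail handling above: clearing 10 bytes with the
 * 2-byte value 0xABCD first expands the value to the dword 0xABCDABCD,
 * clears the 8 dword-aligned bytes through compute or CP DMA, and then
 * writes the remaining 2 bytes with pipe_buffer_write(), so the byte
 * pattern continues seamlessly across the tail.
 */
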
static void si_pipe_clear_buffer(struct pipe_context *ctx,
				 struct pipe_resource *dst,
				 unsigned offset, unsigned size,
				 const void *clear_value,
				 int clear_value_size)
{
	enum si_coherency coher;

	if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
		coher = SI_COHERENCY_CP;
	else
		coher = SI_COHERENCY_SHADER;

	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
			clear_value_size, coher);
}

void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
	if (!size)
		return;

	enum si_coherency coher = SI_COHERENCY_SHADER;
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

	/* Only use compute for VRAM copies on dGPUs. */
	if (sctx->screen->info.has_dedicated_vram &&
	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
	    size > 32 * 1024 &&
	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
					    size, NULL, 0, coher);
	} else {
		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
				      0, coher, cache_policy);
	}
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
	sctx->b.clear_buffer = si_pipe_clear_buffer;
}
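The clear-value canonicalization in si_clear_buffer is self-contained enough to test outside the driver. Below is a standalone sketch of the same logic; the helper names expand_clear_value and can_lower_to_dword are hypothetical, not Mesa API, and the code assumes a little-endian host. It compiles with any C compiler:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Expand a 1- or 2-byte clear value to a dword by replication, mirroring
 * the "Expand a small clear value size" block in si_clear_buffer. */
static uint32_t expand_clear_value(const void *value, unsigned size)
{
	uint32_t v = 0;

	switch (size) {
	case 1:
		v = *(const uint8_t *)value;
		return v | (v << 8) | (v << 16) | (v << 24);
	case 2:
		v = *(const uint16_t *)value;
		return v | (v << 16);
	default:
		memcpy(&v, value, 4);
		return v;
	}
}

/* Return 1 if a larger clear value is the same dword repeated and can be
 * lowered to a 4-byte clear, mirroring "Reduce a large clear value size
 * if possible". */
static int can_lower_to_dword(const uint32_t *value, unsigned size)
{
	for (unsigned i = 1; i < size / 4; i++) {
		if (value[0] != value[i])
			return 0;
	}
	return 1;
}

int main(void)
{
	uint8_t b = 0xCC;
	uint16_t h = 0xABCD;
	uint32_t quad[4] = {0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF};

	printf("0x%08x\n", expand_clear_value(&b, 1)); /* 0xcccccccc */
	printf("0x%08x\n", expand_clear_value(&h, 2)); /* 0xabcdabcd */
	printf("%d\n", can_lower_to_dword(quad, 16));  /* 1 */
	return 0;
}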
src/gallium/drivers/radeonsi/si_cp_dma.c

@@ -25,12 +25,6 @@
 #include "si_pipe.h"
 #include "sid.h"

-/* Recommended maximum sizes for optimal performance.
- * Fall back to compute or SDMA if the size is greater.
- */
-#define CP_DMA_COPY_PERF_THRESHOLD	(64 * 1024) /* copied from Vulkan */
-#define CP_DMA_CLEAR_PERF_THRESHOLD	(32 * 1024) /* guess (clear is much slower) */
-
 /* Set this if you want the ME to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
 #define CP_DMA_SYNC		(1 << 0)
@@ -155,35 +149,6 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx)
 	si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
 }

-static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher,
-				enum si_cache_policy cache_policy)
-{
-	switch (coher) {
-	default:
-	case SI_COHERENCY_NONE:
-		return 0;
-	case SI_COHERENCY_SHADER:
-		assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
-		return SI_CONTEXT_INV_SMEM_L1 |
-		       SI_CONTEXT_INV_VMEM_L1 |
-		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
-	case SI_COHERENCY_CB_META:
-		assert(sctx->chip_class >= GFX9 ? cache_policy != L2_BYPASS :
-						  cache_policy == L2_BYPASS);
-		return SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
-}
-
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
-					     enum si_coherency coher)
-{
-	if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) ||
-	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
-		return L2_LRU;
-
-	return L2_BYPASS;
-}
-
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
 			      struct pipe_resource *src, unsigned byte_count,
 			      uint64_t remaining_size, unsigned user_flags,
@@ -262,7 +227,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	/* Flush the caches. */
 	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 		       SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       get_flush_flags(sctx, coher, cache_policy);
+		       si_get_flush_flags(sctx, coher, cache_policy);

 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
@@ -286,122 +251,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	sctx->num_cp_dma_calls++;
 }

-/* dst == NULL means GDS. */
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher)
-{
-	struct radeon_winsys *ws = sctx->ws;
-	struct r600_resource *rdst = r600_resource(dst);
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-	uint64_t dma_clear_size;
-
-	if (!size)
-		return;
-
-	dma_clear_size = size & ~3ull;
-
-	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
-	 * doesn't happen. We don't want an infinite recursion: */
-	if (sctx->dma_cs &&
-	    !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
-	    (offset % 4 == 0) &&
-	    /* CP DMA is very slow. Always use SDMA for big clears. This
-	     * alone improves DeusEx:MD performance by 70%. */
-	    (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
-	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
-	      * This happens to move most buffer clears to SDMA, including
-	      * DCC and CMASK clears, because pipe->clear clears them before
-	      * si_emit_framebuffer_state (in a draw call) adds them.
-	      * For example, DeusEx:MD has 21 buffer clears per frame and all
-	      * of them are moved to SDMA thanks to this. */
-	     !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
-					  RADEON_USAGE_READWRITE))) {
-		si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	} else if (dma_clear_size >= 4) {
-		si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
-				       coher, cache_policy);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	}
-
-	if (size) {
-		/* Handle non-dword alignment.
-		 *
-		 * This function is called for embedded texture metadata clears,
-		 * but those should always be properly aligned. */
-		assert(dst);
-		assert(dst->target == PIPE_BUFFER);
-		assert(size < 4);
-
-		pipe_buffer_write(&sctx->b, dst, offset, size, &value);
-	}
-}
-
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
-				 struct pipe_resource *dst,
-				 unsigned offset, unsigned size,
-				 const void *clear_value_ptr,
-				 int clear_value_size)
-{
-	struct si_context *sctx = (struct si_context*)ctx;
-	uint32_t dword_value;
-	unsigned i;
-
-	assert(offset % clear_value_size == 0);
-	assert(size % clear_value_size == 0);
-
-	if (clear_value_size > 4) {
-		const uint32_t *u32 = clear_value_ptr;
-		bool clear_dword_duplicated = true;
-
-		/* See if we can lower large fills to dword fills. */
-		for (i = 1; i < clear_value_size / 4; i++)
-			if (u32[0] != u32[i]) {
-				clear_dword_duplicated = false;
-				break;
-			}
-
-		if (!clear_dword_duplicated) {
-			/* Use transform feedback for 64-bit, 96-bit, and
-			 * 128-bit fills.
-			 */
-			union pipe_color_union clear_value;
-
-			memcpy(&clear_value, clear_value_ptr, clear_value_size);
-			si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
-			util_blitter_clear_buffer(sctx->blitter, dst, offset,
-						  size, clear_value_size / 4,
-						  &clear_value);
-			si_blitter_end(sctx);
-			return;
-		}
-	}
-
-	/* Expand the clear value to a dword. */
-	switch (clear_value_size) {
-	case 1:
-		dword_value = *(uint8_t*)clear_value_ptr;
-		dword_value |= (dword_value << 8) |
-			       (dword_value << 16) |
-			       (dword_value << 24);
-		break;
-	case 2:
-		dword_value = *(uint16_t*)clear_value_ptr;
-		dword_value |= dword_value << 16;
-		break;
-	default:
-		dword_value = *(uint32_t*)clear_value_ptr;
-	}
-
-	si_clear_buffer(sctx, dst, offset, size, dword_value,
-			SI_COHERENCY_SHADER);
-}
-
 /**
  * Realign the CP DMA engine. This must be done after a copy with an unaligned
  * size.
@@ -509,7 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 	if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			       SI_CONTEXT_CS_PARTIAL_FLUSH |
-			       get_flush_flags(sctx, coher, cache_policy);
+			       si_get_flush_flags(sctx, coher, cache_policy);
 	}

 	/* This is the main part doing the copying. Src is always aligned. */
@@ -549,26 +398,12 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 		si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
 					 cache_policy, &is_first);
 	}
-}
-
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
-{
-	enum si_coherency coher = SI_COHERENCY_SHADER;
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-
-	if (!size)
-		return;
-
-	si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
-			      0, coher, cache_policy);

-	if (cache_policy != L2_BYPASS)
+	if (dst && cache_policy != L2_BYPASS)
 		r600_resource(dst)->TC_L2_dirty = true;

-	/* If it's not a prefetch... */
-	if (dst_offset != src_offset)
+	/* If it's not a prefetch or GDS copy... */
+	if (dst && src && (dst != src || dst_offset != src_offset))
 		sctx->num_cp_dma_calls++;
 }
@@ -744,8 +579,3 @@ void si_test_gds(struct si_context *sctx)
 	pipe_resource_reference(&dst, NULL);
 	exit(0);
 }
-
-void si_init_cp_dma_functions(struct si_context *sctx)
-{
-	sctx->b.clear_buffer = si_pipe_clear_buffer;
-}
src/gallium/drivers/radeonsi/si_pipe.c

@@ -195,6 +195,10 @@ static void si_destroy_context(struct pipe_context *context)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
 	if (sctx->vs_blit_texcoord)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
+	if (sctx->cs_clear_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+	if (sctx->cs_copy_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);

 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);
@@ -416,7 +420,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,

 	sctx->allocator_zeroed_memory =
 			u_suballocator_create(&sctx->b, sscreen->info.gart_page_size,
-					      0, PIPE_USAGE_DEFAULT, 0, true);
+					      0, PIPE_USAGE_DEFAULT,
+					      SI_RESOURCE_FLAG_SO_FILLED_SIZE, true);
 	if (!sctx->allocator_zeroed_memory)
 		goto fail;
@@ -453,7 +458,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
-	si_init_cp_dma_functions(sctx);
+	si_init_compute_blit_functions(sctx);
 	si_init_debug_functions(sctx);
 	si_init_msaa_functions(sctx);
 	si_init_streamout_functions(sctx);
@@ -503,6 +508,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	if (sscreen->debug_flags & DBG(FORCE_DMA))
 		sctx->b.resource_copy_region = sctx->dma_copy;

+	bool dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
+	sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
+					     SI_COMPUTE_CLEAR_DW_PER_THREAD,
+					     dst_stream_policy, false);
+	sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
+					     SI_COMPUTE_COPY_DW_PER_THREAD,
+					     dst_stream_policy, true);
+
 	sctx->blitter = util_blitter_create(&sctx->b);
 	if (sctx->blitter == NULL)
 		goto fail;
@@ -561,9 +574,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 					 &sctx->null_const_buf);

 		/* Clear the NULL constant buffer, because loads should return zeros. */
+		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
-				sctx->null_const_buf.buffer->width0, 0,
-				SI_COHERENCY_SHADER);
+				sctx->null_const_buf.buffer->width0,
+				&clear_value, 4, SI_COHERENCY_SHADER);
 	}

 	uint64_t max_threads_per_block;
src/gallium/drivers/radeonsi/si_pipe.h

@@ -52,6 +52,11 @@
 /* Alignment for optimal CP DMA performance. */
 #define SI_CPDMA_ALIGNMENT	32

+/* Tunables for compute-based clear_buffer and copy_buffer: */
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
+#define SI_COMPUTE_COPY_DW_PER_THREAD	4
+#define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
+
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
@@ -102,6 +107,7 @@
 #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_SO_FILLED_SIZE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)

 /* Debug flags. */
 enum {
@@ -172,6 +178,19 @@ enum {
 #define DBG_ALL_SHADERS		(((1 << (DBG_CS + 1)) - 1))
 #define DBG(name)		(1ull << DBG_##name)

+enum si_cache_policy {
+	L2_BYPASS,
+	L2_STREAM, /* same as SLC=1 */
+	L2_LRU,    /* same as SLC=0 */
+};
+
+enum si_coherency {
+	SI_COHERENCY_NONE, /* no cache flushes needed */
+	SI_COHERENCY_SHADER,
+	SI_COHERENCY_CB_META,
+	SI_COHERENCY_CP,
+};
+
 struct si_compute;
 struct hash_table;
 struct u_suballocator;
@@ -773,6 +792,8 @@ struct si_context {
 	void *vs_blit_color;
 	void *vs_blit_color_layered;
 	void *vs_blit_texcoord;
+	void *cs_clear_buffer;
+	void *cs_copy_buffer;
 	struct si_screen *screen;
 	struct pipe_debug_callback debug;
 	struct ac_llvm_compiler compiler; /* only non-threaded compilation */
@@ -1110,6 +1131,17 @@ void vi_dcc_clear_level(struct si_context *sctx,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);

+/* si_compute_blit.c */
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher);
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_init_compute_blit_functions(struct si_context *sctx);
+
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
@@ -1122,39 +1154,20 @@ void si_init_clear_functions(struct si_context *sctx);
 			      SI_CPDMA_SKIP_GFX_SYNC | \
 			      SI_CPDMA_SKIP_BO_LIST_UPDATE)

-enum si_cache_policy {
-	L2_BYPASS,
-	L2_STREAM, /* same as SLC=1 */
-	L2_LRU,    /* same as SLC=0 */
-};
-
-enum si_coherency {
-	SI_COHERENCY_NONE, /* no cache flushes needed */
-	SI_COHERENCY_SHADER,
-	SI_COHERENCY_CB_META,
-};
-
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value,
 			    enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher);
 void si_cp_dma_copy_buffer(struct si_context *sctx,
 			   struct pipe_resource *dst, struct pipe_resource *src,
 			   uint64_t dst_offset, uint64_t src_offset, unsigned size,
 			   unsigned user_flags, enum si_coherency coher,
 			   enum si_cache_policy cache_policy);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_test_gds(struct si_context *sctx);
-void si_init_cp_dma_functions(struct si_context *sctx);

 /* si_debug.c */
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
src/gallium/drivers/radeonsi/si_test_dma.c

@@ -307,7 +307,8 @@ void si_test_dma(struct si_screen *sscreen)
 		set_random_pixels(ctx, src, &src_cpu);

 		/* clear dst pixels */
-		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0,
+		uint32_t zero = 0;
+		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
 				SI_COHERENCY_SHADER);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);