radeonsi: remove splitting of IBs that use too much memory

It was needed for r300, not so much for GCN/RDNA.
This reduces draw overhead.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
This commit is contained in:
Marek Olšák 2023-07-16 05:59:58 -04:00 committed by Marge Bot
parent a59d387bc2
commit 7d67e10b02
12 changed files with 29 additions and 129 deletions

View file

@ -130,9 +130,6 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
res->flags |= RADEON_FLAG_DISCARDABLE;
}
/* Set expected VRAM and GART usage for the buffer. */
res->memory_usage_kb = MAX2(1, size / 1024);
if (res->domains & RADEON_DOMAIN_VRAM) {
/* We don't want to evict buffers from VRAM by mapping them for CPU access,
* because they might never be moved back again. If a buffer is large enough,
@ -272,7 +269,6 @@ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *d
sdst->b.b.bind = ssrc->b.b.bind;
sdst->flags = ssrc->flags;
assert(sdst->memory_usage_kb == ssrc->memory_usage_kb);
assert(sdst->bo_size == ssrc->bo_size);
assert(sdst->bo_alignment_log2 == ssrc->bo_alignment_log2);
assert(sdst->domains == ssrc->domains);
@ -633,7 +629,6 @@ static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *scre
}
buf->gpu_address = ws->buffer_get_virtual_address(buf->buf);
buf->memory_usage_kb = templ->width0 / 1024;
return &buf->b.b;
}

View file

@ -972,13 +972,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
gfx11_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
}
/* Add buffer sizes for memory checking in need_cs_space. */
si_context_add_resource_size(sctx, &program->shader.bo->b.b);
/* TODO: add the scratch buffer */
if (info->indirect) {
si_context_add_resource_size(sctx, info->indirect);
/* Indirect buffers use TC L2 on GFX9, but not older hw. */
if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
sctx->flags |= SI_CONTEXT_WB_L2;

View file

@ -129,12 +129,6 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
uint64_t remaining_size, unsigned user_flags, enum si_coherency coher,
bool *is_first, unsigned *packet_flags)
{
/* Count memory usage in so that need_cs_space can take it into account. */
if (dst)
si_context_add_resource_size(sctx, dst);
if (src)
si_context_add_resource_size(sctx, src);
if (!(user_flags & SI_OP_CPDMA_SKIP_CHECK_CS_SPACE))
si_need_gfx_cs_space(sctx, 0);

View file

@ -211,7 +211,7 @@ static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_reso
tex = tex->flushed_depth_texture;
priority = si_get_sampler_view_priority(&tex->buffer);
radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage | priority, check_mem);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, usage | priority);
}
static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers)
@ -1251,8 +1251,8 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res
buffers->buffers[slot] = buffer;
buffers->offsets[slot] = buffer_offset;
radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer),
RADEON_USAGE_READ | buffers->priority_constbuf, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
RADEON_USAGE_READ | buffers->priority_constbuf);
buffers->enabled_mask |= 1llu << slot;
} else {
/* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */
@ -1396,8 +1396,8 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou
pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
buffers->offsets[slot] = sbuffer->buffer_offset;
radeon_add_to_gfx_buffer_list_check_mem(
sctx, buf, (writable ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ) | priority, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, buf,
(writable ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ) | priority);
if (writable)
buffers->writable_mask |= 1llu << slot;
else
@ -1673,10 +1673,9 @@ static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_
si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
sctx->descriptors_dirty |= 1u << descriptors_idx;
radeon_add_to_gfx_buffer_list_check_mem(
sctx, si_resource(buffer),
(buffers->writable_mask & (1llu << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ) |
priority, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
(buffers->writable_mask & (1llu << i) ?
RADEON_USAGE_READWRITE : RADEON_USAGE_READ) | priority);
noop = false;
}
}
@ -1709,9 +1708,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
for (unsigned i = 0; i < ARRAY_SIZE(sctx->vertex_buffer); i++) {
struct si_resource *buf = si_resource(sctx->vertex_buffer[i].buffer.resource);
if (buf) {
radeon_add_to_gfx_buffer_list_check_mem(sctx, buf,
RADEON_USAGE_READ |
RADEON_PRIO_VERTEX_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, buf,
RADEON_USAGE_READ |
RADEON_PRIO_VERTEX_BUFFER);
}
}
} else if (buffer->bind_history & SI_BIND_VERTEX_BUFFER) {
@ -1725,9 +1724,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
if (sctx->vertex_buffer[vb].buffer.resource == buf) {
sctx->vertex_buffers_dirty = num_elems > 0;
radeon_add_to_gfx_buffer_list_check_mem(sctx, buffer,
RADEON_USAGE_READ |
RADEON_PRIO_VERTEX_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, buffer,
RADEON_USAGE_READ |
RADEON_PRIO_VERTEX_BUFFER);
break;
}
}
@ -1746,8 +1745,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
sctx->descriptors_dirty |= 1u << SI_DESCS_INTERNAL;
radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_WRITE |
RADEON_PRIO_SHADER_RW_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_WRITE |
RADEON_PRIO_SHADER_RW_BUFFER);
/* Update the streamout state. */
if (sctx->streamout.begin_emitted)
@ -1803,8 +1802,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
descs->list + desc_slot * 16 + 4);
sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ |
RADEON_PRIO_SAMPLER_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_READ |
RADEON_PRIO_SAMPLER_BUFFER);
}
}
}
@ -1833,9 +1832,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
descs->list + desc_slot * 8 + 4);
sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer),
RADEON_USAGE_READWRITE |
RADEON_PRIO_SAMPLER_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
RADEON_USAGE_READWRITE |
RADEON_PRIO_SAMPLER_BUFFER);
if (shader == PIPE_SHADER_COMPUTE)
sctx->compute_image_sgprs_dirty = true;
@ -1860,8 +1859,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
(*tex_handle)->desc_dirty = true;
sctx->bindless_descriptors_dirty = true;
radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ |
RADEON_PRIO_SAMPLER_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_READ |
RADEON_PRIO_SAMPLER_BUFFER);
}
}
}
@ -1885,8 +1884,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
(*img_handle)->desc_dirty = true;
sctx->bindless_descriptors_dirty = true;
radeon_add_to_gfx_buffer_list_check_mem(
sctx, si_resource(buffer), RADEON_USAGE_READWRITE | RADEON_PRIO_SAMPLER_BUFFER, true);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(buffer),
RADEON_USAGE_READWRITE | RADEON_PRIO_SAMPLER_BUFFER);
}
}
}

View file

@ -550,10 +550,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
ctx->num_buffered_gfx_sh_regs = 0;
ctx->num_buffered_compute_sh_regs = 0;
if (ctx->scratch_buffer) {
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
if (ctx->scratch_buffer)
si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
}
if (ctx->streamout.suspended) {
ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;

View file

@ -1311,8 +1311,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
si_init_perfcounters(sscreen);
sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3;
ac_get_hs_info(&sscreen->info, &sscreen->hs);
sscreen->has_draw_indirect_multi =

View file

@ -311,8 +311,6 @@ struct si_resource {
/* Winsys objects. */
struct pb_buffer *buf;
uint64_t gpu_address;
/* Memory usage if the buffer placement is optimal. */
uint32_t memory_usage_kb;
/* Resource properties. */
uint64_t bo_size;
@ -547,7 +545,6 @@ struct si_screen {
unsigned width, unsigned height, unsigned depth,
bool get_bo_metadata, uint32_t *state, uint32_t *fmask_state);
unsigned max_memory_usage_kb;
unsigned pa_sc_raster_config;
unsigned pa_sc_raster_config_1;
unsigned se_tile_repeat;
@ -1030,8 +1027,6 @@ struct si_context {
unsigned last_compressed_colortex_counter;
unsigned last_num_draw_calls;
unsigned flags; /* flush flags */
/* Current unaccounted memory usage. */
uint32_t memory_usage_kb;
/* Atoms (direct states). */
union si_state_atoms atoms;
@ -1760,14 +1755,6 @@ static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx,
return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 10;
}
static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
{
if (r) {
/* Add memory usage for need_gfx_cs_space */
sctx->memory_usage_kb += si_resource(r)->memory_usage_kb;
}
}
static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
{
return 1 << (atom - sctx->atoms.array);
@ -1982,35 +1969,12 @@ static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim)
return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0;
}
/**
* Return true if there is enough memory in VRAM and GTT for the buffers
* added so far.
*
* \param vram VRAM memory size not added to the buffer list yet
* \param gtt GTT memory size not added to the buffer list yet
*/
static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs,
uint32_t kb)
{
return kb + cs->used_vram_kb + cs->used_gart_kb < screen->max_memory_usage_kb;
}
static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws)
{
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
/* There are two memory usage counters in the winsys for all buffers
* that have been added (cs_add_buffer) and one counter in the pipe
* driver for those that haven't been added yet.
*/
uint32_t kb = ctx->memory_usage_kb;
ctx->memory_usage_kb = 0;
if (radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, kb) &&
ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws)))
return;
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
if (!ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws)))
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
/**
@ -2031,33 +1995,6 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct rad
bo->domains);
}
/**
* Same as above, but also checks memory usage and flushes the context
* accordingly.
*
* When this SHOULD NOT be used:
*
* - if si_context_add_resource_size has been called for the buffer
* followed by *_need_cs_space for checking the memory usage
*
* - when emitting state packets and draw packets (because preceding packets
* can't be re-emitted at that point)
*
* - if shader resource "enabled_mask" is not up-to-date or there is
* a different constraint disallowing a context flush
*/
static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
struct si_resource *bo,
unsigned usage,
bool check_mem)
{
if (check_mem &&
!radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->memory_usage_kb + bo->memory_usage_kb))
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage);
}
static inline void si_select_draw_vbo(struct si_context *sctx)
{
pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso]

View file

@ -2993,8 +2993,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
sctx->framebuffer.has_dcc_msaa = true;
}
si_context_add_resource_size(sctx, surf->base.texture);
p_atomic_inc(&tex->framebuffers_bound);
/* Update the minimum but don't keep 0. */
@ -3016,8 +3014,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS))
sctx->framebuffer.DB_has_shader_readable_metadata = true;
si_context_add_resource_size(sctx, surf->base.texture);
/* Update the minimum but don't keep 0. */
if (!sctx->framebuffer.min_bytes_per_pixel ||
zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)

View file

@ -2085,10 +2085,6 @@ static void si_draw(struct pipe_context *ctx,
unsigned total_direct_count = 0;
if (!IS_DRAW_VERTEX_STATE && indirect) {
/* Add the buffer size for memory checking in need_cs_space. */
if (indirect->buffer)
si_context_add_resource_size(sctx, indirect->buffer);
/* Indirect buffers use TC L2 on GFX9, but not older hw. */
if (GFX_VERSION <= GFX8) {
if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
@ -2207,8 +2203,7 @@ static void si_draw(struct pipe_context *ctx,
}
}
/* Since we've called si_context_add_resource_size for vertex buffers,
* this must be called after si_need_cs_space, because we must let
/* This must be called after si_need_cs_space, because we must let
* need_cs_space flush before we add buffers to the buffer list.
*
* This must be done after si_update_shaders because si_update_shaders can

View file

@ -4044,8 +4044,6 @@ bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
sctx->screen->info.pte_fragment_size);
if (!sctx->scratch_buffer)
return false;
si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b);
}
if (sctx->gfx_level < GFX11 && !si_update_scratch_relocs(sctx))

View file

@ -135,7 +135,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
if (!targets[i])
continue;
si_context_add_resource_size(sctx, targets[i]->buffer);
enabled_mask |= 1 << i;
if (offsets[i] == ((unsigned)-1))

View file

@ -504,7 +504,6 @@ static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_tex
tex->buffer.b.b.bind = templ.bind;
radeon_bo_reference(sctx->screen->ws, &tex->buffer.buf, new_tex->buffer.buf);
tex->buffer.gpu_address = new_tex->buffer.gpu_address;
tex->buffer.memory_usage_kb = new_tex->buffer.memory_usage_kb;
tex->buffer.bo_size = new_tex->buffer.bo_size;
tex->buffer.bo_alignment_log2 = new_tex->buffer.bo_alignment_log2;
tex->buffer.domains = new_tex->buffer.domains;
@ -988,7 +987,6 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen,
resource->bo_alignment_log2 = plane0->buffer.bo_alignment_log2;
resource->flags = plane0->buffer.flags;
resource->domains = plane0->buffer.domains;
resource->memory_usage_kb = plane0->buffer.memory_usage_kb;
radeon_bo_reference(sscreen->ws, &resource->buf, plane0->buffer.buf);
resource->gpu_address = plane0->buffer.gpu_address;
@ -1009,7 +1007,6 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen,
resource->bo_size = imported_buf->size;
resource->bo_alignment_log2 = imported_buf->alignment_log2;
resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
resource->memory_usage_kb = MAX2(1, resource->bo_size / 1024);
if (sscreen->ws->buffer_get_flags)
resource->flags = sscreen->ws->buffer_get_flags(resource->buf);
}