radeonsi: remove temporary si_context::vb_descriptor_user_sgprs

We were writing descriptors into si_context and then copying them into
the command buffer. Just write them into the command buffer directly.
Also set the pointer to VBO descriptors right after them.

When we start a new command buffer or finish blitting, we no longer
restore the precomputed VBO descriptors. Instead, we just reupload them.
It's a compromise that keeps the common path simpler and (maybe) faster.

This removes a lot of stuff. Now the VBO descriptor upload path looks
very similar to the display list path.

There was an accidental hidden optimization: u_upload_alloc skips buffer
reference counting when the upload buffer doesn't change. It's now documented
at "last_const_upload_buffer".

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17933>
Author: Marek Olšák
Date:   2022-08-06 09:55:51 -04:00 (committed by Marge Bot)
Parent: a5d37e161d
Commit: 0e574c801c

7 changed files with 98 additions and 133 deletions
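
For context, here is a minimal sketch of the u_upload_alloc behavior that
"last_const_upload_buffer" preserves. The uploader, sizes, and local names are
illustrative, not driver code:

/* u_upload_alloc() unreferences *outbuf and takes a new reference only when
 * the buffer it returns differs from what *outbuf already holds. */

/* A fresh NULL pointer pays a reference/unreference pair on every call: */
struct pipe_resource *buf = NULL;  /* hypothetical local */
unsigned offset;
void *map;
u_upload_alloc(uploader, 0, size, alignment, &offset, &buf, &map);
pipe_resource_reference(&buf, NULL);

/* A pointer that persists across calls (like the new
 * sctx->last_const_upload_buffer) usually still holds the current upload
 * buffer, so the refcount is left untouched on the common path: */
u_upload_alloc(uploader, 0, size, alignment, &offset,
               (struct pipe_resource **)&sctx->last_const_upload_buffer, &map);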

--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -110,12 +110,7 @@ void si_blitter_end(struct si_context *sctx)
    if (sctx->screen->use_ngg_culling)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
 
-   unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
-   sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
-                                       sctx->num_vertex_elements >
-                                       num_vbos_in_user_sgprs;
-   sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
-                                          num_vbos_in_user_sgprs;
+   sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
 
    si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }

--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -73,6 +73,14 @@
    __cs_num += __n; \
 } while (0)
 
+/* Instead of writing into the command buffer, return the pointer to the command buffer and
+ * assume that the caller will fill the specified number of elements.
+ */
+#define radeon_emit_array_get_ptr(num, ptr) do { \
+   *(ptr) = __cs_buf + __cs_num; \
+   __cs_num += (num); \
+} while (0)
+
 #define radeon_set_config_reg_seq(reg, num) do { \
    SI_CHECK_SHADOWED_REGS(reg, num); \
    assert((reg) < SI_CONTEXT_REG_OFFSET); \
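
For reference, a sketch of how the new macro is used later in this commit
(inside si_upload_and_prefetch_VB_descriptors): between radeon_begin() and
radeon_end(), the caller reserves dwords in the CS and builds each descriptor
in place instead of going through a temporary. The surrounding declarations
(sctx, sh_base, velems, vb, i) are assumed from that context:

   radeon_begin(&sctx->gfx_cs);
   radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, 4);

   uint32_t *desc;
   radeon_emit_array_get_ptr(4, &desc);                    /* reserve 4 dwords */
   si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc); /* fill them in place */
   radeon_end();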

--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1160,32 +1160,6 @@ static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers,
    }
 }
 
-/* VERTEX BUFFERS */
-
-static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
-{
-   int count = sctx->num_vertex_elements;
-   int i;
-
-   for (i = 0; i < count; i++) {
-      int vb = sctx->vertex_elements->vertex_buffer_index[i];
-
-      if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
-         continue;
-      if (!sctx->vertex_buffer[vb].buffer.resource)
-         continue;
-
-      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs,
-                                si_resource(sctx->vertex_buffer[vb].buffer.resource),
-                                RADEON_USAGE_READ | RADEON_PRIO_VERTEX_BUFFER);
-   }
-
-   if (!sctx->vb_descriptors_buffer)
-      return;
-
-   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
-                             RADEON_USAGE_READ | RADEON_PRIO_DESCRIPTORS);
-}
-
 /* CONSTANT BUFFERS */
 
 static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx,
@@ -2059,29 +2033,16 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad
    sctx->shader_pointers_dirty |=
       u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS);
 
-   if (shader == PIPE_SHADER_VERTEX) {
-      unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
-
-      sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
-                                          sctx->num_vertex_elements >
-                                          num_vbos_in_user_sgprs;
-      sctx->vertex_buffer_user_sgprs_dirty =
-         sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
-   }
+   if (shader == PIPE_SHADER_VERTEX)
+      sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
 
    si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
 
 void si_shader_pointers_mark_dirty(struct si_context *sctx)
 {
-   unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
    sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
-   sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
-                                       sctx->num_vertex_elements >
-                                       num_vbos_in_user_sgprs;
-   sctx->vertex_buffer_user_sgprs_dirty =
-      sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
    sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
    si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
    sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
    sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
@@ -2884,8 +2845,6 @@ void si_release_all_descriptors(struct si_context *sctx)
    for (i = 0; i < SI_NUM_DESCS; ++i)
       si_release_descriptors(&sctx->descriptors[i]);
 
-   si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
-
    si_release_bindless_descriptors(sctx);
 }
@@ -2963,7 +2922,6 @@ void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx)
       si_image_views_begin_new_cs(sctx, &sctx->images[i]);
    }
    si_buffer_resources_begin_new_cs(sctx, &sctx->internal_bindings);
-   si_vertex_buffers_begin_new_cs(sctx);
 
    for (unsigned i = 0; i < ARRAY_SIZE(sctx->vertex_buffer); i++) {
       struct si_resource *buf = si_resource(sctx->vertex_buffer[i].buffer.resource);

--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -221,6 +221,7 @@ static void si_destroy_context(struct pipe_context *context)
    si_resource_reference(&sctx->wait_mem_scratch_tmz, NULL);
    si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
    si_resource_reference(&sctx->pipeline_stats_query_buf, NULL);
+   si_resource_reference(&sctx->last_const_upload_buffer, NULL);
 
    if (sctx->cs_preamble_state)
       si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0);

--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1111,13 +1111,14 @@ struct si_context {
    /* Vertex buffers. */
    bool vertex_buffers_dirty;
-   bool vertex_buffer_pointer_dirty;
-   bool vertex_buffer_user_sgprs_dirty;
-   struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
    uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
-   struct si_resource *vb_descriptors_buffer;
-   unsigned vb_descriptors_offset;
-   unsigned vb_descriptor_user_sgprs[5 * 4];
+   struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
+
+   /* Even though we don't need this variable, u_upload_alloc has an optimization that skips
+    * reference counting when the new upload buffer is the same as the last one. So keep
+    * the last upload buffer here and always pass &last_const_upload_buffer to u_upload_alloc.
+    */
+   struct si_resource *last_const_upload_buffer;
 
    /* MSAA config state. */
    int ps_iter_samples;

--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -5089,14 +5089,7 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
    sctx->vertex_elements = v;
    sctx->num_vertex_elements = v->count;
 
-   if (sctx->num_vertex_elements) {
-      sctx->vertex_buffers_dirty = true;
-   } else {
-      sctx->vertex_buffers_dirty = false;
-      sctx->vertex_buffer_pointer_dirty = false;
-      sctx->vertex_buffer_user_sgprs_dirty = false;
-   }
+   sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
 
    if (old->instance_divisor_is_one != v->instance_divisor_is_one ||
        old->instance_divisor_is_fetched != v->instance_divisor_is_fetched ||

--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1885,6 +1885,20 @@ static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_stat
    return util_bitcount_fast<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
 }
 
+template<amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
+static unsigned get_vb_descriptor_sgpr_ptr_offset(void)
+{
+   /* Find the location of the VB descriptor pointer. */
+   unsigned dw_offset = SI_VS_NUM_USER_SGPR;
+   if (GFX_VERSION >= GFX9) {
+      if (HAS_TESS)
+         dw_offset = GFX9_TCS_NUM_USER_SGPR;
+      else if (HAS_GS || NGG)
+         dw_offset = GFX9_GS_NUM_USER_SGPR;
+   }
+   return dw_offset * 4;
+}
+
 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
 static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
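
As a concrete instantiation of the helper above (illustrative; the enum values
TESS_ON, GS_OFF and NGG_OFF follow the draw-template conventions in this file):
on GFX9 with tessellation, the VB descriptor pointer lives right after the TCS
user SGPRs, so the register offset resolves to sh_base + GFX9_TCS_NUM_USER_SGPR * 4.

   unsigned vb_desc_ptr_offset =
      sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX9, TESS_ON, GS_OFF, NGG_OFF>();
   radeon_set_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
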
@@ -1897,7 +1911,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
    unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
                                             PIPE_SHADER_VERTEX);
    unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION);
-   bool pointer_dirty, user_sgprs_dirty;
 
    assert(count <= SI_MAX_ATTRIBS);
@@ -1908,37 +1921,39 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
       unsigned alloc_size = IS_DRAW_VERTEX_STATE ?
                                vstate->velems.vb_desc_list_alloc_size :
                                velems->vb_desc_list_alloc_size;
+      uint64_t vb_descriptors_address = 0;
       uint32_t *ptr;
 
       if (alloc_size) {
+         unsigned offset;
+
          /* Vertex buffer descriptors are the only ones which are uploaded directly
          * and don't go through si_upload_graphics_shader_descriptors.
          */
          u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
-                        si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
-                        (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
-         if (!sctx->vb_descriptors_buffer) {
-            sctx->vb_descriptors_offset = 0;
+                        si_optimal_tcc_alignment(sctx, alloc_size), &offset,
+                        (struct pipe_resource **)&sctx->last_const_upload_buffer, (void **)&ptr);
+         if (!sctx->last_const_upload_buffer)
             return false;
-         }
 
-         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
+         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->last_const_upload_buffer,
                                    RADEON_USAGE_READ | RADEON_PRIO_DESCRIPTORS);
+         vb_descriptors_address = sctx->last_const_upload_buffer->gpu_address + offset;
 
          /* GFX6 doesn't support the L2 prefetch. */
         if (GFX_VERSION >= GFX7)
-            si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+            si_cp_dma_prefetch(sctx, &sctx->last_const_upload_buffer->b.b, offset,
                                alloc_size);
-      } else {
-         si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
       }
 
       if (IS_DRAW_VERTEX_STATE) {
          unsigned i = 0;
 
+         radeon_begin(&sctx->gfx_cs);
+
          if (num_vbos_in_user_sgprs) {
             unsigned num_vb_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;
 
-            radeon_begin(&sctx->gfx_cs);
             radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_vb_sgprs);
 
             for (; partial_velem_mask && i < num_vbos_in_user_sgprs; i++) {
@@ -1946,15 +1961,24 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
 
                radeon_emit_array(&vstate->descriptors[velem_index * 4], 4);
             }
-            radeon_end();
          }
 
-         for (; partial_velem_mask; i++) {
-            unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
-            uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
+         if (partial_velem_mask) {
+            assert(alloc_size);
 
-            memcpy(desc, &vstate->descriptors[velem_index * 4], 16);
+            unsigned vb_desc_offset =
+               sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX_VERSION, HAS_TESS, HAS_GS, NGG>();
+            radeon_set_sh_reg(vb_desc_offset, vb_descriptors_address);
+
+            for (; partial_velem_mask; i++) {
+               unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
+               uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
+
+               memcpy(desc, &vstate->descriptors[velem_index * 4], 16);
+            }
          }
+         radeon_end();
 
          if (vstate->b.input.vbuffer.buffer.resource != vstate->b.input.indexbuf) {
             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs,
@@ -1964,62 +1988,48 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
          /* The next draw_vbo should recompute and rebind vertex buffer descriptors. */
          sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
-
-         user_sgprs_dirty = false; /* We just set them above. */
-         pointer_dirty = count > num_vbos_in_user_sgprs;
       } else {
-         for (unsigned i = 0; i < count; i++) {
-            unsigned vbo_index = velems->vertex_buffer_index[i];
-            struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
-            uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
-                                                        : &ptr[(i - num_vbos_in_user_sgprs) * 4];
+         unsigned count_in_user_sgprs = MIN2(count, num_vbos_in_user_sgprs);
+         unsigned i = 0;
 
-            si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc);
-         }
+         if (count_in_user_sgprs) {
+            radeon_begin(&sctx->gfx_cs);
+            radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4,
+                                  count_in_user_sgprs * 4);
+
+            /* the first iteration always executes */
+            do {
+               unsigned vbo_index = velems->vertex_buffer_index[i];
+               struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
+               uint32_t *desc;
+
+               radeon_emit_array_get_ptr(4, &desc);
+               si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc);
+            } while (++i < count_in_user_sgprs);
+            radeon_end();
+         }
+
+         if (alloc_size) {
+            /* the first iteration always executes */
+            do {
+               unsigned vbo_index = velems->vertex_buffer_index[i];
+               struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
+               uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
+
+               si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc);
+            } while (++i < count);
+
+            unsigned vb_desc_ptr_offset =
+               sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX_VERSION, HAS_TESS, HAS_GS, NGG>();
+            radeon_begin(&sctx->gfx_cs);
+            radeon_set_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
+            radeon_end();
+         }
 
          sctx->vertex_buffers_dirty = false;
-         user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
-         pointer_dirty = alloc_size != 0;
       }
-   } else {
-      pointer_dirty = sctx->vertex_buffer_pointer_dirty;
-      user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty;
    }
 
-   if (pointer_dirty || user_sgprs_dirty) {
-      struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
-      assert(count);
-
-      radeon_begin(cs);
-
-      /* Set the pointer to vertex buffer descriptors. */
-      if (pointer_dirty && count > num_vbos_in_user_sgprs) {
-         /* Find the location of the VB descriptor pointer. */
-         unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
-         if (GFX_VERSION >= GFX9) {
-            if (HAS_TESS)
-               sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
-            else if (HAS_GS || NGG)
-               sh_dw_offset = GFX9_GS_NUM_USER_SGPR;
-         }
-
-         radeon_set_sh_reg(sh_base + sh_dw_offset * 4,
-                           sctx->vb_descriptors_buffer->gpu_address +
-                           sctx->vb_descriptors_offset);
-         sctx->vertex_buffer_pointer_dirty = false;
-      }
-
-      /* Set VB descriptors in user SGPRs. */
-      if (user_sgprs_dirty) {
-         assert(num_vbos_in_user_sgprs);
-
-         unsigned num_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;
-
-         radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_sgprs);
-         radeon_emit_array(sctx->vb_descriptor_user_sgprs, num_sgprs);
-         sctx->vertex_buffer_user_sgprs_dirty = false;
-      }
-      radeon_end();
-   }
-
    return true;
@@ -2678,8 +2688,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    /* Don't set per-stage shader pointers for VS. */
    sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
-   sctx->vertex_buffer_pointer_dirty = false;
-   sctx->vertex_buffer_user_sgprs_dirty = false;
+   sctx->vertex_buffers_dirty = false;
 
    pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
 }