mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-01 03:48:06 +02:00
radeonsi: remove temporary si_context::vb_descriptor_user_sgprs
We were writing descriptors into si_context and then copying them into the command buffer. Just write them into the command buffer directly. Also set the pointer to VBO descriptors right after them. When we start a new command buffer or we finish blitting, we no longer restore precomputed VBO descriptors. Instead, we simply re-upload them. It's a compromise to have the common path simpler and (maybe) faster. This removes a lot of stuff. Now the VBO descriptor upload path looks very similar to the display list path. There was an accidental hidden optimization that is now documented as "last_const_upload_buffer". Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17933>
This commit is contained in:
parent
a5d37e161d
commit
0e574c801c
7 changed files with 98 additions and 133 deletions
|
|
@ -110,12 +110,7 @@ void si_blitter_end(struct si_context *sctx)
|
|||
if (sctx->screen->use_ngg_culling)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
|
||||
|
||||
unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
|
||||
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
|
||||
sctx->num_vertex_elements >
|
||||
num_vbos_in_user_sgprs;
|
||||
sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
|
||||
num_vbos_in_user_sgprs;
|
||||
sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -73,6 +73,14 @@
|
|||
__cs_num += __n; \
|
||||
} while (0)
|
||||
|
||||
/* Instead of writing into the command buffer, return the pointer to the command buffer and
|
||||
* assume that the caller will fill the specified number of elements.
|
||||
*
* "num" is the number of elements to reserve. "ptr" is the address of a pointer variable; it
* receives the location of the first reserved element (__cs_buf + __cs_num).
* The write position __cs_num is advanced by "num" immediately, i.e. before the caller has
* stored anything, so the caller must fill all "num" elements through the returned pointer.
* NOTE(review): __cs_buf and __cs_num are not defined by this macro; presumably they are the
* locals set up by radeon_begin -- confirm.
*/
|
||||
#define radeon_emit_array_get_ptr(num, ptr) do { \
|
||||
*(ptr) = __cs_buf + __cs_num; \
|
||||
__cs_num += (num); \
|
||||
} while (0)
|
||||
|
||||
#define radeon_set_config_reg_seq(reg, num) do { \
|
||||
SI_CHECK_SHADOWED_REGS(reg, num); \
|
||||
assert((reg) < SI_CONTEXT_REG_OFFSET); \
|
||||
|
|
|
|||
|
|
@ -1160,32 +1160,6 @@ static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers,
|
|||
}
|
||||
}
|
||||
|
||||
/* VERTEX BUFFERS */
|
||||
|
||||
/* Add the vertex buffer resources to the buffer list of the gfx command stream.
 *
 * Walks the first sctx->num_vertex_elements entries of the bound vertex elements and, for each
 * one whose vertex buffer slot has a resource, adds that resource with read usage and
 * vertex-buffer priority. Afterwards the VB descriptor buffer (if any) is added with read usage
 * and descriptor priority.
 */
static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
|
||||
{
|
||||
int count = sctx->num_vertex_elements;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
int vb = sctx->vertex_elements->vertex_buffer_index[i];
|
||||
|
||||
/* Skip elements whose buffer index lies outside the vertex_buffer array. */
if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
|
||||
continue;
|
||||
/* Skip slots that have no resource set. */
if (!sctx->vertex_buffer[vb].buffer.resource)
|
||||
continue;
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs,
|
||||
si_resource(sctx->vertex_buffer[vb].buffer.resource),
|
||||
RADEON_USAGE_READ | RADEON_PRIO_VERTEX_BUFFER);
|
||||
}
|
||||
|
||||
/* Nothing more to add when there is no VB descriptor buffer. */
if (!sctx->vb_descriptors_buffer)
|
||||
return;
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
|
||||
RADEON_USAGE_READ | RADEON_PRIO_DESCRIPTORS);
|
||||
}
|
||||
|
||||
/* CONSTANT BUFFERS */
|
||||
|
||||
static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx,
|
||||
|
|
@ -2059,29 +2033,16 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad
|
|||
sctx->shader_pointers_dirty |=
|
||||
u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS);
|
||||
|
||||
if (shader == PIPE_SHADER_VERTEX) {
|
||||
unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
|
||||
|
||||
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
|
||||
sctx->num_vertex_elements >
|
||||
num_vbos_in_user_sgprs;
|
||||
sctx->vertex_buffer_user_sgprs_dirty =
|
||||
sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
|
||||
}
|
||||
if (shader == PIPE_SHADER_VERTEX)
|
||||
sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
|
||||
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
|
||||
}
|
||||
|
||||
void si_shader_pointers_mark_dirty(struct si_context *sctx)
|
||||
{
|
||||
unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
|
||||
|
||||
sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
|
||||
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
|
||||
sctx->num_vertex_elements >
|
||||
num_vbos_in_user_sgprs;
|
||||
sctx->vertex_buffer_user_sgprs_dirty =
|
||||
sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
|
||||
sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
|
||||
sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
|
||||
sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
|
||||
|
|
@ -2884,8 +2845,6 @@ void si_release_all_descriptors(struct si_context *sctx)
|
|||
for (i = 0; i < SI_NUM_DESCS; ++i)
|
||||
si_release_descriptors(&sctx->descriptors[i]);
|
||||
|
||||
si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
|
||||
|
||||
si_release_bindless_descriptors(sctx);
|
||||
}
|
||||
|
||||
|
|
@ -2963,7 +2922,6 @@ void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx)
|
|||
si_image_views_begin_new_cs(sctx, &sctx->images[i]);
|
||||
}
|
||||
si_buffer_resources_begin_new_cs(sctx, &sctx->internal_bindings);
|
||||
si_vertex_buffers_begin_new_cs(sctx);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(sctx->vertex_buffer); i++) {
|
||||
struct si_resource *buf = si_resource(sctx->vertex_buffer[i].buffer.resource);
|
||||
|
|
|
|||
|
|
@ -221,6 +221,7 @@ static void si_destroy_context(struct pipe_context *context)
|
|||
si_resource_reference(&sctx->wait_mem_scratch_tmz, NULL);
|
||||
si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
|
||||
si_resource_reference(&sctx->pipeline_stats_query_buf, NULL);
|
||||
si_resource_reference(&sctx->last_const_upload_buffer, NULL);
|
||||
|
||||
if (sctx->cs_preamble_state)
|
||||
si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0);
|
||||
|
|
|
|||
|
|
@ -1111,13 +1111,14 @@ struct si_context {
|
|||
|
||||
/* Vertex buffers. */
|
||||
bool vertex_buffers_dirty;
|
||||
bool vertex_buffer_pointer_dirty;
|
||||
bool vertex_buffer_user_sgprs_dirty;
|
||||
struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
|
||||
uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
|
||||
struct si_resource *vb_descriptors_buffer;
|
||||
unsigned vb_descriptors_offset;
|
||||
unsigned vb_descriptor_user_sgprs[5 * 4];
|
||||
struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
|
||||
|
||||
/* Even though we don't need this variable, u_upload_alloc has an optimization that skips
|
||||
* reference counting when the new upload buffer is the same as the last one. So keep
|
||||
* the last upload buffer here and always pass &last_const_upload_buffer to u_upload_alloc.
|
||||
*/
|
||||
struct si_resource *last_const_upload_buffer;
|
||||
|
||||
/* MSAA config state. */
|
||||
int ps_iter_samples;
|
||||
|
|
|
|||
|
|
@ -5089,14 +5089,7 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
|
|||
|
||||
sctx->vertex_elements = v;
|
||||
sctx->num_vertex_elements = v->count;
|
||||
|
||||
if (sctx->num_vertex_elements) {
|
||||
sctx->vertex_buffers_dirty = true;
|
||||
} else {
|
||||
sctx->vertex_buffers_dirty = false;
|
||||
sctx->vertex_buffer_pointer_dirty = false;
|
||||
sctx->vertex_buffer_user_sgprs_dirty = false;
|
||||
}
|
||||
sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
|
||||
|
||||
if (old->instance_divisor_is_one != v->instance_divisor_is_one ||
|
||||
old->instance_divisor_is_fetched != v->instance_divisor_is_fetched ||
|
||||
|
|
|
|||
|
|
@ -1885,6 +1885,20 @@ static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_stat
|
|||
return util_bitcount_fast<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
|
||||
}
|
||||
|
||||
/* Return the offset of the user SGPR that holds the pointer to the vertex buffer descriptors.
 *
 * The result is a dword index multiplied by 4; the callers in this file add it to sh_base and
 * pass the sum to radeon_set_sh_reg together with the descriptor address.
 * The dword index is chosen from the template parameters only:
 *   - SI_VS_NUM_USER_SGPR by default,
 *   - GFX9_TCS_NUM_USER_SGPR on GFX9+ when HAS_TESS is set,
 *   - GFX9_GS_NUM_USER_SGPR on GFX9+ when HAS_TESS is not set and HAS_GS or NGG is set.
 */
template<amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
|
||||
static unsigned get_vb_descriptor_sgpr_ptr_offset(void)
|
||||
{
|
||||
/* Find the location of the VB descriptor pointer. */
|
||||
unsigned dw_offset = SI_VS_NUM_USER_SGPR;
|
||||
if (GFX_VERSION >= GFX9) {
|
||||
if (HAS_TESS)
|
||||
dw_offset = GFX9_TCS_NUM_USER_SGPR;
|
||||
else if (HAS_GS || NGG)
|
||||
dw_offset = GFX9_GS_NUM_USER_SGPR;
|
||||
}
|
||||
/* Convert the dword index to the offset expected by the callers (4 per dword). */
return dw_offset * 4;
|
||||
}
|
||||
|
||||
template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
||||
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
|
||||
static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
|
||||
|
|
@ -1897,7 +1911,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
|
|||
unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
|
||||
PIPE_SHADER_VERTEX);
|
||||
unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION);
|
||||
bool pointer_dirty, user_sgprs_dirty;
|
||||
|
||||
assert(count <= SI_MAX_ATTRIBS);
|
||||
|
||||
|
|
@ -1908,37 +1921,39 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
|
|||
unsigned alloc_size = IS_DRAW_VERTEX_STATE ?
|
||||
vstate->velems.vb_desc_list_alloc_size :
|
||||
velems->vb_desc_list_alloc_size;
|
||||
uint64_t vb_descriptors_address = 0;
|
||||
uint32_t *ptr;
|
||||
|
||||
if (alloc_size) {
|
||||
unsigned offset;
|
||||
|
||||
/* Vertex buffer descriptors are the only ones which are uploaded directly
|
||||
* and don't go through si_upload_graphics_shader_descriptors.
|
||||
*/
|
||||
u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
|
||||
si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
|
||||
(struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
|
||||
if (!sctx->vb_descriptors_buffer) {
|
||||
sctx->vb_descriptors_offset = 0;
|
||||
si_optimal_tcc_alignment(sctx, alloc_size), &offset,
|
||||
(struct pipe_resource **)&sctx->last_const_upload_buffer, (void **)&ptr);
|
||||
if (!sctx->last_const_upload_buffer)
|
||||
return false;
|
||||
}
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->last_const_upload_buffer,
|
||||
RADEON_USAGE_READ | RADEON_PRIO_DESCRIPTORS);
|
||||
vb_descriptors_address = sctx->last_const_upload_buffer->gpu_address + offset;
|
||||
|
||||
/* GFX6 doesn't support the L2 prefetch. */
|
||||
if (GFX_VERSION >= GFX7)
|
||||
si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
|
||||
si_cp_dma_prefetch(sctx, &sctx->last_const_upload_buffer->b.b, offset,
|
||||
alloc_size);
|
||||
} else {
|
||||
si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
|
||||
}
|
||||
|
||||
if (IS_DRAW_VERTEX_STATE) {
|
||||
unsigned i = 0;
|
||||
|
||||
radeon_begin(&sctx->gfx_cs);
|
||||
|
||||
if (num_vbos_in_user_sgprs) {
|
||||
unsigned num_vb_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;
|
||||
|
||||
radeon_begin(&sctx->gfx_cs);
|
||||
radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_vb_sgprs);
|
||||
|
||||
for (; partial_velem_mask && i < num_vbos_in_user_sgprs; i++) {
|
||||
|
|
@ -1946,15 +1961,24 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
|
|||
|
||||
radeon_emit_array(&vstate->descriptors[velem_index * 4], 4);
|
||||
}
|
||||
radeon_end();
|
||||
}
|
||||
|
||||
for (; partial_velem_mask; i++) {
|
||||
unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
|
||||
uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
|
||||
if (partial_velem_mask) {
|
||||
assert(alloc_size);
|
||||
|
||||
memcpy(desc, &vstate->descriptors[velem_index * 4], 16);
|
||||
unsigned vb_desc_offset =
|
||||
sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX_VERSION, HAS_TESS, HAS_GS, NGG>();
|
||||
|
||||
radeon_set_sh_reg(vb_desc_offset, vb_descriptors_address);
|
||||
|
||||
for (; partial_velem_mask; i++) {
|
||||
unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
|
||||
uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
|
||||
|
||||
memcpy(desc, &vstate->descriptors[velem_index * 4], 16);
|
||||
}
|
||||
}
|
||||
radeon_end();
|
||||
|
||||
if (vstate->b.input.vbuffer.buffer.resource != vstate->b.input.indexbuf) {
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs,
|
||||
|
|
@ -1964,62 +1988,48 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
|
|||
|
||||
/* The next draw_vbo should recompute and rebind vertex buffer descriptors. */
|
||||
sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
|
||||
|
||||
user_sgprs_dirty = false; /* We just set them above. */
|
||||
pointer_dirty = count > num_vbos_in_user_sgprs;
|
||||
} else {
|
||||
for (unsigned i = 0; i < count; i++) {
|
||||
unsigned vbo_index = velems->vertex_buffer_index[i];
|
||||
struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
|
||||
uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
|
||||
: &ptr[(i - num_vbos_in_user_sgprs) * 4];
|
||||
unsigned count_in_user_sgprs = MIN2(count, num_vbos_in_user_sgprs);
|
||||
unsigned i = 0;
|
||||
|
||||
si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc);
|
||||
if (count_in_user_sgprs) {
|
||||
radeon_begin(&sctx->gfx_cs);
|
||||
radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4,
|
||||
count_in_user_sgprs * 4);
|
||||
|
||||
/* the first iteration always executes */
|
||||
do {
|
||||
unsigned vbo_index = velems->vertex_buffer_index[i];
|
||||
struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
|
||||
uint32_t *desc;
|
||||
|
||||
radeon_emit_array_get_ptr(4, &desc);
|
||||
|
||||
si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc);
|
||||
} while (++i < count_in_user_sgprs);
|
||||
|
||||
radeon_end();
|
||||
}
|
||||
|
||||
if (alloc_size) {
|
||||
/* the first iteration always executes */
|
||||
do {
|
||||
unsigned vbo_index = velems->vertex_buffer_index[i];
|
||||
struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
|
||||
uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
|
||||
|
||||
si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc);
|
||||
} while (++i < count);
|
||||
|
||||
unsigned vb_desc_ptr_offset =
|
||||
sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX_VERSION, HAS_TESS, HAS_GS, NGG>();
|
||||
radeon_begin(&sctx->gfx_cs);
|
||||
radeon_set_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
|
||||
radeon_end();
|
||||
}
|
||||
|
||||
sctx->vertex_buffers_dirty = false;
|
||||
user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
|
||||
pointer_dirty = alloc_size != 0;
|
||||
}
|
||||
} else {
|
||||
pointer_dirty = sctx->vertex_buffer_pointer_dirty;
|
||||
user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty;
|
||||
}
|
||||
|
||||
if (pointer_dirty || user_sgprs_dirty) {
|
||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||
assert(count);
|
||||
|
||||
radeon_begin(cs);
|
||||
|
||||
/* Set the pointer to vertex buffer descriptors. */
|
||||
if (pointer_dirty && count > num_vbos_in_user_sgprs) {
|
||||
/* Find the location of the VB descriptor pointer. */
|
||||
unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
|
||||
if (GFX_VERSION >= GFX9) {
|
||||
if (HAS_TESS)
|
||||
sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
|
||||
else if (HAS_GS || NGG)
|
||||
sh_dw_offset = GFX9_GS_NUM_USER_SGPR;
|
||||
}
|
||||
|
||||
radeon_set_sh_reg(sh_base + sh_dw_offset * 4,
|
||||
sctx->vb_descriptors_buffer->gpu_address +
|
||||
sctx->vb_descriptors_offset);
|
||||
sctx->vertex_buffer_pointer_dirty = false;
|
||||
}
|
||||
|
||||
/* Set VB descriptors in user SGPRs. */
|
||||
if (user_sgprs_dirty) {
|
||||
assert(num_vbos_in_user_sgprs);
|
||||
|
||||
unsigned num_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;
|
||||
|
||||
radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_sgprs);
|
||||
radeon_emit_array(sctx->vb_descriptor_user_sgprs, num_sgprs);
|
||||
sctx->vertex_buffer_user_sgprs_dirty = false;
|
||||
}
|
||||
radeon_end();
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
@ -2678,8 +2688,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
|
|||
|
||||
/* Don't set per-stage shader pointers for VS. */
|
||||
sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
|
||||
sctx->vertex_buffer_pointer_dirty = false;
|
||||
sctx->vertex_buffer_user_sgprs_dirty = false;
|
||||
sctx->vertex_buffers_dirty = false;
|
||||
|
||||
pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue