Mirror of https://gitlab.freedesktop.org/mesa/mesa.git
radeonsi: move si_upload_vertex_buffer_descriptors into si_state_draw.c
It will be inlined there.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6786>
parent 12b1e8a35d
commit 8ab15c9e33

3 changed files with 103 additions and 104 deletions
@@ -1073,109 +1073,6 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
                             RADEON_PRIO_DESCRIPTORS);
}

bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
   unsigned i, count = sctx->num_vertex_elements;
   uint32_t *ptr;

   if (!sctx->vertex_buffers_dirty || !count)
      return true;

   struct si_vertex_elements *velems = sctx->vertex_elements;
   unsigned alloc_size = velems->vb_desc_list_alloc_size;

   if (alloc_size) {
      /* Vertex buffer descriptors are the only ones which are uploaded
       * directly through a staging buffer and don't go through
       * the fine-grained upload path.
       */
      u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
                     si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
                     (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
      if (!sctx->vb_descriptors_buffer) {
         sctx->vb_descriptors_offset = 0;
         sctx->vb_descriptors_gpu_list = NULL;
         return false;
      }

      sctx->vb_descriptors_gpu_list = ptr;
      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
                                RADEON_PRIO_DESCRIPTORS);
      sctx->vertex_buffer_pointer_dirty = true;
      sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
   } else {
      si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
      sctx->vertex_buffer_pointer_dirty = false;
      sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
   }

   assert(count <= SI_MAX_ATTRIBS);

   unsigned first_vb_use_mask = velems->first_vb_use_mask;
   unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;

   for (i = 0; i < count; i++) {
      struct pipe_vertex_buffer *vb;
      struct si_resource *buf;
      unsigned vbo_index = velems->vertex_buffer_index[i];
      uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
                                                  : &ptr[(i - num_vbos_in_user_sgprs) * 4];

      vb = &sctx->vertex_buffer[vbo_index];
      buf = si_resource(vb->buffer.resource);
      if (!buf) {
         memset(desc, 0, 16);
         continue;
      }

      int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i];

      if (offset >= buf->b.b.width0) {
         assert(offset < buf->b.b.width0);
         memset(desc, 0, 16);
         continue;
      }

      uint64_t va = buf->gpu_address + offset;

      int64_t num_records = (int64_t)buf->b.b.width0 - offset;
      if (sctx->chip_class != GFX8 && vb->stride) {
         /* Round up by rounding down and adding 1 */
         num_records = (num_records - velems->format_size[i]) / vb->stride + 1;
      }
      assert(num_records >= 0 && num_records <= UINT_MAX);

      uint32_t rsrc_word3 = velems->rsrc_word3[i];

      /* OOB_SELECT chooses the out-of-bounds check:
       * - 1: index >= NUM_RECORDS (Structured)
       * - 3: offset >= NUM_RECORDS (Raw)
       */
      if (sctx->chip_class >= GFX10)
         rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
                                                      : V_008F0C_OOB_SELECT_RAW);

      desc[0] = va;
      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride);
      desc[2] = num_records;
      desc[3] = rsrc_word3;

      if (first_vb_use_mask & (1 << i)) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource),
                                   RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
      }
   }

   /* Don't flush the const cache. It would have a very negative effect
    * on performance (confirmed by testing). New descriptors are always
    * uploaded to a fresh new buffer, so I don't think flushing the const
    * cache is needed. */
   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
   sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
   sctx->vertex_buffers_dirty = false;
   return true;
}

/* CONSTANT BUFFERS */

static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx,
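A note on the "Round up by rounding down and adding 1" line in the hunk above: it turns the remaining byte count into the number of stride-sized records whose vertex data still fits entirely inside the buffer. Below is a minimal, standalone check of that arithmetic; the 100-byte / 16-byte-stride / 8-byte-format numbers are made-up illustrative values, not taken from the driver.

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Hypothetical example: 100 bytes left in the buffer, 16-byte stride,
    * 8-byte vertex format. Elements start at bytes 0, 16, ..., 80; the one
    * that would start at byte 96 would read past the end (96 + 8 > 100). */
   int64_t num_bytes = 100;
   unsigned stride = 16;
   unsigned format_size = 8;

   /* Same formula as the driver code: floor((num_bytes - format_size) / stride)
    * is the index of the last element that fully fits, and "+ 1" converts that
    * zero-based index into a count. */
   int64_t num_records = (num_bytes - format_size) / stride + 1;

   assert(num_records == 6);
   return 0;
}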
@@ -496,7 +496,6 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource
                        unsigned stride, unsigned num_records, bool add_tid, bool swizzle,
                        unsigned element_size, unsigned index_stride, uint64_t offset);
void si_init_all_descriptors(struct si_context *sctx);
bool si_upload_vertex_buffer_descriptors(struct si_context *sctx);
bool si_upload_graphics_shader_descriptors(struct si_context *sctx);
bool si_upload_compute_shader_descriptors(struct si_context *sctx);
void si_release_all_descriptors(struct si_context *sctx);
@@ -1410,6 +1410,109 @@ void si_emit_cache_flush(struct si_context *sctx)
   sctx->flags = 0;
}

static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
   unsigned i, count = sctx->num_vertex_elements;
   uint32_t *ptr;

   if (!sctx->vertex_buffers_dirty || !count)
      return true;

   struct si_vertex_elements *velems = sctx->vertex_elements;
   unsigned alloc_size = velems->vb_desc_list_alloc_size;

   if (alloc_size) {
      /* Vertex buffer descriptors are the only ones which are uploaded
       * directly through a staging buffer and don't go through
       * the fine-grained upload path.
       */
      u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
                     si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
                     (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
      if (!sctx->vb_descriptors_buffer) {
         sctx->vb_descriptors_offset = 0;
         sctx->vb_descriptors_gpu_list = NULL;
         return false;
      }

      sctx->vb_descriptors_gpu_list = ptr;
      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
                                RADEON_PRIO_DESCRIPTORS);
      sctx->vertex_buffer_pointer_dirty = true;
      sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
   } else {
      si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
      sctx->vertex_buffer_pointer_dirty = false;
      sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
   }

   assert(count <= SI_MAX_ATTRIBS);

   unsigned first_vb_use_mask = velems->first_vb_use_mask;
   unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;

   for (i = 0; i < count; i++) {
      struct pipe_vertex_buffer *vb;
      struct si_resource *buf;
      unsigned vbo_index = velems->vertex_buffer_index[i];
      uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
                                                  : &ptr[(i - num_vbos_in_user_sgprs) * 4];

      vb = &sctx->vertex_buffer[vbo_index];
      buf = si_resource(vb->buffer.resource);
      if (!buf) {
         memset(desc, 0, 16);
         continue;
      }

      int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i];

      if (offset >= buf->b.b.width0) {
         assert(offset < buf->b.b.width0);
         memset(desc, 0, 16);
         continue;
      }

      uint64_t va = buf->gpu_address + offset;

      int64_t num_records = (int64_t)buf->b.b.width0 - offset;
      if (sctx->chip_class != GFX8 && vb->stride) {
         /* Round up by rounding down and adding 1 */
         num_records = (num_records - velems->format_size[i]) / vb->stride + 1;
      }
      assert(num_records >= 0 && num_records <= UINT_MAX);

      uint32_t rsrc_word3 = velems->rsrc_word3[i];

      /* OOB_SELECT chooses the out-of-bounds check:
       * - 1: index >= NUM_RECORDS (Structured)
       * - 3: offset >= NUM_RECORDS (Raw)
       */
      if (sctx->chip_class >= GFX10)
         rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
                                                      : V_008F0C_OOB_SELECT_RAW);

      desc[0] = va;
      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride);
      desc[2] = num_records;
      desc[3] = rsrc_word3;

      if (first_vb_use_mask & (1 << i)) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource),
                                   RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
      }
   }

   /* Don't flush the const cache. It would have a very negative effect
    * on performance (confirmed by testing). New descriptors are always
    * uploaded to a fresh new buffer, so I don't think flushing the const
    * cache is needed. */
   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
   sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
   sctx->vertex_buffers_dirty = false;
   return true;
}

static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info,
                                    unsigned *start, unsigned *count)
{
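A note on the OOB_SELECT comment in the function above: on GFX10+ the descriptor selects between an index-based out-of-bounds check (structured, used when a stride is set) and a byte-offset check (raw, used when the stride is 0), which matches how the code leaves NUM_RECORDS as a byte count for stride-0 buffers and converts it to an element count otherwise. The following is a toy, self-contained C model of just those two checks, illustrative only and not the hardware or driver implementation.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum oob_select { OOB_SELECT_STRUCTURED = 1, OOB_SELECT_RAW = 3 };

/* Returns true when the buffer access is treated as out of bounds,
 * following the two comparisons named in the driver comment. */
static bool buffer_access_oob(enum oob_select mode, uint32_t index,
                              uint32_t byte_offset, uint32_t num_records)
{
   if (mode == OOB_SELECT_STRUCTURED)
      return index >= num_records;    /* stride != 0: NUM_RECORDS is an element count */
   return byte_offset >= num_records; /* stride == 0: NUM_RECORDS is a byte count */
}

int main(void)
{
   /* Structured: NUM_RECORDS = 6 allows element indices 0..5. */
   assert(!buffer_access_oob(OOB_SELECT_STRUCTURED, 5, 0, 6));
   assert(buffer_access_oob(OOB_SELECT_STRUCTURED, 6, 0, 6));

   /* Raw: NUM_RECORDS = 100 allows byte offsets 0..99. */
   assert(!buffer_access_oob(OOB_SELECT_RAW, 0, 99, 100));
   assert(buffer_access_oob(OOB_SELECT_RAW, 0, 100, 100));
   return 0;
}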