mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 04:20:08 +01:00
gallium/u_vbuf: add a faster path for uploading non-interleaved attribs
+1% higher FPS in torcs.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5304>
This commit is contained in:
parent
88e8f1a38d
commit
90c34aed1d
1 changed file with 83 additions and 34 deletions
|
|
@ -131,6 +131,9 @@ struct u_vbuf_elements {
|
|||
* non-instanced. */
|
||||
uint32_t noninstance_vb_mask_any;
|
||||
|
||||
/* Which buffers are used by multiple vertex attribs. */
|
||||
uint32_t interleaved_vb_mask;
|
||||
|
||||
void *driver_cso;
|
||||
};
|
||||
|
||||
|
|
@ -802,6 +805,9 @@ u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
|
|||
|
||||
ve->src_format_size[i] = util_format_get_blocksize(format);
|
||||
|
||||
if (used_buffers & vb_index_bit)
|
||||
ve->interleaved_vb_mask |= vb_index_bit;
|
||||
|
||||
used_buffers |= vb_index_bit;
|
||||
|
||||
if (!ve->ve[i].instance_divisor) {
|
||||
|
|
@ -955,6 +961,49 @@ void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
|
|||
mgr->dirty_real_vb_mask |= ~mask;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE bool
|
||||
get_upload_offset_size(struct u_vbuf *mgr,
|
||||
const struct pipe_vertex_buffer *vb,
|
||||
struct u_vbuf_elements *ve,
|
||||
const struct pipe_vertex_element *velem,
|
||||
unsigned vb_index, unsigned velem_index,
|
||||
int start_vertex, unsigned num_vertices,
|
||||
int start_instance, unsigned num_instances,
|
||||
unsigned *offset, unsigned *size)
|
||||
{
|
||||
/* Skip the buffers generated by translate. */
|
||||
if ((1 << vb_index) & mgr->fallback_vbs_mask || !vb->is_user_buffer)
|
||||
return false;
|
||||
|
||||
unsigned instance_div = velem->instance_divisor;
|
||||
*offset = vb->buffer_offset + velem->src_offset;
|
||||
|
||||
if (!vb->stride) {
|
||||
/* Constant attrib. */
|
||||
*size = ve->src_format_size[velem_index];
|
||||
} else if (instance_div) {
|
||||
/* Per-instance attrib. */
|
||||
|
||||
/* Figure out how many instances we'll render given instance_div. We
|
||||
* can't use the typical div_round_up() pattern because the CTS uses
|
||||
* instance_div = ~0 for a test, which overflows div_round_up()'s
|
||||
* addition.
|
||||
*/
|
||||
unsigned count = num_instances / instance_div;
|
||||
if (count * instance_div != num_instances)
|
||||
count++;
|
||||
|
||||
*offset += vb->stride * start_instance;
|
||||
*size = vb->stride * (count - 1) + ve->src_format_size[velem_index];
|
||||
} else {
|
||||
/* Per-vertex attrib. */
|
||||
*offset += vb->stride * start_vertex;
|
||||
*size = vb->stride * (num_vertices - 1) + ve->src_format_size[velem_index];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static enum pipe_error
|
||||
u_vbuf_upload_buffers(struct u_vbuf *mgr,
|
||||
int start_vertex, unsigned num_vertices,
|
||||
|
|
@ -965,51 +1014,51 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
|
|||
unsigned nr_velems = ve->count;
|
||||
const struct pipe_vertex_element *velems =
|
||||
mgr->using_translate ? mgr->fallback_velems.velems : ve->ve;
|
||||
|
||||
/* Faster path when no vertex attribs are interleaved. */
|
||||
if ((ve->interleaved_vb_mask & mgr->user_vb_mask) == 0) {
|
||||
for (i = 0; i < nr_velems; i++) {
|
||||
const struct pipe_vertex_element *velem = &velems[i];
|
||||
unsigned index = velem->vertex_buffer_index;
|
||||
struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
|
||||
unsigned offset, size;
|
||||
|
||||
if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
|
||||
num_vertices, start_instance, num_instances,
|
||||
&offset, &size))
|
||||
continue;
|
||||
|
||||
struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[index];
|
||||
const uint8_t *ptr = mgr->vertex_buffer[index].buffer.user;
|
||||
|
||||
u_upload_data(mgr->pipe->stream_uploader,
|
||||
mgr->has_signed_vb_offset ? 0 : offset,
|
||||
size, 4, ptr + offset, &real_vb->buffer_offset,
|
||||
&real_vb->buffer.resource);
|
||||
if (!real_vb->buffer.resource)
|
||||
return PIPE_ERROR_OUT_OF_MEMORY;
|
||||
|
||||
real_vb->buffer_offset -= offset;
|
||||
}
|
||||
return PIPE_OK;
|
||||
}
|
||||
|
||||
unsigned start_offset[PIPE_MAX_ATTRIBS];
|
||||
unsigned end_offset[PIPE_MAX_ATTRIBS];
|
||||
uint32_t buffer_mask = 0;
|
||||
|
||||
/* Slower path supporting interleaved vertex attribs using 2 loops. */
|
||||
/* Determine how much data needs to be uploaded. */
|
||||
for (i = 0; i < nr_velems; i++) {
|
||||
const struct pipe_vertex_element *velem = &velems[i];
|
||||
unsigned index = velem->vertex_buffer_index;
|
||||
struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
|
||||
unsigned instance_div, first, size, index_bit;
|
||||
unsigned first, size, index_bit;
|
||||
|
||||
/* Skip the buffers generated by translate. */
|
||||
if ((1 << index) & mgr->fallback_vbs_mask) {
|
||||
if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
|
||||
num_vertices, start_instance, num_instances,
|
||||
&first, &size))
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!vb->is_user_buffer) {
|
||||
continue;
|
||||
}
|
||||
|
||||
instance_div = velem->instance_divisor;
|
||||
first = vb->buffer_offset + velem->src_offset;
|
||||
|
||||
if (!vb->stride) {
|
||||
/* Constant attrib. */
|
||||
size = ve->src_format_size[i];
|
||||
} else if (instance_div) {
|
||||
/* Per-instance attrib. */
|
||||
|
||||
/* Figure out how many instances we'll render given instance_div. We
|
||||
* can't use the typical div_round_up() pattern because the CTS uses
|
||||
* instance_div = ~0 for a test, which overflows div_round_up()'s
|
||||
* addition.
|
||||
*/
|
||||
unsigned count = num_instances / instance_div;
|
||||
if (count * instance_div != num_instances)
|
||||
count++;
|
||||
|
||||
first += vb->stride * start_instance;
|
||||
size = vb->stride * (count - 1) + ve->src_format_size[i];
|
||||
} else {
|
||||
/* Per-vertex attrib. */
|
||||
first += vb->stride * start_vertex;
|
||||
size = vb->stride * (num_vertices - 1) + ve->src_format_size[i];
|
||||
}
|
||||
|
||||
index_bit = 1 << index;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue