gallium/u_vbuf: add a faster path for uploading non-interleaved attribs

+1% higher FPS in torcs.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5304>
Commit authored by:
Marek Olšák 2020-06-01 15:56:12 -04:00
parent 88e8f1a38d
commit 90c34aed1d

View file

@ -131,6 +131,9 @@ struct u_vbuf_elements {
* non-instanced. */
uint32_t noninstance_vb_mask_any;
/* Which buffers are used by multiple vertex attribs. */
uint32_t interleaved_vb_mask;
void *driver_cso;
};
@ -802,6 +805,9 @@ u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
ve->src_format_size[i] = util_format_get_blocksize(format);
if (used_buffers & vb_index_bit)
ve->interleaved_vb_mask |= vb_index_bit;
used_buffers |= vb_index_bit;
if (!ve->ve[i].instance_divisor) {
@ -955,6 +961,49 @@ void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
mgr->dirty_real_vb_mask |= ~mask;
}
/* Compute the byte range [*offset, *offset + *size) of the user buffer that
 * must be uploaded for a single vertex element, depending on whether the
 * attrib is constant (stride 0), per-instance, or per-vertex.
 *
 * Returns false — leaving *offset and *size untouched — when nothing needs
 * uploading: the buffer either came from the translate fallback or is not a
 * user buffer.
 */
static ALWAYS_INLINE bool
get_upload_offset_size(struct u_vbuf *mgr,
                       const struct pipe_vertex_buffer *vb,
                       struct u_vbuf_elements *ve,
                       const struct pipe_vertex_element *velem,
                       unsigned vb_index, unsigned velem_index,
                       int start_vertex, unsigned num_vertices,
                       int start_instance, unsigned num_instances,
                       unsigned *offset, unsigned *size)
{
   /* Skip non-user buffers and the buffers generated by translate. */
   if (!vb->is_user_buffer || ((1 << vb_index) & mgr->fallback_vbs_mask))
      return false;

   unsigned attrib_size = ve->src_format_size[velem_index];
   unsigned divisor = velem->instance_divisor;
   unsigned start = vb->buffer_offset + velem->src_offset;

   if (vb->stride == 0) {
      /* Constant attrib: only one element is ever fetched. */
      *size = attrib_size;
   } else if (divisor != 0) {
      /* Per-instance attrib. Compute ceil(num_instances / divisor) without
       * the usual div_round_up() pattern: the CTS uses divisor == ~0, and
       * div_round_up()'s addition would overflow for that value.
       */
      unsigned icount = num_instances / divisor;
      if (icount * divisor != num_instances)
         icount++;

      start += vb->stride * start_instance;
      *size = vb->stride * (icount - 1) + attrib_size;
   } else {
      /* Per-vertex attrib. */
      start += vb->stride * start_vertex;
      *size = vb->stride * (num_vertices - 1) + attrib_size;
   }

   *offset = start;
   return true;
}
static enum pipe_error
u_vbuf_upload_buffers(struct u_vbuf *mgr,
int start_vertex, unsigned num_vertices,
@ -965,51 +1014,51 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
unsigned nr_velems = ve->count;
const struct pipe_vertex_element *velems =
mgr->using_translate ? mgr->fallback_velems.velems : ve->ve;
/* Faster path when no vertex attribs are interleaved. */
if ((ve->interleaved_vb_mask & mgr->user_vb_mask) == 0) {
for (i = 0; i < nr_velems; i++) {
const struct pipe_vertex_element *velem = &velems[i];
unsigned index = velem->vertex_buffer_index;
struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
unsigned offset, size;
if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
num_vertices, start_instance, num_instances,
&offset, &size))
continue;
struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[index];
const uint8_t *ptr = mgr->vertex_buffer[index].buffer.user;
u_upload_data(mgr->pipe->stream_uploader,
mgr->has_signed_vb_offset ? 0 : offset,
size, 4, ptr + offset, &real_vb->buffer_offset,
&real_vb->buffer.resource);
if (!real_vb->buffer.resource)
return PIPE_ERROR_OUT_OF_MEMORY;
real_vb->buffer_offset -= offset;
}
return PIPE_OK;
}
unsigned start_offset[PIPE_MAX_ATTRIBS];
unsigned end_offset[PIPE_MAX_ATTRIBS];
uint32_t buffer_mask = 0;
/* Slower path supporting interleaved vertex attribs using 2 loops. */
/* Determine how much data needs to be uploaded. */
for (i = 0; i < nr_velems; i++) {
const struct pipe_vertex_element *velem = &velems[i];
unsigned index = velem->vertex_buffer_index;
struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
unsigned instance_div, first, size, index_bit;
unsigned first, size, index_bit;
/* Skip the buffers generated by translate. */
if ((1 << index) & mgr->fallback_vbs_mask) {
if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
num_vertices, start_instance, num_instances,
&first, &size))
continue;
}
if (!vb->is_user_buffer) {
continue;
}
instance_div = velem->instance_divisor;
first = vb->buffer_offset + velem->src_offset;
if (!vb->stride) {
/* Constant attrib. */
size = ve->src_format_size[i];
} else if (instance_div) {
/* Per-instance attrib. */
/* Figure out how many instances we'll render given instance_div. We
* can't use the typical div_round_up() pattern because the CTS uses
* instance_div = ~0 for a test, which overflows div_round_up()'s
* addition.
*/
unsigned count = num_instances / instance_div;
if (count * instance_div != num_instances)
count++;
first += vb->stride * start_instance;
size = vb->stride * (count - 1) + ve->src_format_size[i];
} else {
/* Per-vertex attrib. */
first += vb->stride * start_vertex;
size = vb->stride * (num_vertices - 1) + ve->src_format_size[i];
}
index_bit = 1 << index;