gallium/u_vbuf: handle indirect multidraws correctly and efficiently (v3)

v2: need to do MAX{start+count} instead of MAX{count} added piglit tests v3: use malloc Cc: 18.2 <mesa-stable@lists.freedesktop.org> Reviewed-by: Eric Anholt <eric@anholt.net>
2025-12-22 17:50:12 +01:00 · 2018-07-17 01:52:25 -04:00 · 2018-07-17 01:52:25 -04:00 · 0f79b2015b
commit 0f79b2015b
parent 1c7a2433b2
1 changed files with 179 additions and 23 deletions
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@ -1131,6 +1131,31 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
   mgr->dirty_real_vb_mask = 0;
 }
 static void
 u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
                               unsigned *indirect_data, unsigned stride,
                               unsigned draw_count)
 {
   assert(info->index_size);
   info->indirect = NULL;
   for (unsigned i = 0; i < draw_count; i++) {
      unsigned offset = i * stride / 4;
      info->count = indirect_data[offset + 0];
      info->instance_count = indirect_data[offset + 1];
      if (!info->count || !info->instance_count)
         continue;
      info->start = indirect_data[offset + 2];
      info->index_bias = indirect_data[offset + 3];
      info->start_instance = indirect_data[offset + 4];
      u_vbuf_draw_vbo(mgr, info);
   }
 }
 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
 {
   struct pipe_context *pipe = mgr->pipe;
@ -1160,33 +1185,163 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
   new_info = *info;
-   /* Fallback. We need to know all the parameters. */
+   /* Handle indirect (multi)draws. */
   if (new_info.indirect) {
-      struct pipe_transfer *transfer = NULL;
+      const struct pipe_draw_indirect_info *indirect = new_info.indirect;
-      int *data;
+      unsigned draw_count = 0;
-      if (new_info.index_size) {
+      /* Get the number of draws. */
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
+      if (indirect->indirect_draw_count) {
-                                      new_info.indirect->offset, 20,
+         pipe_buffer_read(pipe, indirect->indirect_draw_count,
-                                      PIPE_TRANSFER_READ, &transfer);
+                          indirect->indirect_draw_count_offset,
-         new_info.index_bias = data[3];
+                          4, &draw_count);
-         new_info.start_instance = data[4];
+      } else {
-      }
+         draw_count = indirect->draw_count;
      else {
         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
                                      new_info.indirect->offset, 16,
                                      PIPE_TRANSFER_READ, &transfer);
         new_info.start_instance = data[3];
      }
-      new_info.count = data[0];
+      if (!draw_count)
      new_info.instance_count = data[1];
      new_info.start = data[2];
      pipe_buffer_unmap(pipe, transfer);
      new_info.indirect = NULL;
      if (!new_info.count)
         return;
      unsigned data_size = (draw_count - 1) * indirect->stride +
                           (new_info.index_size ? 20 : 16);
      unsigned *data = malloc(data_size);
      if (!data)
         return; /* report an error? */
      /* Read the used buffer range only once, because the read can be
       * uncached.
       */
      pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
                       data);
      if (info->index_size) {
         /* Indexed multidraw. */
         unsigned index_bias0 = data[3];
         bool index_bias_same = true;
         /* If we invoke the translate path, we have to split the multidraw. */
         if (incompatible_vb_mask ||
             mgr->ve->incompatible_elem_mask) {
            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
                                           indirect->stride, draw_count);
            free(data);
            return;
         }
         /* See if index_bias is the same for all draws. */
         for (unsigned i = 1; i < draw_count; i++) {
            if (data[i * indirect->stride / 4 + 3] != index_bias0) {
               index_bias_same = false;
               break;
            }
         }
         /* Split the multidraw if index_bias is different. */
         if (!index_bias_same) {
            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
                                           indirect->stride, draw_count);
            free(data);
            return;
         }
         /* If we don't need to use the translate path and index_bias is
          * the same, we can process the multidraw with the time complexity
          * equal to 1 draw call (except for the index range computation).
          * We only need to compute the index range covering all draw calls
          * of the multidraw.
          *
          * The driver will not look at these values because indirect != NULL.
          * These values determine the user buffer bounds to upload.
          */
         new_info.index_bias = index_bias0;
         new_info.min_index = ~0u;
         new_info.max_index = 0;
         new_info.start_instance = ~0u;
         unsigned end_instance = 0;
         struct pipe_transfer *transfer = NULL;
         const uint8_t *indices;
         if (info->has_user_indices) {
            indices = (uint8_t*)info->index.user;
         } else {
            indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
                                                PIPE_TRANSFER_READ, &transfer);
         }
         for (unsigned i = 0; i < draw_count; i++) {
            unsigned offset = i * indirect->stride / 4;
            unsigned start = data[offset + 2];
            unsigned count = data[offset + 0];
            unsigned start_instance = data[offset + 4];
            unsigned instance_count = data[offset + 1];
            if (!count || !instance_count)
               continue;
            /* Update the ranges of instances. */
            new_info.start_instance = MIN2(new_info.start_instance,
                                           start_instance);
            end_instance = MAX2(end_instance, start_instance + instance_count);
            /* Update the index range. */
            unsigned min, max;
            new_info.count = count; /* only used by get_minmax_index */
            u_vbuf_get_minmax_index_mapped(&new_info,
                                           indices +
                                           new_info.index_size * start,
                                           &min, &max);
            new_info.min_index = MIN2(new_info.min_index, min);
            new_info.max_index = MAX2(new_info.max_index, max);
         }
         free(data);
         if (transfer)
            pipe_buffer_unmap(pipe, transfer);
         /* Set the final instance count. */
         new_info.instance_count = end_instance - new_info.start_instance;
         if (new_info.start_instance == ~0u || !new_info.instance_count)
            return;
      } else {
         /* Non-indexed multidraw.
          *
          * Keep the draw call indirect and compute minimums & maximums,
          * which will determine the user buffer bounds to upload, but
          * the driver will not look at these values because indirect != NULL.
          *
          * This efficiently processes the multidraw with the time complexity
          * equal to 1 draw call.
          */
         new_info.start = ~0u;
         new_info.start_instance = ~0u;
         unsigned end_vertex = 0;
         unsigned end_instance = 0;
         for (unsigned i = 0; i < draw_count; i++) {
            unsigned offset = i * indirect->stride / 4;
            unsigned start = data[offset + 2];
            unsigned count = data[offset + 0];
            unsigned start_instance = data[offset + 3];
            unsigned instance_count = data[offset + 1];
            new_info.start = MIN2(new_info.start, start);
            new_info.start_instance = MIN2(new_info.start_instance,
                                           start_instance);
            end_vertex = MAX2(end_vertex, start + count);
            end_instance = MAX2(end_instance, start_instance + instance_count);
         }
         /* Set the final counts. */
         new_info.count = end_vertex - new_info.start;
         new_info.instance_count = end_instance - new_info.start_instance;
         if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
            return;
      }
   }
   if (new_info.index_size) {
@ -1211,7 +1366,8 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
          * We would have to break this drawing operation into several ones. */
         /* Use some heuristic to see if unrolling indices improves
          * performance. */
-         if (!new_info.primitive_restart &&
+         if (!info->indirect &&
             !new_info.primitive_restart &&
             num_vertices > new_info.count*2 &&
             num_vertices - new_info.count > 32 &&
             !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {