diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index ab236e2661b..d58f7b0af62 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -955,7 +955,8 @@ static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_s enum si_prim_discard_outcome si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - bool primitive_restart) + const struct pipe_draw_start_count *draws, + unsigned num_draws, bool primitive_restart) { /* If the compute shader compilation isn't finished, this returns false. */ if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) @@ -966,7 +967,7 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; unsigned prim = info->mode; - unsigned count = info->count; + unsigned count = draws[0].count; unsigned instance_count = info->instance_count; unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); unsigned num_prims = num_prims_per_instance * instance_count; @@ -982,19 +983,21 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) { /* Split draws. */ struct pipe_draw_info split_draw = *info; + struct pipe_draw_start_count split_draw_range = draws[0]; + split_draw.primitive_restart = primitive_restart; - unsigned base_start = split_draw.start; + unsigned base_start = split_draw_range.start; if (prim == PIPE_PRIM_TRIANGLES) { unsigned vert_count_per_subdraw = split_prims_draw_level * 3; assert(vert_count_per_subdraw < count); for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw); + split_draw_range.start = base_start + start; + split_draw_range.count = MIN2(count - start, vert_count_per_subdraw); - sctx->b.draw_vbo(&sctx->b, &split_draw); + sctx->b.multi_draw(&sctx->b, &split_draw, &split_draw_range, 1); } } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { /* No primitive pair can be split, because strips reverse orientation @@ -1004,10 +1007,10 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe unsigned vert_count_per_subdraw = split_prims_draw_level; for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + split_draw_range.start = base_start + start; + split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2); - sctx->b.draw_vbo(&sctx->b, &split_draw); + sctx->b.multi_draw(&sctx->b, &split_draw, &split_draw_range, 1); if (start == 0 && primitive_restart && sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) @@ -1098,13 +1101,14 @@ void si_compute_signal_gfx(struct si_context *sctx) /* Dispatch a primitive discard compute shader. */ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, unsigned index_size, + const struct pipe_draw_info *info, + unsigned count, unsigned index_size, unsigned base_vertex, uint64_t input_indexbuf_va, unsigned input_indexbuf_num_elements) { struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count); if (!num_prims_per_instance) return; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 68dbadc76db..ce906f50e2c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1456,10 +1456,12 @@ enum si_prim_discard_outcome void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); enum si_prim_discard_outcome si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - bool primitive_restart); + const struct pipe_draw_start_count *draws, + unsigned num_draws, bool primitive_restart); void si_compute_signal_gfx(struct si_context *sctx); void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, unsigned index_size, + const struct pipe_draw_info *info, + unsigned count, unsigned index_size, unsigned base_vertex, uint64_t input_indexbuf_va, unsigned input_indexbuf_max_elements); void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 2ee204b23e8..95945b7a81e 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -760,6 +760,8 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr } static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, + const struct pipe_draw_start_count *draws, + unsigned num_draws, struct pipe_resource *indexbuf, unsigned index_size, unsigned index_offset, unsigned instance_count, bool dispatch_prim_discard_cs, unsigned original_index_size) @@ -838,6 +840,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } if (indirect) { + assert(num_draws == 1); uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; assert(indirect_va % 8 == 0); @@ -912,7 +915,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } /* Base vertex and start instance. */ - base_vertex = original_index_size ? info->index_bias : info->start; + base_vertex = original_index_size ? info->index_bias : draws[0].start; if (sctx->num_vs_blit_sgprs) { /* Re-emit draw constants after we leave u_blitter. */ @@ -938,25 +941,26 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw if (index_size) { if (dispatch_prim_discard_cs) { - index_va += info->start * original_index_size; - index_max_size = MIN2(index_max_size, info->count); + index_va += draws[0].start * original_index_size; + index_max_size = MIN2(index_max_size, draws[0].count); - si_dispatch_prim_discard_cs_and_draw(sctx, info, original_index_size, base_vertex, + si_dispatch_prim_discard_cs_and_draw(sctx, info, draws[0].count, + original_index_size, base_vertex, index_va, index_max_size); return; } - index_va += info->start * index_size; + index_va += draws[0].start * index_size; radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); radeon_emit(cs, index_max_size); radeon_emit(cs, index_va); radeon_emit(cs, index_va >> 32); - radeon_emit(cs, info->count); + radeon_emit(cs, draws[0].count); radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); } else { radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); - radeon_emit(cs, info->count); + radeon_emit(cs, draws[0].count); radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); } @@ -1537,7 +1541,8 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) } static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned *start, unsigned *count) + const struct pipe_draw_start_count *draws, + unsigned num_draws, unsigned *start, unsigned *count) { struct pipe_draw_indirect_info *indirect = info->indirect; @@ -1593,8 +1598,16 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d *start = *count = 0; } } else { - *start = info->start; - *count = info->count; + unsigned min_element = UINT_MAX; + unsigned max_element = 0; + + for (unsigned i = 0; i < num_draws; i++) { + min_element = MIN2(min_element, draws[i].start); + max_element = MAX2(max_element, draws[i].start + draws[i].count); + } + + *start = min_element; + *count = max_element; } } @@ -1724,7 +1737,10 @@ static ALWAYS_INLINE bool pd_msg(const char *s) return false; } -static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) +static void si_multi_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count *draws, + unsigned num_draws) { struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; @@ -1732,7 +1748,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i unsigned dirty_tex_counter, dirty_buf_counter; enum pipe_prim_type rast_prim, prim = info->mode; unsigned index_size = info->index_size; - unsigned index_offset = info->indirect ? info->start * index_size : 0; + unsigned index_offset = info->indirect ? draws[0].start * index_size : 0; unsigned instance_count = info->instance_count; bool primitive_restart = info->primitive_restart && @@ -1841,7 +1857,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i unsigned start, count, start_offset, size, offset; void *ptr; - si_get_draw_start_count(sctx, info, &start, &count); + si_get_draw_start_count(sctx, info, draws, num_draws, &start, &count); start_offset = start * 2; size = count * 2; @@ -1860,10 +1876,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i unsigned start_offset; assert(!info->indirect); - start_offset = info->start * index_size; + assert(num_draws == 1); + start_offset = draws[0].start * index_size; indexbuf = NULL; - u_upload_data(ctx->stream_uploader, start_offset, info->count * index_size, + u_upload_data(ctx->stream_uploader, start_offset, draws[0].count * index_size, sctx->screen->info.tcc_cache_line_size, (char *)info->index.user + start_offset, &index_offset, &indexbuf); if (unlikely(!indexbuf)) @@ -1882,7 +1899,9 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i bool dispatch_prim_discard_cs = false; bool prim_discard_cs_instancing = false; unsigned original_index_size = index_size; - unsigned direct_count = 0; + unsigned avg_direct_count = 0; + unsigned min_direct_count = 0; + unsigned total_direct_count = 0; if (info->indirect) { struct pipe_draw_indirect_info *indirect = info->indirect; @@ -1904,17 +1923,21 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } } } else { - /* Multiply by 3 for strips and fans to get an approximate vertex - * count as triangles. */ - direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3); + for (unsigned i = 0; i < num_draws; i++) { + unsigned count = draws[i].count; + + total_direct_count += count; + min_direct_count = MIN2(min_direct_count, count); + } + avg_direct_count = (total_direct_count / num_draws) * instance_count; } /* Determine if we can use the primitive discard compute shader. */ if (si_compute_prim_discard_enabled(sctx) && - (direct_count > sctx->prim_discard_vertex_count_threshold - ? (sctx->compute_num_verts_rejected += direct_count, true) + (avg_direct_count > sctx->prim_discard_vertex_count_threshold + ? (sctx->compute_num_verts_rejected += total_direct_count, true) : /* Add, then return true. */ - (sctx->compute_num_verts_ineligible += direct_count, + (sctx->compute_num_verts_ineligible += total_direct_count, false)) && /* Add, then return false. */ (!info->count_from_stream_output || pd_msg("draw_opaque")) && (primitive_restart ? @@ -1958,7 +1981,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i * dispatches can run ahead. */ (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) { - switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) { + switch (si_prepare_prim_discard_or_split_draw(sctx, info, draws, num_draws, + primitive_restart)) { case SI_PRIM_DISCARD_ENABLED: original_index_size = index_size; prim_discard_cs_instancing = instance_count > 1; @@ -1969,13 +1993,13 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i index_size = 4; instance_count = 1; primitive_restart = false; - sctx->compute_num_verts_rejected -= direct_count; - sctx->compute_num_verts_accepted += direct_count; + sctx->compute_num_verts_rejected -= total_direct_count; + sctx->compute_num_verts_accepted += total_direct_count; break; case SI_PRIM_DISCARD_DISABLED: break; case SI_PRIM_DISCARD_DRAW_SPLIT: - sctx->compute_num_verts_rejected -= direct_count; + sctx->compute_num_verts_rejected -= total_direct_count; goto return_cleanup; } } @@ -1989,9 +2013,9 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i struct si_shader_selector *hw_vs; if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES && (hw_vs = si_get_vs(sctx)->cso) && - (direct_count > hw_vs->ngg_cull_vert_threshold || + (avg_direct_count > hw_vs->ngg_cull_vert_threshold || (!index_size && - direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold && + avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold && prim & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))))) { unsigned ngg_culling = 0; @@ -2015,7 +2039,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i /* Use NGG fast launch for certain non-indexed primitive types. * A draw must have at least 1 full primitive. */ - if (ngg_culling && !index_size && direct_count >= 3 && !sctx->tes_shader.cso && + if (ngg_culling && !index_size && min_direct_count >= 3 && !sctx->tes_shader.cso && !sctx->gs_shader.cso) { if (prim == PIPE_PRIM_TRIANGLES) ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; @@ -2048,7 +2072,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (unlikely(sctx->do_update_shaders && !si_update_shaders(sctx))) goto return_cleanup; - si_need_gfx_cs_space(sctx, 0); + si_need_gfx_cs_space(sctx, num_draws); /* If we're using a secure context, determine if cs must be secure or not */ if (unlikely(radeon_uses_secure_bos(sctx->ws))) { @@ -2101,7 +2125,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); /* Emit all states except possibly render condition. */ - si_emit_all_states(sctx, info, prim, instance_count, info->count, + si_emit_all_states(sctx, info, prim, instance_count, min_direct_count, primitive_restart, masked_atoms); sctx->emit_cache_flush(sctx); /* <-- CUs are idle here. */ @@ -2118,7 +2142,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count, + si_emit_draw_packets(sctx, info, draws, num_draws, + indexbuf, index_size, index_offset, instance_count, dispatch_prim_discard_cs, original_index_size); /* <-- CUs are busy here. */ @@ -2138,7 +2163,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) cik_emit_prefetch_L2(sctx, true); - si_emit_all_states(sctx, info, prim, instance_count, info->count, + si_emit_all_states(sctx, info, prim, instance_count, min_direct_count, primitive_restart, masked_atoms); if (gfx9_scissor_bug && @@ -2148,7 +2173,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count, + si_emit_draw_packets(sctx, info, draws, num_draws, + indexbuf, index_size, index_offset, instance_count, dispatch_prim_discard_cs, original_index_size); /* Prefetch the remaining shaders after the draw has been @@ -2189,6 +2215,14 @@ return_cleanup: pipe_resource_reference(&indexbuf, NULL); } +static void si_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info) +{ + struct pipe_draw_start_count draw = {info->start, info->count}; + + si_multi_draw_vbo(ctx, info, &draw, 1); +} + static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso, blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2, float depth, unsigned num_instances, enum blitter_attrib_type type, @@ -2245,6 +2279,7 @@ void si_trace_emit(struct si_context *sctx) void si_init_draw_functions(struct si_context *sctx) { sctx->b.draw_vbo = si_draw_vbo; + sctx->b.multi_draw = si_multi_draw_vbo; sctx->blitter->draw_rectangle = si_draw_rectangle;