mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 04:48:08 +02:00
radeonsi: add optimal multi draws and draw-level splitting for prim discard CS
This is a partial rewrite of some parts of the code. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11510>
This commit is contained in:
parent
81365c86f9
commit
b141e50282
4 changed files with 208 additions and 139 deletions
|
|
@ -95,23 +95,24 @@
|
|||
* - Bindless textures and images must not occur in the vertex shader.
|
||||
*
|
||||
* User data SGPR layout:
|
||||
* VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader.
|
||||
* START_OUT_INDEX: output index buffer offset / 12
|
||||
* START_IN_INDEX: input index buffer offset / index_size
|
||||
* VS.BASE_VERTEX: same value as VS
|
||||
* INDEX_BUFFERS: pointer to constants
|
||||
* 0..3: input index buffer - typed buffer view
|
||||
* 4..7: output index buffer - typed buffer view
|
||||
* 8..11: viewport state - scale.xy, translate.xy
|
||||
* VERTEX_COUNTER: address of "count" in the draw packet incremented
|
||||
* atomically by the shader.
|
||||
* VS.VERTEX_BUFFERS: same value as VS
|
||||
* VS.CONST_AND_SHADER_BUFFERS: same value as VS
|
||||
* VS.SAMPLERS_AND_IMAGES: same value as VS
|
||||
* VS.BASE_VERTEX: same value as VS
|
||||
* VS.START_INSTANCE: same value as VS
|
||||
* SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
|
||||
* NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
|
||||
* per instance for instancing.
|
||||
* NUM_PRIMS_UDIV_TERMS:
|
||||
* - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
|
||||
* - Bits [5:31]: The number of primitives per instance for computing the remainder.
|
||||
* SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
|
||||
*
|
||||
* How to test primitive restart (the most complicated part because it needs
|
||||
* to get the primitive orientation right):
|
||||
|
|
@ -243,24 +244,26 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
|
|||
memset(&ctx->args, 0, sizeof(ctx->args));
|
||||
|
||||
struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
|
||||
struct ac_arg param_vb_desc, param_const_desc;
|
||||
struct ac_arg param_base_vertex, param_start_instance;
|
||||
struct ac_arg param_block_id, param_local_id;
|
||||
struct ac_arg param_smallprim_precision;
|
||||
struct ac_arg param_vb_desc, param_const_desc, param_start_out_index;
|
||||
struct ac_arg param_base_vertex, param_start_instance, param_start_in_index;
|
||||
struct ac_arg param_block_id, param_local_id, param_smallprim_precision;
|
||||
struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
|
||||
struct ac_arg param_sampler_desc;
|
||||
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
|
||||
&param_index_buffers_and_constants);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_out_index);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_in_index);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_index_buffers_and_constants);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
|
||||
if (key->opt.cs_instancing) {
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
|
||||
}
|
||||
|
||||
/* Block ID and thread ID inputs. */
|
||||
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
|
||||
|
|
@ -358,6 +361,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
|
|||
/* Fetch indices. */
|
||||
if (key->opt.cs_indexed) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), "");
|
||||
index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
|
||||
1, 0, true, false, false);
|
||||
index[i] = ac_to_integer(&ctx->ac, index[i]);
|
||||
|
|
@ -473,6 +477,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
|
|||
|
||||
/* Write indices for accepted primitives. */
|
||||
LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
|
||||
vindex = LLVMBuildAdd(builder, vindex, ac_get_arg(&ctx->ac, param_start_out_index), "");
|
||||
LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
|
||||
|
||||
if (!ac_has_vec3_support(ctx->ac.chip_class, true))
|
||||
|
|
@ -562,6 +567,8 @@ static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_s
|
|||
sctx->index_ring_size_per_ib;
|
||||
}
|
||||
|
||||
#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7)
|
||||
|
||||
enum si_prim_discard_outcome
|
||||
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
|
||||
unsigned drawid_offset,
|
||||
|
|
@ -680,12 +687,21 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
|
|||
return SI_PRIM_DISCARD_DISABLED;
|
||||
}
|
||||
|
||||
unsigned num_subdraws = DIV_ROUND_UP(num_prims, PRIMS_PER_BATCH) * num_draws;
|
||||
unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
|
||||
24 * (num_subdraws - 1) + /* subdraws */
|
||||
30; /* leave some space at the end */
|
||||
unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0) +
|
||||
num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
|
||||
/* Compute how many CS dwords we need to reserve. */
|
||||
unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE +
|
||||
11 /* shader */ +
|
||||
30; /* leave some space at the end */
|
||||
unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);
|
||||
|
||||
for (unsigned i = 0; i < num_draws; i++) {
|
||||
unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH);
|
||||
|
||||
need_compute_dw += 8 * num_subdraws + /* signal REWIND */
|
||||
14 /* user SGPRs */ +
|
||||
4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */
|
||||
11 * num_subdraws;
|
||||
need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
|
||||
}
|
||||
|
||||
if (ring_full ||
|
||||
!sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
|
||||
|
|
@ -708,6 +724,7 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
|
|||
ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
|
||||
assert(compute_has_space);
|
||||
assert(si_check_ring_space(sctx, out_indexbuf_size));
|
||||
assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw);
|
||||
return SI_PRIM_DISCARD_ENABLED;
|
||||
}
|
||||
|
||||
|
|
@ -745,22 +762,29 @@ void si_compute_signal_gfx(struct si_context *sctx)
|
|||
/* Dispatch a primitive discard compute shader. */
|
||||
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
||||
const struct pipe_draw_info *info,
|
||||
unsigned count, unsigned index_size,
|
||||
unsigned base_vertex, uint64_t input_indexbuf_va,
|
||||
unsigned input_indexbuf_num_elements)
|
||||
const struct pipe_draw_start_count_bias *draws,
|
||||
unsigned num_draws, unsigned index_size,
|
||||
unsigned total_count, uint64_t input_indexbuf_va,
|
||||
unsigned index_max_size)
|
||||
{
|
||||
struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
|
||||
struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
|
||||
unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count);
|
||||
if (!num_prims_per_instance)
|
||||
return;
|
||||
|
||||
unsigned num_prims = num_prims_per_instance * info->instance_count;
|
||||
unsigned num_total_prims;
|
||||
unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
|
||||
|
||||
if (!info->instance_count)
|
||||
return;
|
||||
|
||||
switch (info->mode) {
|
||||
case PIPE_PRIM_TRIANGLES:
|
||||
case PIPE_PRIM_TRIANGLE_STRIP:
|
||||
if (info->mode == PIPE_PRIM_TRIANGLES)
|
||||
num_total_prims = total_count / 3;
|
||||
else if (total_count >= 2)
|
||||
num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */
|
||||
else
|
||||
num_total_prims = 0;
|
||||
|
||||
vertices_per_prim = 3;
|
||||
output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
|
||||
gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
|
||||
|
|
@ -770,8 +794,13 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
return;
|
||||
}
|
||||
|
||||
if (!num_total_prims)
|
||||
return;
|
||||
|
||||
num_total_prims *= info->instance_count;
|
||||
|
||||
unsigned out_indexbuf_offset;
|
||||
uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
|
||||
uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4;
|
||||
|
||||
/* Initialize the compute IB if it's empty. */
|
||||
if (!sctx->prim_discard_compute_ib_initialized) {
|
||||
|
|
@ -789,7 +818,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
/* This needs to be done at the beginning of IBs due to possible
|
||||
* TTM buffer moves in the kernel.
|
||||
*/
|
||||
if (sctx->chip_class >= GFX10) {
|
||||
if (sctx->chip_class >= GFX10) { /* 8 DW */
|
||||
radeon_begin(cs);
|
||||
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
||||
radeon_emit(cs, 0); /* CP_COHER_CNTL */
|
||||
|
|
@ -811,9 +840,9 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
S_0085F0_SH_KCACHE_ACTION_ENA(1));
|
||||
}
|
||||
|
||||
si_emit_initial_compute_regs(sctx, cs);
|
||||
si_emit_initial_compute_regs(sctx, cs); /* 39 DW */
|
||||
|
||||
radeon_begin(cs);
|
||||
radeon_begin(cs); /* 11 DW */
|
||||
radeon_set_sh_reg(
|
||||
cs, R_00B860_COMPUTE_TMPRING_SIZE,
|
||||
S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
|
||||
|
|
@ -832,12 +861,13 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
assert(!sctx->last_ib_barrier_fence);
|
||||
radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
|
||||
RADEON_PRIO_FENCE);
|
||||
si_cp_wait_mem(sctx, cs,
|
||||
si_cp_wait_mem(sctx, cs, /* 7 DW */
|
||||
sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
|
||||
1, 1, WAIT_REG_MEM_EQUAL);
|
||||
}
|
||||
|
||||
sctx->prim_discard_compute_ib_initialized = true;
|
||||
assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE);
|
||||
}
|
||||
|
||||
/* Allocate the output index buffer. */
|
||||
|
|
@ -864,7 +894,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
/* Input index buffer. */
|
||||
desc[0] = input_indexbuf_va;
|
||||
desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
|
||||
desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
|
||||
desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1);
|
||||
|
||||
if (sctx->chip_class >= GFX10) {
|
||||
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
|
||||
|
|
@ -885,7 +915,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
desc[4] = out_indexbuf_va;
|
||||
desc[5] =
|
||||
S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
|
||||
desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
|
||||
desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
|
||||
|
||||
if (sctx->chip_class >= GFX10) {
|
||||
desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
||||
|
|
@ -911,7 +941,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
|
||||
/* Set user data SGPRs. */
|
||||
/* This can't be >= 16 if we want the fastest launch rate. */
|
||||
unsigned user_sgprs = 10;
|
||||
unsigned user_sgprs = info->instance_count > 1 ? 12 : 10;
|
||||
|
||||
uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
|
||||
unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
|
||||
|
|
@ -921,18 +951,9 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
uint64_t vb_desc_va = sctx->vb_descriptors_buffer
|
||||
? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
|
||||
: 0;
|
||||
struct si_fast_udiv_info32 num_prims_udiv = {};
|
||||
|
||||
if (info->instance_count > 1)
|
||||
num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
|
||||
|
||||
/* Limitations on how these two are packed in the user SGPR. */
|
||||
assert(num_prims_udiv.post_shift < 32);
|
||||
assert(num_prims_per_instance < 1 << 27);
|
||||
|
||||
si_resource_reference(&indexbuf_desc, NULL);
|
||||
|
||||
/* Set shader registers. */
|
||||
/* Set the compute shader. */
|
||||
struct si_shader *shader = sctx->cs_prim_discard_state.current;
|
||||
|
||||
if (shader != sctx->compute_ib_last_shader) {
|
||||
|
|
@ -969,91 +990,141 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
|
||||
STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0);
|
||||
|
||||
/* Big draw calls are split into smaller dispatches and draw packets. */
|
||||
for (unsigned start_prim = 0; start_prim < num_prims; start_prim = num_prims /* implement splitting */) {
|
||||
unsigned num_subdraw_prims;
|
||||
struct si_fast_udiv_info32 num_prims_udiv = {};
|
||||
|
||||
if (start_prim + PRIMS_PER_BATCH < num_prims)
|
||||
num_subdraw_prims = PRIMS_PER_BATCH;
|
||||
for (unsigned i = 0; i < num_draws; i++) {
|
||||
unsigned count = draws[i].count;
|
||||
unsigned num_prims_per_instance, num_prims;
|
||||
|
||||
/* Determine the number of primitives per instance. */
|
||||
if (info->mode == PIPE_PRIM_TRIANGLES)
|
||||
num_prims_per_instance = count / 3;
|
||||
else if (count >= 2)
|
||||
num_prims_per_instance = count - 2;
|
||||
else
|
||||
num_subdraw_prims = num_prims - start_prim;
|
||||
num_prims_per_instance = 0;
|
||||
|
||||
/* Small dispatches are executed back to back until a specific primitive
|
||||
* count is reached. Then, a CS_DONE is inserted to signal the gfx IB
|
||||
* to start drawing the batch. This batching adds latency to the gfx IB,
|
||||
* but CS_DONE and REWIND are too slow.
|
||||
*/
|
||||
if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
|
||||
si_compute_signal_gfx(sctx);
|
||||
if (!num_prims_per_instance)
|
||||
continue;
|
||||
|
||||
if (sctx->compute_num_prims_in_batch == 0) {
|
||||
assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
|
||||
sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
|
||||
num_prims = num_prims_per_instance;
|
||||
|
||||
if (info->instance_count > 1) {
|
||||
num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
|
||||
num_prims *= info->instance_count;
|
||||
}
|
||||
|
||||
/* Limitations on how these two are packed in the user SGPR. */
|
||||
assert(num_prims_udiv.post_shift < 32);
|
||||
assert(num_prims_per_instance < 1 << 27);
|
||||
|
||||
/* Big draw calls are split into smaller dispatches and draw packets. */
|
||||
for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) {
|
||||
unsigned num_subdraw_prims;
|
||||
|
||||
if (start_prim + PRIMS_PER_BATCH < num_prims) {
|
||||
num_subdraw_prims = PRIMS_PER_BATCH;
|
||||
} else {
|
||||
num_subdraw_prims = num_prims - start_prim;
|
||||
}
|
||||
|
||||
/* Small dispatches are executed back to back until a specific primitive
|
||||
* count is reached. Then, a CS_DONE is inserted to signal the gfx IB
|
||||
* to start drawing the batch. This batching adds latency to the gfx IB,
|
||||
* but CS_DONE and REWIND are too slow.
|
||||
*/
|
||||
if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
|
||||
si_compute_signal_gfx(sctx);
|
||||
|
||||
if (sctx->compute_num_prims_in_batch == 0) {
|
||||
assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
|
||||
sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
|
||||
|
||||
radeon_begin(gfx_cs);
|
||||
radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
|
||||
radeon_emit(gfx_cs, 0);
|
||||
radeon_end();
|
||||
}
|
||||
|
||||
sctx->compute_num_prims_in_batch += num_subdraw_prims;
|
||||
|
||||
uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
|
||||
uint64_t index_va = out_indexbuf_va + start_prim * 12;
|
||||
|
||||
/* Emit the draw packet into the gfx IB. */
|
||||
radeon_begin(gfx_cs);
|
||||
radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
|
||||
radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
|
||||
radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim);
|
||||
radeon_emit(gfx_cs, index_va);
|
||||
radeon_emit(gfx_cs, index_va >> 32);
|
||||
radeon_emit(gfx_cs, 0);
|
||||
radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
|
||||
radeon_end();
|
||||
|
||||
radeon_begin_again(cs);
|
||||
|
||||
/* Continue with the compute IB. */
|
||||
if (start_prim == 0) {
|
||||
if (i == 0) {
|
||||
/* First draw. */
|
||||
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
|
||||
radeon_emit(cs, count_va);
|
||||
radeon_emit(cs, start_prim);
|
||||
radeon_emit(cs, draws[i].start);
|
||||
radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
|
||||
radeon_emit(cs, index_buffers_va);
|
||||
radeon_emit(cs, vb_desc_va);
|
||||
radeon_emit(cs, vs_const_desc_va);
|
||||
radeon_emit(cs, vs_sampler_desc_va);
|
||||
radeon_emit(cs, info->start_instance);
|
||||
/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
|
||||
radeon_emit(cs, fui(cull_info.small_prim_precision));
|
||||
|
||||
if (info->instance_count > 1) {
|
||||
radeon_emit(cs, num_prims_udiv.multiplier);
|
||||
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
|
||||
}
|
||||
} else {
|
||||
/* Subsequent draws. */
|
||||
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4);
|
||||
radeon_emit(cs, count_va);
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, draws[i].start);
|
||||
radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
|
||||
|
||||
if (info->instance_count > 1) {
|
||||
radeon_set_sh_reg_seq(cs, R_00B928_COMPUTE_USER_DATA_10, 2);
|
||||
radeon_emit(cs, num_prims_udiv.multiplier);
|
||||
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Draw split. Only update the SGPRs that changed. */
|
||||
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
|
||||
radeon_emit(cs, count_va);
|
||||
radeon_emit(cs, start_prim);
|
||||
}
|
||||
|
||||
/* Set grid dimensions. */
|
||||
unsigned start_block = start_prim / THREADGROUP_SIZE;
|
||||
unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
|
||||
unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
|
||||
|
||||
radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
|
||||
radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
|
||||
S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
|
||||
S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
|
||||
radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
|
||||
radeon_emit(cs, 1);
|
||||
radeon_emit(cs, 1);
|
||||
radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
|
||||
S_00B800_ORDER_MODE(0 /* launch in order */));
|
||||
radeon_end();
|
||||
|
||||
assert(cs->current.cdw <= cs->current.max_dw);
|
||||
assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
|
||||
}
|
||||
|
||||
sctx->compute_num_prims_in_batch += num_subdraw_prims;
|
||||
|
||||
uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
|
||||
uint64_t index_va = out_indexbuf_va + start_prim * 12;
|
||||
|
||||
/* Emit the draw packet into the gfx IB. */
|
||||
radeon_begin(gfx_cs);
|
||||
radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
|
||||
radeon_emit(gfx_cs, num_prims * vertices_per_prim);
|
||||
radeon_emit(gfx_cs, index_va);
|
||||
radeon_emit(gfx_cs, index_va >> 32);
|
||||
radeon_emit(gfx_cs, 0);
|
||||
radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
|
||||
radeon_end();
|
||||
|
||||
radeon_begin_again(cs);
|
||||
|
||||
/* Continue with the compute IB. */
|
||||
if (start_prim == 0) {
|
||||
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
|
||||
radeon_emit(cs, index_buffers_va);
|
||||
radeon_emit(cs, count_va);
|
||||
radeon_emit(cs, vb_desc_va);
|
||||
radeon_emit(cs, vs_const_desc_va);
|
||||
radeon_emit(cs, vs_sampler_desc_va);
|
||||
radeon_emit(cs, base_vertex);
|
||||
radeon_emit(cs, info->start_instance);
|
||||
radeon_emit(cs, num_prims_udiv.multiplier);
|
||||
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
|
||||
/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
|
||||
radeon_emit(cs, fui(cull_info.small_prim_precision));
|
||||
} else {
|
||||
#if 0 /* TODO: draw splitting could be enabled */
|
||||
/* Only update the SGPRs that changed. */
|
||||
radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 1);
|
||||
radeon_emit(cs, count_va);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Set grid dimensions. */
|
||||
unsigned start_block = start_prim / THREADGROUP_SIZE;
|
||||
unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
|
||||
unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
|
||||
|
||||
radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
|
||||
radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
|
||||
S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
|
||||
S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
|
||||
radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
|
||||
radeon_emit(cs, 1);
|
||||
radeon_emit(cs, 1);
|
||||
radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
|
||||
S_00B800_ORDER_MODE(0 /* launch in order */));
|
||||
radeon_end();
|
||||
|
||||
assert(cs->current.cdw <= cs->current.max_dw);
|
||||
assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1518,9 +1518,10 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
|
|||
void si_compute_signal_gfx(struct si_context *sctx);
|
||||
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
||||
const struct pipe_draw_info *info,
|
||||
unsigned count, unsigned index_size,
|
||||
unsigned base_vertex, uint64_t input_indexbuf_va,
|
||||
unsigned input_indexbuf_max_elements);
|
||||
const struct pipe_draw_start_count_bias *draws,
|
||||
unsigned num_draws, unsigned index_size,
|
||||
unsigned total_count, uint64_t input_indexbuf_va,
|
||||
unsigned index_max_size);
|
||||
void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
|
||||
unsigned *prim_discard_vertex_count_threshold,
|
||||
unsigned *index_ring_size_per_ib);
|
||||
|
|
|
|||
|
|
@ -1292,7 +1292,8 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
}
|
||||
|
||||
/* Make sure the input is a pointer and not integer followed by inttoptr. */
|
||||
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
|
||||
if (!shader->key.opt.vs_as_prim_discard_cs)
|
||||
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
|
||||
|
||||
/* Compile to bytecode. */
|
||||
if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
|
||||
|
|
|
|||
|
|
@ -978,7 +978,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|||
unsigned drawid_base,
|
||||
const struct pipe_draw_indirect_info *indirect,
|
||||
const struct pipe_draw_start_count_bias *draws,
|
||||
unsigned num_draws,
|
||||
unsigned num_draws, unsigned total_count,
|
||||
struct pipe_resource *indexbuf, unsigned index_size,
|
||||
unsigned index_offset, unsigned instance_count,
|
||||
bool dispatch_prim_discard_cs, unsigned original_index_size)
|
||||
|
|
@ -1194,13 +1194,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|||
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
|
||||
radeon_end();
|
||||
|
||||
for (unsigned i = 0; i < num_draws; i++) {
|
||||
uint64_t va = index_va + draws[i].start * original_index_size;
|
||||
|
||||
si_dispatch_prim_discard_cs_and_draw(sctx, info, draws[i].count,
|
||||
original_index_size, base_vertex,
|
||||
va, MIN2(index_max_size, draws[i].count));
|
||||
}
|
||||
si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws,
|
||||
original_index_size, total_count, index_va,
|
||||
index_max_size);
|
||||
EMIT_SQTT_END_DRAW;
|
||||
return;
|
||||
}
|
||||
|
|
@ -2172,8 +2168,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|||
assert(sctx->dirty_atoms == 0);
|
||||
|
||||
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
|
||||
(sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf, index_size,
|
||||
index_offset, instance_count, dispatch_prim_discard_cs,
|
||||
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
|
||||
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
|
||||
original_index_size);
|
||||
/* <-- CUs are busy here. */
|
||||
|
||||
|
|
@ -2211,9 +2207,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|||
assert(sctx->dirty_atoms == 0);
|
||||
|
||||
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
|
||||
(sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf, index_size,
|
||||
index_offset, instance_count,
|
||||
dispatch_prim_discard_cs, original_index_size);
|
||||
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
|
||||
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
|
||||
original_index_size);
|
||||
|
||||
/* Prefetch the remaining shaders after the draw has been
|
||||
* started. */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue