mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 11:48:06 +02:00
radeonsi: inline gfx10_emit_streamout_begin/end
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16885>
This commit is contained in:
parent
002e34d860
commit
3f900df071
1 changed file with 87 additions and 130 deletions
|
|
@ -211,70 +211,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
|
|||
sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
|
||||
}
|
||||
|
||||
/* Begin streamout on GFX10/NGG: DMA each bound target's starting
 * buffer-filled size into GDS (4 bytes per target at GDS offset 4*i) so the
 * hardware/shaders can append from there.
 *
 * NOTE(review): recovered from a diff view of the file — this is the
 * pre-inline (deleted) version of the function; verify against upstream.
 */
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned last_target = 0;

   /* Find the highest bound target index: only the final DMA sets CP_SYNC
    * and keeps the write confirm, letting the earlier copies overlap. */
   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (t[i])
         last_target = i;
   }

   radeon_begin(cs);

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      /* Append: resume from the previously saved filled size in memory.
       * Otherwise: restart, sourcing a literal 0 from the packet (va stays 0). */
      bool append = sctx->streamout.append_bitmask & (1 << i);
      uint64_t va = 0;

      if (append) {
         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
                                   RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);

         va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      }

      /* DMA_DATA: copy 4 bytes into GDS — from memory (TC L2) when appending,
       * or the immediate data value 0 when starting fresh. */
      radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                  S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_emit(4 * i); /* destination in GDS */
      radeon_emit(0);
      radeon_emit(S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }
   radeon_end();

   sctx->streamout.begin_emitted = true;
}
|
||||
|
||||
/* End streamout on GFX10/NGG: for each bound target, emit an end-of-pipe
 * release_mem event that writes the final filled size from GDS back into the
 * target's buf_filled_size buffer, then mark that saved size valid so a later
 * begin can append from it.
 *
 * NOTE(review): recovered from a diff view of the file — this is the
 * pre-inline (deleted) version of the function; verify against upstream.
 */
static void gfx10_emit_streamout_end(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      /* Destination address for the saved filled-size value. */
      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
                        t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);

      /* The memory copy of the filled size is now usable for appending. */
      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
|
||||
|
||||
static void si_flush_vgt_streamout(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||
|
|
@ -315,97 +251,121 @@ static void si_emit_streamout_begin(struct si_context *sctx)
|
|||
{
|
||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
uint8_t *stride_in_dw = sctx->streamout.stride_in_dw;
|
||||
unsigned i;
|
||||
|
||||
si_flush_vgt_streamout(sctx);
|
||||
if (!sctx->screen->use_ngg_streamout)
|
||||
si_flush_vgt_streamout(sctx);
|
||||
|
||||
radeon_begin(cs);
|
||||
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
t[i]->stride_in_dw = stride_in_dw[i];
|
||||
t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
|
||||
|
||||
/* AMD GCN binds streamout buffers as shader resources.
|
||||
* VGT only counts primitives and tells the shader
|
||||
* through SGPRs what to do. */
|
||||
radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
|
||||
radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
|
||||
radeon_emit(stride_in_dw[i]); /* VTX_STRIDE (in DW) */
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
bool append = sctx->streamout.append_bitmask & (1 << i);
|
||||
uint64_t va = 0;
|
||||
|
||||
if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
if (append) {
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
|
||||
|
||||
/* Append. */
|
||||
radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(va); /* src address lo */
|
||||
radeon_emit(va >> 32); /* src address hi */
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
}
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
|
||||
radeon_begin(cs);
|
||||
radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
|
||||
radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
|
||||
S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1));
|
||||
radeon_emit(va);
|
||||
radeon_emit(va >> 32);
|
||||
radeon_emit(4 * i); /* destination in GDS */
|
||||
radeon_emit(0);
|
||||
radeon_emit(S_415_BYTE_COUNT_GFX9(4));
|
||||
radeon_end();
|
||||
} else {
|
||||
/* Start from the beginning. */
|
||||
radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
|
||||
radeon_emit(0); /* unused */
|
||||
/* Legacy streamout.
|
||||
*
|
||||
* The hw binds streamout buffers as shader resources. VGT only counts primitives
|
||||
* and tells the shader through SGPRs what to do.
|
||||
*/
|
||||
radeon_begin(cs);
|
||||
radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
|
||||
radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
|
||||
radeon_emit(sctx->streamout.stride_in_dw[i]); /* VTX_STRIDE (in DW) */
|
||||
|
||||
if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
|
||||
/* Append. */
|
||||
radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(va); /* src address lo */
|
||||
radeon_emit(va >> 32); /* src address hi */
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
|
||||
} else {
|
||||
/* Start from the beginning. */
|
||||
radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
|
||||
radeon_emit(0); /* unused */
|
||||
}
|
||||
radeon_end_update_context_roll(sctx);
|
||||
}
|
||||
}
|
||||
radeon_end();
|
||||
|
||||
sctx->streamout.begin_emitted = true;
|
||||
}
|
||||
|
||||
void si_emit_streamout_end(struct si_context *sctx)
|
||||
{
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
gfx10_emit_streamout_end(sctx);
|
||||
return;
|
||||
}
|
||||
|
||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
unsigned i;
|
||||
uint64_t va;
|
||||
|
||||
si_flush_vgt_streamout(sctx);
|
||||
if (!sctx->screen->use_ngg_streamout)
|
||||
si_flush_vgt_streamout(sctx);
|
||||
|
||||
radeon_begin(cs);
|
||||
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
|
||||
STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
|
||||
radeon_emit(va); /* dst address lo */
|
||||
radeon_emit(va >> 32); /* dst address hi */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(0); /* unused */
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
|
||||
RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
/* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
|
||||
si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
|
||||
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
|
||||
t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
|
||||
} else {
|
||||
radeon_begin(cs);
|
||||
radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
|
||||
STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
|
||||
radeon_emit(va); /* dst address lo */
|
||||
radeon_emit(va >> 32); /* dst address hi */
|
||||
radeon_emit(0); /* unused */
|
||||
radeon_emit(0); /* unused */
|
||||
|
||||
/* Zero the buffer size. The counters (primitives generated,
|
||||
* primitives emitted) may be enabled even if there is not
|
||||
* buffer bound. This ensures that the primitives-emitted query
|
||||
* won't increment. */
|
||||
radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
|
||||
/* Zero the buffer size. The counters (primitives generated,
|
||||
* primitives emitted) may be enabled even if there is not
|
||||
* buffer bound. This ensures that the primitives-emitted query
|
||||
* won't increment. */
|
||||
radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
|
||||
radeon_end_update_context_roll(sctx);
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
|
||||
RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
|
||||
}
|
||||
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
radeon_end_update_context_roll(sctx);
|
||||
|
||||
sctx->streamout.begin_emitted = false;
|
||||
}
|
||||
|
|
@ -474,11 +434,8 @@ void si_init_streamout_functions(struct si_context *sctx)
|
|||
sctx->b.create_stream_output_target = si_create_so_target;
|
||||
sctx->b.stream_output_target_destroy = si_so_target_destroy;
|
||||
sctx->b.set_stream_output_targets = si_set_streamout_targets;
|
||||
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
|
||||
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
|
||||
} else {
|
||||
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
|
||||
if (!sctx->screen->use_ngg_streamout)
|
||||
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue