turnip: Implement stream-out emit and vkApis for transform feedback

1. Implement vkCmdBindTransformFeedbackBuffersEXT,
vkCmdBeginTransformFeedbackEXT and vkCmdEndTransformFeedbackEXT.
 - Not handling counter buffers yet.
2. Implement streamout emit function, mostly taken from fd6_emit.c

v2. Replace emit_pkt4 funcs with emit_regs.

v3. Don't copy the state of stream-output from tu_pipeline.

v4. Set VPC_SO_CNTL/VPC_SO_BUF_CNTL to zero in tu6_init_hw.

Signed-off-by: Hyunjun Ko <zzoon@igalia.com>
Reviewed-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3942>
This commit is contained in:
Hyunjun Ko 2020-02-25 10:08:25 +09:00 committed by Marge Bot
parent 374406a7c4
commit 9ff1959ca5
3 changed files with 148 additions and 33 deletions

View file

@ -798,7 +798,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu6_emit_window_offset(cmd, cs, x1, y1);
tu_cs_emit_regs(cs,
A6XX_VPC_SO_OVERRIDE(.so_disable = true));
A6XX_VPC_SO_OVERRIDE(.so_disable = false));
if (use_hw_binning(cmd)) {
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
@ -1144,38 +1144,12 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUFFER_BASE(0),
A6XX_VPC_SO_BUFFER_SIZE(0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_FLUSH_BASE(0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUF_CNTL(0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUFFER_OFFSET(0, 0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUFFER_BASE(1, 0),
A6XX_VPC_SO_BUFFER_SIZE(1, 0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUFFER_OFFSET(1, 0),
A6XX_VPC_SO_FLUSH_BASE(1, 0),
A6XX_VPC_SO_BUFFER_BASE(2, 0),
A6XX_VPC_SO_BUFFER_SIZE(2, 0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUFFER_OFFSET(2, 0),
A6XX_VPC_SO_FLUSH_BASE(2, 0),
A6XX_VPC_SO_BUFFER_BASE(3, 0),
A6XX_VPC_SO_BUFFER_SIZE(3, 0));
tu_cs_emit_regs(cs,
A6XX_VPC_SO_BUFFER_OFFSET(3, 0),
A6XX_VPC_SO_FLUSH_BASE(3, 0));
/* Disable streamout by default. */
tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
tu_cs_emit(cs, 0);
tu_cs_emit_regs(cs,
A6XX_SP_HS_CTRL_REG0(0));
@ -1577,6 +1551,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
if (use_hw_binning(cmd)) {
/* enable stream-out during binning pass: */
tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
tu6_emit_bin_size(cs,
tiling->tile0.extent.width,
tiling->tile0.extent.height,
@ -1586,6 +1563,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_emit_binning_pass(cmd, cs);
/* and disable stream-out for draw pass: */
tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true));
tu6_emit_bin_size(cs,
tiling->tile0.extent.width,
tiling->tile0.extent.height,
@ -1601,6 +1581,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x1);
} else {
/* no binning pass, so enable stream-out for draw pass: */
tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
tu6_emit_bin_size(cs,
tiling->tile0.extent.width,
tiling->tile0.extent.height,
@ -2173,6 +2156,56 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
}
/**
 * vkCmdBindTransformFeedbackBuffersEXT entry point.
 *
 * Records the buffer, offset and size for each transform feedback binding in
 * [firstBinding, firstBinding + bindingCount) in the command buffer state,
 * marks each of those bindings as enabled, and flags the stream-out buffers
 * as dirty so that tu6_emit_streamout() runs from tu6_bind_draw_states().
 *
 * @param commandBuffer  command buffer whose stream-out state is updated
 * @param firstBinding   index of the first binding to update
 * @param bindingCount   number of bindings to update
 * @param pBuffers       bindingCount buffer handles
 * @param pOffsets       bindingCount start offsets
 * @param pSizes         bindingCount sizes, or NULL to use the remainder of
 *                       each buffer past its offset
 */
void
tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                      uint32_t firstBinding,
                                      uint32_t bindingCount,
                                      const VkBuffer *pBuffers,
                                      const VkDeviceSize *pOffsets,
                                      const VkDeviceSize *pSizes)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   assert(firstBinding + bindingCount <= IR3_MAX_SO_BUFFERS);

   for (uint32_t i = 0; i < bindingCount; i++) {
      uint32_t idx = firstBinding + i;
      TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);

      /* A non-zero start offset means the offset register has to be
       * programmed explicitly on the next emit instead of being reloaded
       * from scratch memory.
       */
      if (pOffsets[i] != 0)
         cmd->state.streamout_reset |= 1 << idx;

      cmd->state.streamout_buf.buffers[idx] = buf;
      cmd->state.streamout_buf.offsets[idx] = pOffsets[i];

      /* pSizes is an optional array: when the application passes NULL the
       * capture range is the rest of the buffer past the offset.
       * Dereferencing it unconditionally would crash on a NULL pSizes.
       * A per-element VK_WHOLE_SIZE is still stored as-is.
       */
      cmd->state.streamout_buf.sizes[idx] =
         pSizes ? pSizes[i] : buf->size - pOffsets[i];

      cmd->state.streamout_enabled |= 1 << idx;
   }

   cmd->state.dirty |= TU_CMD_DIRTY_STREAMOUT_BUFFERS;
}
/**
 * vkCmdBeginTransformFeedbackEXT entry point.
 *
 * Counter buffers are not handled yet, so this only checks that the
 * requested counter-buffer range fits within IR3_MAX_SO_BUFFERS; no command
 * buffer state is modified here.
 */
void
tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount,
                                const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   /* TODO do something with counter buffer? */
   assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
}
/**
 * vkCmdEndTransformFeedbackEXT entry point.
 *
 * Checks that the counter-buffer range fits within IR3_MAX_SO_BUFFERS
 * (counter buffers are otherwise ignored for now) and clears the mask of
 * enabled stream-out buffers in the command buffer state.
 *
 * NOTE(review): TU_CMD_DIRTY_STREAMOUT_BUFFERS is not set here, so the
 * "disabled" path in tu6_emit_streamout() is not triggered by this call
 * alone -- confirm that is intended.
 */
void
tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                              uint32_t firstCounterBuffer,
                              uint32_t counterBufferCount,
                              const VkBuffer *pCounterBuffers,
                              const VkDeviceSize *pCounterBufferOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* TODO do something with counter buffer? */
   assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);

   /* No stream-out buffer is considered active past this point. */
   cmd->state.streamout_enabled = 0;
}
void
tu_CmdPushConstants(VkCommandBuffer commandBuffer,
VkPipelineLayout layout,
@ -3374,6 +3407,67 @@ tu6_emit_border_color(struct tu_cmd_buffer *cmd,
return VK_SUCCESS;
}
static void
tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
struct tu_streamout_state *tf = &cmd->state.pipeline->streamout;
for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
if (!buf)
continue;
uint32_t offset;
offset = cmd->state.streamout_buf.offsets[i];
tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_BASE(i, .bo = buf->bo,
.bo_offset = buf->bo_offset));
tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_SIZE(i, buf->size));
if (cmd->state.streamout_reset & (1 << i)) {
offset *= tf->stride[i];
tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, offset));
cmd->state.streamout_reset &= ~(1 << i);
} else {
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
CP_MEM_TO_REG_0_CNT(0));
tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_FLUSH * (i + 1));
}
tu_cs_emit_regs(cs, A6XX_VPC_SO_FLUSH_BASE(i, .bo = &cmd->scratch_bo,
.bo_offset = VSC_FLUSH * (i + 1)));
}
if (cmd->state.streamout_enabled) {
tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
tu_cs_emit(cs, tf->vpc_so_buf_cntl);
tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(0));
tu_cs_emit(cs, tf->ncomp[0]);
tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(1));
tu_cs_emit(cs, tf->ncomp[1]);
tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(2));
tu_cs_emit(cs, tf->ncomp[2]);
tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(3));
tu_cs_emit(cs, tf->ncomp[3]);
tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE);
for (unsigned i = 0; i < tf->prog_count; i++) {
tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
tu_cs_emit(cs, tf->prog[i]);
}
} else {
tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
tu_cs_emit(cs, 0);
}
}
static VkResult
tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@ -3505,6 +3599,9 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
};
}
if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
tu6_emit_streamout(cmd, cs);
if (cmd->state.dirty &
(TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
bool needs_border = false;
@ -3623,6 +3720,15 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
}
}
}
if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
const struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
if (buf) {
tu_bo_list_add(&cmd->bo_list, buf->bo,
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
}
}
}
/* Fragment shader state overwrites compute shader state, so flag the
* compute pipeline for re-emit.
@ -3742,6 +3848,13 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
else
tu6_emit_draw_direct(cmd, cs, draw);
if (cmd->state.streamout_enabled) {
for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
if (cmd->state.streamout_enabled & (1 << i))
tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i, false);
}
}
cmd->wait_for_idle = true;
tu_cs_sanity_check(cs);

View file

@ -77,6 +77,7 @@ EXTENSIONS = [
Extension('VK_KHR_external_memory_fd', 1, True),
Extension('VK_EXT_external_memory_dma_buf', 1, True),
Extension('VK_EXT_image_drm_format_modifier', 1, False),
Extension('VK_EXT_transform_feedback', 1, False),
]
class VkVersion:

View file

@ -984,6 +984,7 @@ struct tu_cmd_buffer
uint32_t scratch_seqno;
#define VSC_OVERFLOW 0x8
#define VSC_SCRATCH 0x10
#define VSC_FLUSH 0x20
struct tu_bo vsc_data;
struct tu_bo vsc_data2;