diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index d17b990f491..9b729a562f5 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -517,7 +517,7 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_
    tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
    tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
 
-   tu6_emit_vpc(cs, &vs, NULL, NULL, gs, &fs, NULL);
+   tu6_emit_vpc(cs, &vs, NULL, NULL, gs, &fs);
 
    /* REPL_MODE for varying with RECTLIST (2 vertices only) */
    tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 41d6bfcdedd..e3f625d583b 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -708,6 +708,13 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
 {
    const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
 
+   /* XFB commands are emitted for BINNING || SYSMEM, which makes it
+    * incompatible with non-hw binning GMEM rendering. This is required
+    * because some of the XFB commands need to only be executed once.
+    */
+   if (cmd->state.xfb_used)
+      return true;
+
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
       return false;
 
@@ -989,13 +996,6 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
    tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
 
-   /* Set not to use streamout by default, */
-   tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
-   tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
-   tu_cs_emit(cs, 0);
-   tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
-   tu_cs_emit(cs, 0);
-
    tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(0));
 
@@ -1441,6 +1441,9 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
 {
    const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
 
+   if (use_hw_binning(cmd))
+      cmd->use_vsc_data = true;
+
    tu6_tile_render_begin(cmd, &cmd->cs);
 
    for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
@@ -1925,33 +1928,86 @@ void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                            const VkDeviceSize *pSizes)
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   assert(firstBinding + bindingCount <= IR3_MAX_SO_BUFFERS);
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   /* using COND_REG_EXEC for xfb commands matches the blob behavior;
+    * presumably there isn't any benefit to using a draw state when the
+    * condition is (SYSMEM | BINNING)
+    */
+   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                          CP_COND_REG_EXEC_0_SYSMEM |
+                          CP_COND_REG_EXEC_0_BINNING);
 
    for (uint32_t i = 0; i < bindingCount; i++) {
-      uint32_t idx = firstBinding + i;
       TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
+      uint64_t iova = buf->bo->iova + pOffsets[i];
+      uint32_t size = buf->bo->size - pOffsets[i];
+      uint32_t idx = i + firstBinding;
 
-      if (pOffsets[i] != 0)
-         cmd->state.streamout_reset |= 1 << idx;
+      if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
+         size = pSizes[i];
 
-      cmd->state.streamout_buf.buffers[idx] = buf;
-      cmd->state.streamout_buf.offsets[idx] = pOffsets[i];
-      cmd->state.streamout_buf.sizes[idx] = pSizes[i];
+      /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
+      uint32_t offset = iova & 0x1f;
+      iova &= ~(uint64_t) 0x1f;
 
-      cmd->state.streamout_enabled |= 1 << idx;
+      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
+      tu_cs_emit_qw(cs, iova);
+      tu_cs_emit(cs, size + offset);
+
+      cmd->state.streamout_offset[idx] = offset;
+
+      tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_WRITE);
    }
 
-   cmd->state.dirty |= TU_CMD_DIRTY_STREAMOUT_BUFFERS;
+   tu_cond_exec_end(cs);
 }
 
-void tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
-                                         uint32_t firstCounterBuffer,
-                                         uint32_t counterBufferCount,
-                                         const VkBuffer *pCounterBuffers,
-                                         const VkDeviceSize *pCounterBufferOffsets)
+void
+tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                                uint32_t firstCounterBuffer,
+                                uint32_t counterBufferCount,
+                                const VkBuffer *pCounterBuffers,
+                                const VkDeviceSize *pCounterBufferOffsets)
 {
-   assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
-
-   /* TODO do something with counter buffer? */
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                          CP_COND_REG_EXEC_0_SYSMEM |
+                          CP_COND_REG_EXEC_0_BINNING);
+
+   /* TODO: only update offset for active buffers */
+   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
+
+   for (uint32_t i = 0; i < counterBufferCount; i++) {
+      uint32_t idx = firstCounterBuffer + i;
+      uint32_t offset = cmd->state.streamout_offset[idx];
+
+      if (!pCounterBuffers[i])
+         continue;
+
+      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
+
+      tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
+
+      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
+      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
+                     CP_MEM_TO_REG_0_UNK31 |
+                     CP_MEM_TO_REG_0_CNT(1));
+      tu_cs_emit_qw(cs, buf->bo->iova + pCounterBufferOffsets[i]);
+
+      if (offset) {
+         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
+         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
+                        CP_REG_RMW_0_SRC1_ADD);
+         tu_cs_emit_qw(cs, 0xffffffff);
+         tu_cs_emit_qw(cs, offset);
+      }
+   }
+
+   tu_cond_exec_end(cs);
 }
 
 void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
@@ -1960,11 +2016,58 @@ void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                    const VkBuffer *pCounterBuffers,
                                    const VkDeviceSize *pCounterBufferOffsets)
 {
-   assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
-
-   /* TODO do something with counter buffer? */
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   cmd->state.streamout_enabled = 0;
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                          CP_COND_REG_EXEC_0_SYSMEM |
+                          CP_COND_REG_EXEC_0_BINNING);
+
+   /* TODO: only flush buffers that need to be flushed */
+   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+      /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
+      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
+      tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[i]));
+      tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
+   }
+
+   for (uint32_t i = 0; i < counterBufferCount; i++) {
+      uint32_t idx = firstCounterBuffer + i;
+      uint32_t offset = cmd->state.streamout_offset[idx];
+
+      if (!pCounterBuffers[i])
+         continue;
+
+      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
+
+      tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_WRITE);
+
+      /* VPC_SO_FLUSH_BASE has a dword counter, but the counter should be in bytes */
+      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
+      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
+                     CP_MEM_TO_REG_0_SHIFT_BY_2 |
+                     0x40000 | /* ??? */
+                     CP_MEM_TO_REG_0_UNK31 |
+                     CP_MEM_TO_REG_0_CNT(1));
+      tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[idx]));
+
+      if (offset) {
+         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
+         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
+                        CP_REG_RMW_0_SRC1_ADD);
+         tu_cs_emit_qw(cs, 0xffffffff);
+         tu_cs_emit_qw(cs, -offset);
+      }
+
+      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
+      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
+                     CP_REG_TO_MEM_0_CNT(1));
+      tu_cs_emit_qw(cs, buf->bo->iova + pCounterBufferOffsets[i]);
+   }
+
+   tu_cond_exec_end(cs);
+
+   cmd->state.xfb_used = true;
 }
 
 void
@@ -2694,10 +2797,6 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
 
    tu_set_input_attachments(cmd, cmd->state.subpass);
 
-   /* note: use_hw_binning only checks tiling config */
-   if (use_hw_binning(cmd))
-      cmd->use_vsc_data = true;
-
    for (uint32_t i = 0; i < fb->attachment_count; ++i) {
       const struct tu_image_view *iview = fb->attachments[i].attachment;
       tu_bo_list_add(&cmd->bo_list, iview->image->bo,
@@ -2972,67 +3071,6 @@ tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
    return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
 }
 
-static void
-tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
-   struct tu_streamout_state *tf = &cmd->state.pipeline->streamout;
-
-   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
-      struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
-      if (!buf)
-         continue;
-
-      uint32_t offset;
-      offset = cmd->state.streamout_buf.offsets[i];
-
-      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_BASE(i, .bo = buf->bo,
-                                                     .bo_offset = buf->bo_offset));
-      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_SIZE(i, buf->size));
-
-      if (cmd->state.streamout_reset & (1 << i)) {
-         tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, offset));
-         cmd->state.streamout_reset &= ~(1 << i);
-      } else {
-         tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
-         tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
-                        CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
-                        CP_MEM_TO_REG_0_CNT(0));
-         tu_cs_emit_qw(cs, cmd->scratch_bo.iova +
-                           ctrl_offset(flush_base[i].offset));
-      }
-
-      tu_cs_emit_regs(cs, A6XX_VPC_SO_FLUSH_BASE(i, .bo = &cmd->scratch_bo,
-                                                    .bo_offset =
-                                                    ctrl_offset(flush_base[i])));
-   }
-
-   if (cmd->state.streamout_enabled) {
-      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
-      tu_cs_emit(cs, tf->vpc_so_buf_cntl);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(0));
-      tu_cs_emit(cs, tf->ncomp[0]);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(1));
-      tu_cs_emit(cs, tf->ncomp[1]);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(2));
-      tu_cs_emit(cs, tf->ncomp[2]);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(3));
-      tu_cs_emit(cs, tf->ncomp[3]);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
-      tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE);
-      for (unsigned i = 0; i < tf->prog_count; i++) {
-         tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
-         tu_cs_emit(cs, tf->prog[i]);
-      }
-   } else {
-      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
-      tu_cs_emit(cs, 0);
-      tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
-      tu_cs_emit(cs, 0);
-   }
-}
-
 static uint64_t
 get_tess_param_bo_size(const struct tu_pipeline *pipeline,
                        const struct tu_draw_info *draw_info)
@@ -3180,9 +3218,6 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
       tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
    }
 
-   if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
-      tu6_emit_streamout(cmd, cs);
-
   if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
      /* We need to reload the descriptors every time the descriptor sets
      * change. However, the commands we send only depend on the pipeline
@@ -3300,17 +3335,6 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
 
    tu_cs_sanity_check(cs);
 
-   /* track BOs */
-   if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
-      for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
-         const struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
-         if (buf) {
-            tu_bo_list_add(&cmd->bo_list, buf->bo,
-                           MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
-         }
-      }
-   }
-
    /* There are too many graphics dirty bits to list here, so just list the
     * bits to preserve instead. The only things not emitted here are
     * compute-related state.
@@ -3470,13 +3494,6 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
    else
       tu6_emit_draw_direct(cmd, cs, draw);
 
-   if (cmd->state.streamout_enabled) {
-      for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
-         if (cmd->state.streamout_enabled & (1 << i))
-            tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
-      }
-   }
-
    tu_cs_sanity_check(cs);
 }
 
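Not part of the patch: a minimal CPU-side sketch of the counter bookkeeping that the reworked tu_CmdBindTransformFeedbackBuffersEXT/Begin/End paths above implement with CP_MEM_TO_REG, CP_REG_RMW and CP_REG_TO_MEM. The struct and helper names here are hypothetical; only the arithmetic (32-byte aligned BUFFER_BASE, byte counts in the counter buffer, dword counts written back by FLUSH_SO) mirrors the code being added.

#include <stdint.h>

struct xfb_binding {
   uint64_t base;    /* 32-byte aligned value programmed into VPC_SO_BUFFER_BASE */
   uint32_t size;    /* value programmed into VPC_SO_BUFFER_SIZE */
   uint32_t offset;  /* low 5 bits of the iova, saved in cmd->state.streamout_offset[] */
};

/* CmdBindTransformFeedbackBuffersEXT: split an unaligned iova into an aligned
 * base plus a small offset, and grow the size so it still covers the same range. */
static struct xfb_binding
bind_xfb_buffer(uint64_t iova, uint32_t size)
{
   struct xfb_binding b;
   b.offset = iova & 0x1f;
   b.base = iova & ~(uint64_t) 0x1f;
   b.size = size + b.offset;
   return b;
}

/* CmdBeginTransformFeedbackEXT: VPC_SO_BUFFER_OFFSET resumes from the saved
 * byte counter (when a counter buffer is provided) plus the alignment offset. */
static uint32_t
begin_buffer_offset(const struct xfb_binding *b, const uint32_t *counter_bytes)
{
   return (counter_bytes ? *counter_bytes : 0) + b->offset;
}

/* CmdEndTransformFeedbackEXT: FLUSH_SO writes the final offset in dwords;
 * shift it to bytes and subtract the alignment offset before storing it
 * back into the counter buffer. */
static uint32_t
end_counter_bytes(const struct xfb_binding *b, uint32_t flushed_dwords)
{
   return (flushed_dwords << 2) - b->offset;
}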
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index fe6c346c2d1..a3251621e8f 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -591,20 +591,28 @@ tu6_link_streamout(struct ir3_shader_linkage *l,
 }
 
 static void
-tu6_setup_streamout(const struct ir3_shader_variant *v,
-                    struct ir3_shader_linkage *l, struct tu_streamout_state *tf)
+tu6_setup_streamout(struct tu_cs *cs,
+                    const struct ir3_shader_variant *v,
+                    struct ir3_shader_linkage *l)
 {
    const struct ir3_stream_output_info *info = &v->shader->stream_output;
+   uint32_t prog[IR3_MAX_SO_OUTPUTS * 2] = {};
+   uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {};
+   uint32_t prog_count = align(l->max_loc, 2) / 2;
 
-   memset(tf, 0, sizeof(*tf));
+   /* TODO: streamout state should be in a non-GMEM draw state */
 
-   tf->prog_count = align(l->max_loc, 2) / 2;
+   /* no streamout: */
+   if (info->num_outputs == 0) {
+      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
+      tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
+      tu_cs_emit(cs, 0);
+      return;
+   }
 
-   debug_assert(tf->prog_count < ARRAY_SIZE(tf->prog));
-
-   /* set stride info to the streamout state */
-   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++)
-      tf->stride[i] = info->stride[i];
+   /* is there something to do with info->stride[i]? */
 
    for (unsigned i = 0; i < info->num_outputs; i++) {
       const struct ir3_stream_output *out = &info->output[i];
@@ -615,7 +623,7 @@ tu6_setup_streamout(const struct ir3_shader_variant *v,
       if (v->outputs[k].regid == INVALID_REG)
          continue;
 
-      tf->ncomp[out->output_buffer] += out->num_components;
+      ncomp[out->output_buffer] += out->num_components;
 
       /* linkage map sorted by order frag shader wants things, so
       * a bit less ideal here..
@@ -632,22 +640,35 @@ tu6_setup_streamout(const struct ir3_shader_variant *v,
         unsigned off = j + out->dst_offset;  /* in dwords */
 
         if (loc & 1) {
-            tf->prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN |
-                               A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
-                               A6XX_VPC_SO_PROG_B_OFF(off * 4);
+            prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN |
+                           A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
+                           A6XX_VPC_SO_PROG_B_OFF(off * 4);
         } else {
-            tf->prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN |
-                               A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
-                               A6XX_VPC_SO_PROG_A_OFF(off * 4);
+            prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN |
+                           A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
+                           A6XX_VPC_SO_PROG_A_OFF(off * 4);
         }
      }
   }
 
-   tf->vpc_so_buf_cntl = A6XX_VPC_SO_BUF_CNTL_ENABLE |
-                         COND(tf->ncomp[0] > 0, A6XX_VPC_SO_BUF_CNTL_BUF0) |
-                         COND(tf->ncomp[1] > 0, A6XX_VPC_SO_BUF_CNTL_BUF1) |
-                         COND(tf->ncomp[2] > 0, A6XX_VPC_SO_BUF_CNTL_BUF2) |
-                         COND(tf->ncomp[3] > 0, A6XX_VPC_SO_BUF_CNTL_BUF3);
+   tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + 2 * prog_count);
+   tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
+   tu_cs_emit(cs, A6XX_VPC_SO_BUF_CNTL_ENABLE |
+                  COND(ncomp[0] > 0, A6XX_VPC_SO_BUF_CNTL_BUF0) |
+                  COND(ncomp[1] > 0, A6XX_VPC_SO_BUF_CNTL_BUF1) |
+                  COND(ncomp[2] > 0, A6XX_VPC_SO_BUF_CNTL_BUF2) |
+                  COND(ncomp[3] > 0, A6XX_VPC_SO_BUF_CNTL_BUF3));
+   for (uint32_t i = 0; i < 4; i++) {
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i));
+      tu_cs_emit(cs, ncomp[i]);
+   }
+   /* note: "VPC_SO_CNTL" write seems to be responsible for resetting the SO_PROG */
+   tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
+   tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE);
+   for (uint32_t i = 0; i < prog_count; i++) {
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
+      tu_cs_emit(cs, prog[i]);
+   }
 }
 
 static void
@@ -710,8 +731,7 @@ tu6_emit_vpc(struct tu_cs *cs,
              const struct ir3_shader_variant *hs,
              const struct ir3_shader_variant *ds,
              const struct ir3_shader_variant *gs,
-             const struct ir3_shader_variant *fs,
-             struct tu_streamout_state *tf)
+             const struct ir3_shader_variant *fs)
 {
    const struct ir3_shader_variant *last_shader;
    if (gs) {
@@ -762,8 +782,7 @@ tu6_emit_vpc(struct tu_cs *cs,
       ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
    }
 
-   if (last_shader->shader->stream_output.num_outputs)
-      tu6_setup_streamout(last_shader, &linkage, tf);
+   tu6_setup_streamout(cs, last_shader, &linkage);
 
    /* map outputs of the last shader to VPC */
    assert(linkage.cnt <= 32);
@@ -1316,8 +1335,7 @@ static void
 tu6_emit_program(struct tu_cs *cs,
                  struct tu_pipeline_builder *builder,
                  const struct tu_bo *binary_bo,
-                 bool binning_pass,
-                 struct tu_streamout_state *tf)
+                 bool binning_pass)
 {
    const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
    const struct ir3_shader_variant *bs = builder->binning_variant;
@@ -1355,7 +1373,7 @@ tu6_emit_program(struct tu_cs *cs,
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
    tu_cs_emit(cs, 0);
 
-   tu6_emit_vpc(cs, vs, hs, ds, gs, fs, tf);
+   tu6_emit_vpc(cs, vs, hs, ds, gs, fs);
    tu6_emit_vpc_varying_modes(cs, fs);
 
    if (fs) {
@@ -2040,11 +2058,11 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
 {
    struct tu_cs prog_cs;
    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
-   tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, false, &pipeline->streamout);
+   tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, false);
    pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
 
    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
-   tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true, &pipeline->streamout);
+   tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true);
    pipeline->program.binning_state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
 
    VkShaderStageFlags stages = 0;
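Not part of the patch: the CP_CONTEXT_REG_BUNCH payload built by the new tu6_setup_streamout() is a list of (register, value) pairs, which is where the "12 + 2 * prog_count" size comes from: VPC_SO_BUF_CNTL, the four VPC_SO_NCOMP registers and VPC_SO_CNTL account for 12 dwords, plus one pair per VPC_SO_PROG dword. Each PROG dword describes two linkage locations, an A half for even locations and a B half for odd ones. A hypothetical helper (assuming the generated A6XX_VPC_SO_PROG_* macros from the a6xx register headers) makes the packing explicit:

/* Pack one streamout output into the A or B half of its VPC_SO_PROG dword,
 * selected by the parity of the VPC location; hardware offsets are in bytes. */
static void
pack_so_prog(uint32_t *prog, unsigned loc, unsigned buffer, unsigned off_dwords)
{
   uint32_t half = (loc & 1) ?
      (A6XX_VPC_SO_PROG_B_EN |
       A6XX_VPC_SO_PROG_B_BUF(buffer) |
       A6XX_VPC_SO_PROG_B_OFF(off_dwords * 4)) :
      (A6XX_VPC_SO_PROG_A_EN |
       A6XX_VPC_SO_PROG_A_BUF(buffer) |
       A6XX_VPC_SO_PROG_A_OFF(off_dwords * 4));

   prog[loc / 2] |= half;
}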
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 532ab64362c..0109b5c2ba5 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -674,19 +674,10 @@ enum tu_cmd_dirty_bits
    TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3,
    TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 4,
    TU_CMD_DIRTY_SHADER_CONSTS = 1 << 5,
-   TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 6,
    /* all draw states were disabled and need to be re-enabled: */
    TU_CMD_DIRTY_DRAW_STATE = 1 << 7,
 };
 
-struct tu_streamout_state {
-   uint16_t stride[IR3_MAX_SO_BUFFERS];
-   uint32_t ncomp[IR3_MAX_SO_BUFFERS];
-   uint32_t prog[IR3_MAX_SO_OUTPUTS * 2];
-   uint32_t prog_count;
-   uint32_t vpc_so_buf_cntl;
-};
-
 /* There are only three cache domains we have to care about: the CCU, or
  * color cache unit, which is used for color and depth/stencil attachments
  * and copy/blit destinations, and is split conceptually into color and depth,
@@ -824,17 +815,6 @@ struct tu_cmd_state
    struct tu_cs_entry desc_sets_ib, desc_sets_load_ib;
    struct tu_cs_entry ia_gmem_ib, ia_sysmem_ib;
 
-   /* Stream output buffers */
-   struct
-   {
-      struct tu_buffer *buffers[IR3_MAX_SO_BUFFERS];
-      VkDeviceSize offsets[IR3_MAX_SO_BUFFERS];
-      VkDeviceSize sizes[IR3_MAX_SO_BUFFERS];
-   } streamout_buf;
-
-   uint8_t streamout_reset;
-   uint8_t streamout_enabled;
-
    /* Index buffer */
    struct tu_buffer *index_buffer;
    uint64_t index_offset;
@@ -842,6 +822,12 @@ struct tu_cmd_state
    uint32_t max_index_count;
    uint64_t index_va;
 
+   /* because streamout base has to be 32-byte aligned
+    * there is an extra offset to deal with when it is
+    * unaligned
+    */
+   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
+
    /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
@@ -860,6 +846,8 @@ struct tu_cmd_state
 
    struct tu_tiling_config tiling_config;
    struct tu_cs_entry tile_store_ib;
+
+   bool xfb_used;
 };
 
 struct tu_cmd_pool
@@ -1067,8 +1055,6 @@ struct tu_pipeline
    VkShaderStageFlags active_stages;
    uint32_t active_desc_sets;
 
-   struct tu_streamout_state streamout;
-
    /* mask of enabled dynamic states
    * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
    */
@@ -1169,8 +1155,7 @@ tu6_emit_vpc(struct tu_cs *cs,
              const struct ir3_shader_variant *hs,
              const struct ir3_shader_variant *ds,
              const struct ir3_shader_variant *gs,
-             const struct ir3_shader_variant *fs,
-             struct tu_streamout_state *tf);
+             const struct ir3_shader_variant *fs);
 
 void
 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
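Not part of the patch: an API-level sketch of the counter-buffer flow that the Begin/End implementations above now support. It assumes VK_EXT_transform_feedback is enabled and its entry points are resolvable (e.g. via vkGetDeviceProcAddr); buffer creation, render pass and pipeline setup are omitted.

#include <vulkan/vulkan.h>

static void
record_xfb_draw(VkCommandBuffer cmd_buf, VkBuffer xfb_buffer,
                VkBuffer counter_buffer, uint32_t vertex_count)
{
   VkDeviceSize xfb_offset = 0, xfb_size = VK_WHOLE_SIZE;
   VkDeviceSize counter_offset = 0;

   /* Binds VPC_SO_BUFFER_BASE/SIZE; any unaligned part of the offset ends
    * up in cmd->state.streamout_offset[] on the turnip side. */
   vkCmdBindTransformFeedbackBuffersEXT(cmd_buf, 0, 1, &xfb_buffer,
                                        &xfb_offset, &xfb_size);

   /* Resume from the byte count saved by a previous
    * vkCmdEndTransformFeedbackEXT; pass NULL counter buffers to start
    * writing at the beginning of the bound range instead. */
   vkCmdBeginTransformFeedbackEXT(cmd_buf, 0, 1, &counter_buffer,
                                  &counter_offset);

   vkCmdDraw(cmd_buf, vertex_count, 1, 0, 0);

   /* Save the updated byte count back to the counter buffer; on a6xx this
    * is where the dword FLUSH_SO counter is converted back to bytes. */
   vkCmdEndTransformFeedbackEXT(cmd_buf, 0, 1, &counter_buffer,
                                &counter_offset);
}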