From 3b3cd59fb85b5b200acce45f950869eb9d7b69a6 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 1 Apr 2022 17:23:09 -0400 Subject: [PATCH] panfrost: Launch transform feedback shaders We now have infrastructure in place to generate variants of vertex shaders specialized for transform feedback. All that's left is launching these compute-like kernels before the IDVS job, implementing both the transform feedback and the regular rasterization pipeline. This implements transform feedback on Valhall, passing the relevant GLES3.1 tests. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/gallium/drivers/panfrost/pan_assemble.c | 4 + src/gallium/drivers/panfrost/pan_cmdstream.c | 154 +++++++++++++++---- src/gallium/drivers/panfrost/pan_job.c | 6 +- src/panfrost/bifrost/bifrost_compile.c | 24 +-- 4 files changed, 139 insertions(+), 49 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c index ae86112b26e..48a4993e991 100644 --- a/src/gallium/drivers/panfrost/pan_assemble.c +++ b/src/gallium/drivers/panfrost/pan_assemble.c @@ -86,6 +86,10 @@ panfrost_shader_compile(struct pipe_screen *pscreen, .fixed_varying_mask = state->key.fixed_varying_mask }; + /* No IDVS for internal XFB shaders */ + if (s->info.stage == MESA_SHADER_VERTEX && s->info.has_transform_feedback_varyings) + inputs.no_idvs = true; + memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats)); struct util_dynarray binary; diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 2def313f71c..5ddc31f7765 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -2215,6 +2215,7 @@ panfrost_emit_varyings(struct panfrost_batch *batch, return ptr; } +#if PAN_ARCH <= 5 static void panfrost_emit_streamout(struct panfrost_batch *batch, struct mali_attribute_buffer_packed *slot, @@ -2256,6 +2257,7 @@ pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc) unreachable("Varying not captured"); } +#endif /* Given a varying, figure out which index it corresponds to */ @@ -2445,8 +2447,12 @@ panfrost_emit_varying(const struct panfrost_device *dev, gl_varying_slot loc = varying.location; mali_pixel_format format = dev->formats[pipe_format].hw; +#if PAN_ARCH <= 5 struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ? pan_get_so(xfb, loc) : NULL; +#else + struct pipe_stream_output *o = NULL; +#endif if (util_varying_is_point_coord(loc, point_sprite_mask)) { pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD); @@ -2604,7 +2610,7 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, /* In good conditions, we only need to link varyings once */ bool prelink = (point_coord_mask == 0) && - (ctx->streamout.num_targets == 0) && + (PAN_ARCH >= 6 || ctx->streamout.num_targets == 0) && !vs->info.separable && !fs->info.separable; @@ -2620,7 +2626,6 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage); } - struct pipe_stream_output_info *so = &vs->stream_output; unsigned present = linkage->present, stride = linkage->stride; unsigned xfb_base = pan_xfb_base(present); struct panfrost_ptr T = @@ -2637,11 +2642,12 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, #if PAN_ARCH >= 6 /* Suppress prefetch on Bifrost */ memset(varyings + (xfb_base * ctx->streamout.num_targets), 0, sizeof(*varyings)); -#endif - +#else /* Emit the stream out buffers. We need enough room for all the * vertices we emit across all instances */ + struct pipe_stream_output_info *so = &vs->stream_output; + unsigned out_count = ctx->instance_count * u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count); @@ -2651,6 +2657,7 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, out_count, ctx->streamout.targets[i]); } +#endif if (stride) { panfrost_emit_varyings(batch, @@ -2683,6 +2690,11 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, *fs_attribs = linkage->consumer; } +/* + * Emit jobs required for the rasterization pipeline. If there are side effects + * from the vertex shader, these are handled ahead-of-time with a compute + * shader. This function should not be called if rasterization is skipped. + */ static void panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch, const struct panfrost_ptr *vertex_job, @@ -2690,20 +2702,16 @@ panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch, { struct panfrost_context *ctx = batch->ctx; - /* If rasterizer discard is enable, only submit the vertex. XXX - set - * job_barrier in case buffers get ping-ponged and we need to enforce - * ordering, this has a perf hit! See - * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */ - + /* XXX - set job_barrier in case buffers get ping-ponged and we need to + * enforce ordering, this has a perf hit! See + * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations + */ unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, true, false, ctx->indirect_draw ? batch->indirect_draw_job_id : 0, 0, vertex_job, false); - if (panfrost_batch_skip_rasterization(batch)) - return; - panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, false, vertex, 0, tiler_job, false); @@ -3533,6 +3541,89 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch, } #endif +static void +panfrost_launch_xfb(struct panfrost_batch *batch, + const struct pipe_draw_info *info, + mali_ptr attribs, mali_ptr attrib_bufs, + unsigned count) +{ + struct panfrost_context *ctx = batch->ctx; + + struct panfrost_ptr t = + pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); + + /* Nothing to do */ + if (batch->ctx->streamout.num_targets == 0) + return; + + /* TODO: XFB with index buffers */ + //assert(info->index_size == 0); + u_trim_pipe_prim(info->mode, &count); + + if (count == 0) + return; + + struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); + struct panfrost_shader_variants v = { .variants = vs->xfb }; + + vs->xfb->stream_output = vs->stream_output; + + struct panfrost_shader_variants *saved_vs = ctx->shader[PIPE_SHADER_VERTEX]; + mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX]; + mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX]; + mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX]; + + ctx->shader[PIPE_SHADER_VERTEX] = &v; + batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); + +#if PAN_ARCH >= 9 + pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + + cfg.workgroup_count_x = count; + cfg.workgroup_count_y = info->instance_count; + cfg.workgroup_count_z = 1; + + panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX, + batch->rsd[PIPE_SHADER_VERTEX], + batch->tls.gpu); + + /* TODO: Indexing. Also, this is a legacy feature... */ + cfg.compute.attribute_offset = batch->ctx->offset_start; + + /* Transform feedback shaders do not use barriers or shared + * memory, so we may merge workgroups. + */ + cfg.allow_merging_workgroups = true; + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; + } +#else + struct mali_invocation_packed invocation; + + panfrost_pack_work_groups_compute(&invocation, + 1, count, info->instance_count, + 1, 1, 1, false, false); + + batch->uniform_buffers[PIPE_SHADER_VERTEX] = + panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL, + &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL); + + panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, + attribs, attrib_bufs, t.cpu); +#endif + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_COMPUTE, true, false, + 0, 0, &t, false); + + ctx->shader[PIPE_SHADER_VERTEX] = saved_vs; + batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd; + batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo; + batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; +} + static void panfrost_direct_draw(struct panfrost_batch *batch, const struct pipe_draw_info *info, @@ -3657,6 +3748,24 @@ panfrost_direct_draw(struct panfrost_batch *batch, panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT); panfrost_clean_state_3d(ctx); +#if PAN_ARCH >= 6 + if (vs->xfb) { +#if PAN_ARCH >= 9 + mali_ptr attribs = 0, attrib_bufs = 0; +#endif + panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count); + } +#endif + + /* Increment transform feedback offsets */ + panfrost_update_streamout_offsets(ctx); + + /* Any side effects must be handled by the XFB shader, so we only need + * to run vertex shaders if we need rasterization. + */ + if (panfrost_batch_skip_rasterization(batch)) + return; + #if PAN_ARCH >= 9 assert(idvs && "Memory allocated IDVS required on Valhall"); @@ -3667,10 +3776,10 @@ panfrost_direct_draw(struct panfrost_batch *batch, MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, 0, &tiler, false); #else + /* Fire off the draw itself */ panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, fs_vary, varyings, pos, psiz, secondary_shader, tiler.cpu); - if (idvs) { #if PAN_ARCH >= 6 panfrost_draw_emit_vertex_section(batch, @@ -3688,9 +3797,6 @@ panfrost_direct_draw(struct panfrost_batch *batch, panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); } #endif - - /* Increment transform feedback offsets */ - panfrost_update_streamout_offsets(ctx); } #if PAN_GPU_INDIRECTS @@ -3912,22 +4018,6 @@ panfrost_draw_vbo(struct pipe_context *pipe, if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR)) batch->viewport = panfrost_emit_viewport(batch); - /* If rasterization discard is enabled but the vertex shader does not - * have side effects (including transform feedback), skip the draw - * altogether. This is always an optimization. Additionally, this is - * required for Index-Driven Vertex Shading, since IDVS always - * rasterizes. The compiler will not use IDVS if the vertex shader has - * side effects. So the only problem case is rasterizer discard with a - * shader without side effects -- but these draws are useless. - */ - if (panfrost_batch_skip_rasterization(batch)) { - struct panfrost_shader_state *vs = - panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); - - if (!vs->info.writes_global) - return; - } - /* Mark everything dirty when debugging */ if (unlikely(dev->debug & PAN_DBG_DIRTY)) panfrost_dirty_state_all(ctx); diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index 6e051fd892d..3d1a472de31 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -901,6 +901,9 @@ panfrost_batch_union_scissor(struct panfrost_batch *batch, /** * Checks if rasterization should be skipped. If not, a TILER job must be * created for each draw, or the IDVS flow must be used. + * + * As a special case, if there is no vertex shader, no primitives are generated, + * meaning the whole pipeline (including rasterization) should be skipped. */ bool panfrost_batch_skip_rasterization(struct panfrost_batch *batch) @@ -909,5 +912,6 @@ panfrost_batch_skip_rasterization(struct panfrost_batch *batch) struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer; return (rast->rasterizer_discard || - batch->scissor_culls_everything); + batch->scissor_culls_everything || + !batch->rsd[PIPE_SHADER_VERTEX]); } diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 65360b60096..5a376da963d 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -5133,6 +5133,14 @@ bi_compile_variant(nir_shader *nir, unsigned offset = binary->size; + /* If there is no position shader (gl_Position is not written), then + * there is no need to build a varying shader either. This case is hit + * for transform feedback only vertex shaders which only make sense with + * rasterizer discard. + */ + if ((offset == 0) && (idvs == BI_IDVS_VARYING)) + return; + /* Software invariant: Only a secondary shader can appear at a nonzero * offset, to keep the ABI simple. */ assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); @@ -5213,22 +5221,6 @@ bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs) if (nir->info.stage != MESA_SHADER_VERTEX) return false; - /* Transform feedback requires running all varying shaders regardless - * of clipping, but IDVS does clipping before running varying shaders. - * So shaders destined for transform feedback must not use IDVS. - * - * The issue with general memory stores is more subtle: these shaders - * have side effects and only make sense if vertex shaders run exactly - * once per vertex. IDVS requires the hardware to rerun position or - * varying shaders in certain circumstances. So if there is any memory - * write, disable IDVS. - * - * NIR considers transform feedback to be a memory write, so we only - * need to check writes_memory to handle both cases. - */ - if (nir->info.writes_memory) - return false; - /* Bifrost cannot write gl_PointSize during IDVS */ if ((inputs->gpu_id < 0x9000) && nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))