diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index b02c7579ffe..a1bf4c108e8 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -3046,27 +3046,23 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c
 }
 #endif
 
+/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
+ * jobs and Valhall IDVS jobs
+ */
 static void
-panfrost_draw_emit_tiler(struct panfrost_batch *batch,
-                         const struct pipe_draw_info *info,
-                         const struct pipe_draw_start_count_bias *draw,
-                         void *invocation_template,
-                         mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
-                         mali_ptr pos, mali_ptr psiz, bool secondary_shader,
-                         void *job)
+panfrost_emit_primitive(struct panfrost_context *ctx,
+                        const struct pipe_draw_info *info,
+                        const struct pipe_draw_start_count_bias *draw,
+                        mali_ptr indices, bool secondary_shader, void *out)
 {
-        struct panfrost_context *ctx = batch->ctx;
-        struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
+        UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
 
-        void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
-        memcpy(section, invocation_template, pan_size(INVOCATION));
-
-        section = pan_section_ptr(job, TILER_JOB, PRIMITIVE);
-        pan_pack(section, PRIMITIVE, cfg) {
+        pan_pack(out, PRIMITIVE, cfg) {
                 cfg.draw_mode = pan_draw_mode(info->mode);
                 if (panfrost_writes_point_size(ctx))
                         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
 
+#if PAN_ARCH <= 8
                 /* For line primitives, PRIMITIVE.first_provoking_vertex must
                  * be set to true and the provoking vertex is selected with
                  * DRAW.flat_shading_vertex.
@@ -3086,34 +3082,124 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch,
                 }
 
                 cfg.job_task_split = 6;
+#else
+                cfg.allow_rotating_primitives = false;
+                cfg.primitive_restart = info->primitive_restart;
+
+                /* Non-fixed restart indices should have been lowered */
+                assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
+#endif
 
                 cfg.index_count = ctx->indirect_draw ? 1 : draw->count;
                 cfg.index_type = panfrost_translate_index_size(info->index_size);
-                if (cfg.index_type) {
-                        cfg.indices = indices;
+
+                if (PAN_ARCH >= 9) {
+                        /* Base vertex offset on Valhall is used for both
+                         * indexed and non-indexed draws, in a simple way for
+                         * either. Handle both cases.
+                         */
+                        if (cfg.index_type)
+                                cfg.base_vertex_offset = draw->index_bias;
+                        else
+                                cfg.base_vertex_offset = draw->start;
+
+                        /* Indices are moved outside the primitive descriptor
+                         * on Valhall, so we don't need to set that here
+                         */
+                } else if (cfg.index_type) {
                         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;
+
+#if PAN_ARCH <= 7
+                        cfg.indices = indices;
+#endif
                 }
 
 #if PAN_ARCH >= 6
                 cfg.secondary_shader = secondary_shader;
 #endif
         }
+}
 
-        enum pipe_prim_type prim = u_reduced_prim(info->mode);
-        bool polygon = (prim == PIPE_PRIM_TRIANGLES);
-        void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
+#if PAN_ARCH >= 9
+static mali_ptr
+panfrost_emit_resources(struct panfrost_batch *batch,
+                        enum pipe_shader_type stage,
+                        mali_ptr ubos, unsigned ubo_count)
+{
+        struct panfrost_context *ctx = batch->ctx;
+        struct panfrost_ptr T;
+        unsigned nr_tables = 12;
 
-#if PAN_ARCH >= 6
-        pan_section_pack(job, TILER_JOB, TILER, cfg) {
-                cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
+        /* Although individual resources need only 16 byte alignment, the
+         * resource table as a whole must be 64-byte aligned.
+         */
+        T = pan_pool_alloc_aligned(&batch->pool.base, nr_tables * pan_size(RESOURCE), 64);
+        memset(T.cpu, 0, nr_tables * pan_size(RESOURCE));
+
+        panfrost_make_resource_table(T, PAN_TABLE_UBO, ubos, ubo_count);
+
+        panfrost_make_resource_table(T, PAN_TABLE_TEXTURE,
+                                     batch->textures[stage],
+                                     ctx->sampler_view_count[stage]);
+
+        panfrost_make_resource_table(T, PAN_TABLE_SAMPLER,
+                                     batch->samplers[stage],
+                                     ctx->sampler_count[stage]);
+
+        panfrost_make_resource_table(T, PAN_TABLE_IMAGE,
+                                     batch->images[stage],
+                                     util_last_bit(ctx->image_mask[stage]));
+
+        if (stage == PIPE_SHADER_VERTEX) {
+                panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE,
+                                             batch->attribs[stage],
+                                             ctx->vertex->num_elements);
+
+                panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER,
+                                             batch->attrib_bufs[stage],
+                                             util_last_bit(ctx->vb_mask));
         }
 
-        pan_section_pack(job, TILER_JOB, PADDING, cfg);
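+        /* The table is 64-byte aligned, so the low bits of its address are
+         * zero; the table count is packed into those free bits when the
+         * resources pointer is returned below.
+         */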
+        return T.gpu | nr_tables;
+}
+
+static void
+panfrost_emit_shader(struct panfrost_batch *batch,
+                     struct MALI_SHADER_ENVIRONMENT *cfg,
+                     enum pipe_shader_type stage,
+                     mali_ptr shader_ptr,
+                     mali_ptr thread_storage)
+{
+        unsigned fau_words = 0, ubo_count = 0;
+        mali_ptr ubos, resources;
+
+        ubos = panfrost_emit_const_buf(batch, stage, &ubo_count, &cfg->fau,
+                                       &fau_words);
+
+        resources = panfrost_emit_resources(batch, stage, ubos, ubo_count);
+
+        cfg->thread_storage = thread_storage;
+        cfg->shader = shader_ptr;
+        cfg->resources = resources;
+
+        /* Each entry of FAU is 64-bits */
+        cfg->fau_count = DIV_ROUND_UP(fau_words, 2);
+}
 #endif
 
-        section = pan_section_ptr(job, TILER_JOB, DRAW);
-        pan_pack(section, DRAW, cfg) {
+static void
+panfrost_emit_draw(void *out,
+                   struct panfrost_batch *batch,
+                   bool fs_required,
+                   enum pipe_prim_type prim,
+                   mali_ptr pos, mali_ptr fs_vary, mali_ptr varyings)
+{
+        struct panfrost_context *ctx = batch->ctx;
+        struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
+        bool polygon = (prim == PIPE_PRIM_TRIANGLES);
+
+        pan_pack(out, DRAW, cfg) {
                 /*
                  * From the Gallium documentation,
                  * pipe_rasterizer_state::cull_face "indicates which faces of
@@ -3125,6 +3211,101 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch,
                 cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
                 cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
                 cfg.front_face_ccw = rast->front_ccw;
+
+                if (ctx->occlusion_query && ctx->active_queries) {
+                        if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
+                                cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
+                        else
+                                cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
+
+                        struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
+                        cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
+                        panfrost_batch_write_rsrc(ctx->batch, rsrc,
+                                                  PIPE_SHADER_FRAGMENT);
+                }
+
+#if PAN_ARCH >= 9
+                struct panfrost_shader_state *fs =
+                        panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
+
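+                /* On Valhall, multisample state is part of the draw
+                 * descriptor; on older architectures it is packed in the
+                 * renderer state descriptor instead (compare the
+                 * PAN_ARCH <= 7 path below, which just points cfg.state at
+                 * the RSD).
+                 */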
+                cfg.multisample_enable = rast->multisample;
+                cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;
+
+                /* Use per-sample shading if required by API. Also use it when a
+                 * blend shader is used with multisampling, as this is handled
+                 * by a single ST_TILE in the blend shader with the current
+                 * sample ID, requiring per-sample shading.
+                 */
+                cfg.evaluate_per_sample =
+                        (rast->multisample &&
+                         ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));
+
+                cfg.single_sampled_lines = !rast->multisample;
+
+                cfg.vertex_array.packet = true;
+
+                cfg.minimum_z = batch->minimum_z;
+                cfg.maximum_z = batch->maximum_z;
+
+                cfg.depth_stencil = batch->depth_stencil;
+
+                if (fs_required) {
+                        struct pan_pixel_kill kill = pan_shader_classify_pixel_kill_coverage(&fs->info);
+                        cfg.pixel_kill_operation = kill.pixel_kill;
+                        cfg.zs_update_operation = kill.zs_update;
+
+                        cfg.allow_forward_pixel_to_be_killed = !fs->info.fs.sidefx;
+
+                        /* Mask of render targets that may be written. A render
+                         * target may be written if the fragment shader writes
+                         * to it AND it actually exists. If the render target
+                         * doesn't actually exist, the blend descriptor will be
+                         * OFF so it may be omitted from the mask.
+                         *
+                         * Only set when there is a fragment shader, since
+                         * otherwise no colour updates are possible.
+                         */
+                        cfg.render_target_mask =
+                                (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
+                                ctx->fb_rt_mask;
+
+                        /* Also use per-sample shading if required by the shader
+                         */
+                        cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
+
+                        cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
+                                                       fs->info.fs.can_discard;
+
+                        /* Blend descriptors are only accessed by a BLEND
+                         * instruction on Valhall. It follows that if the
+                         * fragment shader is omitted, we may also omit the
+                         * blend descriptors.
+                         */
+                        cfg.blend = batch->blend;
+                        cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
+                        cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
+
+                        panfrost_emit_shader(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
+                                             batch->rsd[PIPE_SHADER_FRAGMENT],
+                                             batch->tls.gpu);
+                } else {
+                        /* These operations need to be FORCE to benefit from the
+                         * depth-only pass optimizations.
+                         */
+                        cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
+                        cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
+
+                        /* No shader and no blend => no shader or blend
+                         * reasons to disable FPK. The only FPK-related state
+                         * not covered is alpha-to-coverage which we don't set
+                         * without blend.
+                         */
+                        cfg.allow_forward_pixel_to_kill = true;
+
+                        /* No shader => no shader side effects */
+                        cfg.allow_forward_pixel_to_be_killed = true;
+                }
+#else
                 cfg.position = pos;
                 cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
                 cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
@@ -3145,22 +3326,135 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch,
                 }
 
                 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
+#endif
+        }
+}
 
-                if (ctx->occlusion_query && ctx->active_queries) {
-                        if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
-                                cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
-                        else
-                                cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
+#if PAN_ARCH >= 9
+static void
+panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
+                            const struct pipe_draw_info *info,
+                            const struct pipe_draw_start_count_bias *draw,
+                            mali_ptr indices, bool secondary_shader,
+                            void *job)
+{
+        struct panfrost_context *ctx = batch->ctx;
 
-                        struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
-                        cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
-                        panfrost_batch_write_rsrc(ctx->batch, rsrc,
-                                                  PIPE_SHADER_FRAGMENT);
+        struct panfrost_shader_state *vs =
+                panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
+
+        struct panfrost_shader_state *fs =
+                panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
+
+        bool fs_required = panfrost_fs_required(fs, ctx->blend,
+                                                &ctx->pipe_framebuffer,
+                                                ctx->depth_stencil);
+
+        /* Varying shaders only feed data to the fragment shader, so if we omit
+         * the fragment shader, we should omit the varying shader too.
+         */
+        secondary_shader &= fs_required;
+
+        panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader,
+                                pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));
+
+        pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
+                cfg.count = info->instance_count;
+        }
+
+        pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
+                if (secondary_shader) {
+                        /* Assumes 16 byte slots. We could do better. */
+                        cfg.vertex_packet_stride = vs->info.varyings.output_count * 16;
+                        cfg.vertex_attribute_stride = fs->info.varyings.input_count * 16;
+                } else {
+                        /* Hardware requirement for "no varyings" */
+                        cfg.vertex_packet_stride = 16;
+                        cfg.vertex_attribute_stride = 0;
                 }
         }
 
+        pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
+                cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
+        }
+
+        STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
+        memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR),
+               &batch->scissor, pan_size(SCISSOR));
+
+        panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0,
+                                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));
+
+        pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
+                cfg.address = indices;
+        }
+
+        panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW),
+                           batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0);
+
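+        /* batch->rsd[PIPE_SHADER_VERTEX] points at consecutive shader
+         * program descriptors: the points position shader, then the
+         * triangles position shader, then the varying shader, each
+         * pan_size(SHADER_PROGRAM) apart. The offsets here and in the
+         * VARYING section below rely on that layout.
+         */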
+        pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
+                /* IDVS/points vertex shader */
+                mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX];
+
+                /* IDVS/triangle vertex shader */
+                if (vs_ptr && info->mode != PIPE_PRIM_POINTS)
+                        vs_ptr += pan_size(SHADER_PROGRAM);
+
+                panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr,
+                                     batch->tls.gpu);
+        }
+
+        pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
+                /* If a varying shader is used, we configure it with the same
+                 * state as the position shader for backwards compatible
+                 * behaviour with Bifrost. This could be optimized.
+                 */
+                if (!secondary_shader) continue;
+
+                mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] +
+                               (2 * pan_size(SHADER_PROGRAM));
+
+                panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX,
+                                     ptr, batch->tls.gpu);
+        }
+}
+#endif
+
+#if PAN_ARCH <= 7
+static void
+panfrost_draw_emit_tiler(struct panfrost_batch *batch,
+                         const struct pipe_draw_info *info,
+                         const struct pipe_draw_start_count_bias *draw,
+                         void *invocation_template,
+                         mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
+                         mali_ptr pos, mali_ptr psiz, bool secondary_shader,
+                         void *job)
+{
+        struct panfrost_context *ctx = batch->ctx;
+
+        void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
+        memcpy(section, invocation_template, pan_size(INVOCATION));
+
+        panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader,
+                                pan_section_ptr(job, TILER_JOB, PRIMITIVE));
+
+        void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
+        enum pipe_prim_type prim = u_reduced_prim(info->mode);
+
+#if PAN_ARCH >= 6
+        pan_section_pack(job, TILER_JOB, TILER, cfg) {
+                cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
+        }
+
+        pan_section_pack(job, TILER_JOB, PADDING, cfg);
+#endif
+
+        panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW),
+                           batch, true, prim, pos, fs_vary, varyings);
+
         panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size);
 }
+#endif
 
 static void
 panfrost_direct_draw(struct panfrost_batch *batch,
@@ -3187,10 +3481,12 @@
         bool idvs = vs->info.vs.idvs;
         bool secondary_shader = vs->info.vs.secondary_enable;
 
-        struct panfrost_ptr tiler, vertex;
+        UNUSED struct panfrost_ptr tiler, vertex;
 
         if (idvs) {
-#if PAN_ARCH >= 6
+#if PAN_ARCH >= 9
+                tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
+#elif PAN_ARCH >= 6
                 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
 #else
                 unreachable("IDVS is unsupported on Midgard");
@@ -3234,6 +3530,7 @@
 
         panfrost_statistics_record(ctx, info, draw);
 
+#if PAN_ARCH <= 7
         struct mali_invocation_packed invocation;
         if (info->instance_count > 1) {
                 panfrost_pack_work_groups_compute(&invocation,
@@ -3263,13 +3560,23 @@
 
         mali_ptr attribs, attrib_bufs;
         attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
+#endif
 
         panfrost_update_state_3d(batch);
         panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
         panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
         panfrost_clean_state_3d(ctx);
 
-        /* Fire off the draw itself */
+#if PAN_ARCH >= 9
+        assert(idvs && "Memory allocated IDVS required on Valhall");
+
+        panfrost_emit_malloc_vertex(batch, info, draw, indices,
+                                    secondary_shader, tiler.cpu);
+
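+        /* A single IDVS job does the work of the separate vertex and tiler
+         * jobs chained together on older architectures (see the #else path
+         * below).
+         */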
+        panfrost_add_job(&batch->pool.base, &batch->scoreboard,
+                         MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0,
+                         0, &tiler, false);
+#else
         panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
                                  fs_vary, varyings, pos, psiz, secondary_shader,
                                  tiler.cpu);
@@ -3290,6 +3597,7 @@
                                         vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
                 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
         }
+#endif
 
         /* Increment transform feedback offsets */
         panfrost_update_streamout_offsets(ctx);