freedreno/a6xx: Add support to load driver-params via UBO

When shader consts are loaded via the shader preamble, we can't use CP_LOAD_STATE to push the consts inline in the cmdstream, but instead need to set up a UBO. Signed-off-by: Rob Clark <robdclark@chromium.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31534>
2026-05-01 01:38:06 +02:00 · 2024-09-30 15:03:56 -07:00 · 2024-09-30 15:03:56 -07:00 · 3a0b022136
commit 3a0b022136
parent f193c61c6b
7 changed files with 268 additions and 61 deletions
--- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc
@ -152,7 +152,7 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
      cs->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
      cs_program_emit<CHIP>(ctx, cs->stateobj, cs->v);

-      cs->user_consts_cmdstream_size = fd6_user_consts_cmdstream_size(cs->v);
+      cs->user_consts_cmdstream_size = fd6_user_consts_cmdstream_size<CHIP>(cs->v);
   }

   trace_start_compute(&ctx->batch->trace, ring, !!info->indirect, info->work_dim,
@ -190,10 +190,10 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
      fd6_emit_cs_state<CHIP>(ctx, ring, cs);

   if (ctx->gen_dirty & BIT(FD6_GROUP_CONST))
-      fd6_emit_cs_user_consts(ctx, ring, cs);
+      fd6_emit_cs_user_consts<CHIP>(ctx, ring, cs);

   if (cs->v->need_driver_params || info->input)
-      fd6_emit_cs_driver_params(ctx, ring, cs, info);
+      fd6_emit_cs_driver_params<CHIP>(ctx, ring, cs, info);

   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
@ -6,6 +6,7 @@

 #define FD_BO_NO_HARDPIN 1

+#include "fd6_barrier.h"
 #include "fd6_const.h"
 #include "fd6_compute.h"
 #include "fd6_pack.h"
@ -36,6 +37,40 @@ fd6_emit_driver_ubo(struct fd_ringbuffer *ring, const struct ir3_shader_variant
             ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32), 0);
 }

+/* A helper to upload driver-params to a UBO, for the case where constants are
+ * loaded by shader preamble rather than ST6_CONSTANTS
+ *
+ * ctx:        context; its const_uploader provides the backing storage
+ * ring:       ringbuffer the buffer is attached to and the UBO is emitted in
+ * v:          shader variant, passed through to fd6_emit_driver_ubo()
+ * base:       UBO index passed to fd6_emit_driver_ubo(); nothing is emitted
+ *             when it is negative
+ * sizedwords: payload size in dwords; nothing is emitted when it is zero
+ * dwords:     payload to upload
+ *
+ * Nothing is emitted either if the upload does not return a buffer.
+ */
+static void
+fd6_upload_emit_driver_ubo(struct fd_context *ctx, struct fd_ringbuffer *ring,
+                           const struct ir3_shader_variant *v, int base,
+                           uint32_t sizedwords, const void *dwords)
+{
+   struct pipe_context *pctx = &ctx->base;
+
+   assert(ctx->screen->info->chip >= 7 && ctx->screen->info->a7xx.load_shader_consts_via_preamble);
+
+   if (!sizedwords || (base < 0))
+      return;
+
+   unsigned buffer_offset;
+   struct pipe_resource *buffer = NULL;  /* checked below to detect a failed upload */
+   u_upload_data(pctx->const_uploader, 0, sizedwords * sizeof(uint32_t),
+                 16, dwords,  &buffer_offset, &buffer);
+   if (!buffer)
+      return;  /* nothing good will come of this.. */
+
+   /* The backing BO may otherwise not be tracked by the resource, as
+    * this allocation happens outside of the context of batch resource
+    * tracking.
+    */
+   fd_ringbuffer_attach_bo(ring, fd_resource(buffer)->bo);
+
+   fd6_emit_driver_ubo(ring, v, base, sizedwords, buffer_offset,
+                       fd_resource(buffer)->bo);
+
+   pipe_resource_reference(&buffer, NULL);  /* drop the local reference to the upload buffer */
+}
+
 /* regid:          base const register
 * prsc or dwords: buffer containing constant values
 * sizedwords:     size of const value buffer
@ -71,6 +106,7 @@ fd6_emit_const_user(struct fd_ringbuffer *ring,
         CP_LOAD_STATE6_2());
   }
 }
+
 void
 fd6_emit_const_bo(struct fd_ringbuffer *ring,
                  const struct ir3_shader_variant *v, uint32_t regid,
@ -115,16 +151,31 @@ emit_const_ptrs(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v,
 }

 static void
-emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v,
-                       uint32_t *params, int num_params)
+wait_mem_writes(struct fd_context *ctx)
 {
-   const struct ir3_const_state *const_state = ir3_const_state(v);
-   const unsigned regid = const_state->offsets.primitive_param;
-   int size = MIN2(1 + regid, v->constlen) - regid;
-   if (size > 0)
-      fd6_emit_const_user(ring, v, regid * 4, num_params, params);
+   ctx->batch->barrier |= FD6_WAIT_MEM_WRITES | FD6_INVALIDATE_CACHE | FD6_WAIT_FOR_IDLE;
 }

+template <chip CHIP> /* Emit the num_params dwords in params for one shader stage. */
+static void
+emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v,
+                       struct fd_context *ctx, uint32_t *params, int num_params)
+{
+   const struct ir3_const_state *const_state = ir3_const_state(v);
+
+   if (CHIP == A7XX && ctx->screen->info->a7xx.load_shader_consts_via_preamble) {
+      int base = const_state->primitive_param_ubo.idx;  /* negative is skipped by the helper */
+
+      fd6_upload_emit_driver_ubo(ctx, ring, v, base, num_params, params);
+   } else {
+      const unsigned regid = const_state->offsets.primitive_param;
+      int size = MIN2(1 + regid, v->constlen) - regid;  /* positive only if regid < v->constlen */
+      if (size > 0)
+         fd6_emit_const_user(ring, v, regid * 4, num_params, params);
+   }
+}
+
+template <chip CHIP>
 struct fd_ringbuffer *
 fd6_build_tess_consts(struct fd6_emit *emit)
 {
@ -144,7 +195,7 @@ fd6_build_tess_consts(struct fd6_emit *emit)
      emit->vs->output_size * 4,                /* vs vertex stride */
      0, 0};

-   emit_stage_tess_consts(constobj, emit->vs, vs_params, ARRAY_SIZE(vs_params));
+   emit_stage_tess_consts<CHIP>(constobj, emit->vs, emit->ctx, vs_params, ARRAY_SIZE(vs_params));

   if (emit->hs) {
      struct fd_bo *tess_bo = ctx->screen->tess_bo;
@ -164,8 +215,8 @@ fd6_build_tess_consts(struct fd6_emit *emit)
         tess_factor_iova >> 32,
      };

-      emit_stage_tess_consts(constobj, emit->hs, hs_params,
-                             ARRAY_SIZE(hs_params));
+      emit_stage_tess_consts<CHIP>(constobj, emit->hs, emit->ctx,
+                                   hs_params, ARRAY_SIZE(hs_params));

      if (emit->gs)
         num_vertices = emit->gs->gs.vertices_in;
@ -181,8 +232,8 @@ fd6_build_tess_consts(struct fd6_emit *emit)
         tess_factor_iova >> 32,
      };

-      emit_stage_tess_consts(constobj, emit->ds, ds_params,
-                             ARRAY_SIZE(ds_params));
+      emit_stage_tess_consts<CHIP>(constobj, emit->ds, emit->ctx,
+                                   ds_params,  ARRAY_SIZE(ds_params));
   }

   if (emit->gs) {
@ -200,12 +251,13 @@ fd6_build_tess_consts(struct fd6_emit *emit)
      };

      num_vertices = emit->gs->gs.vertices_in;
-      emit_stage_tess_consts(constobj, emit->gs, gs_params,
-                             ARRAY_SIZE(gs_params));
+      emit_stage_tess_consts<CHIP>(constobj, emit->gs, emit->ctx,
+                                   gs_params, ARRAY_SIZE(gs_params));
   }

   return constobj;
 }
+FD_GENX(fd6_build_tess_consts);

 static void
 fd6_emit_ubos(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
@ -240,6 +292,7 @@ fd6_emit_ubos(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
   }
 }

+template <chip CHIP>
 unsigned
 fd6_user_consts_cmdstream_size(const struct ir3_shader_variant *v)
 {
@ -250,8 +303,13 @@ fd6_user_consts_cmdstream_size(const struct ir3_shader_variant *v)
   const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
   unsigned packets, size;

-   /* pre-calculate size required for userconst stateobj: */
-   ir3_user_consts_size(ubo_state, &packets, &size);
+   if (CHIP == A7XX && v->compiler->load_shader_consts_via_preamble) {
+      packets = 0;
+      size = 0;
+   } else {
+      /* pre-calculate size required for userconst stateobj: */
+      ir3_user_consts_size(ubo_state, &packets, &size);
+   }

   /* also account for UBO addresses: */
   packets += 1;
@ -260,17 +318,23 @@ fd6_user_consts_cmdstream_size(const struct ir3_shader_variant *v)
   unsigned sizedwords = (4 * packets) + size;
   return sizedwords * 4;
 }
+FD_GENX(fd6_user_consts_cmdstream_size);

+template <chip CHIP> /* Emit UBO descriptors and, unless skipped below, the user consts for v. */
 static void
 emit_user_consts(const struct ir3_shader_variant *v,
                 struct fd_ringbuffer *ring,
                 struct fd_constbuf_stateobj *constbuf)
 {
-   ir3_emit_user_consts(v, ring, constbuf);
    fd6_emit_ubos(v, ring, constbuf);
+   /* The UBO descriptors above are always emitted; ir3_emit_user_consts()
+    * is skipped on a7xx when the compiler loads shader consts via preamble.
+    */
+   if (CHIP == A7XX && v->compiler->load_shader_consts_via_preamble)
+      return;
+
+   ir3_emit_user_consts(v, ring, constbuf);
 }

-template <fd6_pipeline_type PIPELINE>
+template <chip CHIP, fd6_pipeline_type PIPELINE>
 struct fd_ringbuffer *
 fd6_build_user_consts(struct fd6_emit *emit)
 {
@ -280,33 +344,82 @@ fd6_build_user_consts(struct fd6_emit *emit)
   struct fd_ringbuffer *constobj =
      fd_submit_new_ringbuffer(ctx->batch->submit, sz, FD_RINGBUFFER_STREAMING);

-   emit_user_consts(emit->vs, constobj, &ctx->constbuf[PIPE_SHADER_VERTEX]);
+   emit_user_consts<CHIP>(emit->vs, constobj, &ctx->constbuf[PIPE_SHADER_VERTEX]);
+
   if (PIPELINE == HAS_TESS_GS) {
      if (emit->hs) {
-         emit_user_consts(emit->hs, constobj, &ctx->constbuf[PIPE_SHADER_TESS_CTRL]);
-         emit_user_consts(emit->ds, constobj, &ctx->constbuf[PIPE_SHADER_TESS_EVAL]);
+         emit_user_consts<CHIP>(emit->hs, constobj, &ctx->constbuf[PIPE_SHADER_TESS_CTRL]);
+         emit_user_consts<CHIP>(emit->ds, constobj, &ctx->constbuf[PIPE_SHADER_TESS_EVAL]);
      }
      if (emit->gs) {
-         emit_user_consts(emit->gs, constobj, &ctx->constbuf[PIPE_SHADER_GEOMETRY]);
+         emit_user_consts<CHIP>(emit->gs, constobj, &ctx->constbuf[PIPE_SHADER_GEOMETRY]);
      }
   }
-   emit_user_consts(emit->fs, constobj, &ctx->constbuf[PIPE_SHADER_FRAGMENT]);
+   emit_user_consts<CHIP>(emit->fs, constobj, &ctx->constbuf[PIPE_SHADER_FRAGMENT]);

   return constobj;
 }
+template struct fd_ringbuffer * fd6_build_user_consts<A6XX, HAS_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_user_consts<A7XX, HAS_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_user_consts<A6XX, NO_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_user_consts<A7XX, NO_TESS_GS>(struct fd6_emit *emit);

-template struct fd_ringbuffer * fd6_build_user_consts<HAS_TESS_GS>(struct fd6_emit *emit);
-template struct fd_ringbuffer * fd6_build_user_consts<NO_TESS_GS>(struct fd6_emit *emit);
+template <chip CHIP> /* Emit vertex_params as the driver params of variant v. */
+static inline void
+emit_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *dpconstobj,
+                   struct fd_context *ctx, const struct pipe_draw_info *info,
+                   const struct pipe_draw_indirect_info *indirect,
+                   const struct ir3_driver_params_vs *vertex_params)
+{
+   if (CHIP == A7XX && ctx->screen->info->a7xx.load_shader_consts_via_preamble) {
+      const struct ir3_const_state *const_state = ir3_const_state(v);
+      int base = const_state->driver_params_ubo.idx;  /* negative is skipped by the helper */

-template <fd6_pipeline_type PIPELINE>
+      fd6_upload_emit_driver_ubo(ctx, dpconstobj, v, base,
+                                 dword_sizeof(*vertex_params),
+                                 vertex_params);  /* info and indirect are not used on this path */
+   } else {
+      ir3_emit_driver_params(v, dpconstobj, ctx, info, indirect, vertex_params);
+   }
+}
+
+template <chip CHIP> /* Emit the tess-control (HS) driver params for variant v. */
+static inline void
+emit_hs_driver_params(const struct ir3_shader_variant *v,
+                      struct fd_ringbuffer *dpconstobj,
+                      struct fd_context *ctx)
+{
+   if (CHIP == A7XX && ctx->screen->info->a7xx.load_shader_consts_via_preamble) {
+      const struct ir3_const_state *const_state = ir3_const_state(v);
+      struct ir3_driver_params_tcs hs_params = ir3_build_driver_params_tcs(ctx);
+      int base = const_state->driver_params_ubo.idx;  /* negative is skipped by the helper */
+
+      fd6_upload_emit_driver_ubo(ctx, dpconstobj, v, base,
+                                 dword_sizeof(hs_params),
+                                 &hs_params);  /* contents are copied by the upload */
+   } else {
+      ir3_emit_hs_driver_params(v, dpconstobj, ctx);
+   }
+}
+
+template <chip CHIP, fd6_pipeline_type PIPELINE>
 struct fd_ringbuffer *
 fd6_build_driver_params(struct fd6_emit *emit)
 {
   struct fd_context *ctx = emit->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   unsigned num_dp = emit->prog->num_driver_params;
+   unsigned num_ubo_dp;

-   if (!num_dp) {
+   if (CHIP == A6XX) {
+      assert(!emit->prog->num_ubo_driver_params);
+      /* Make it easier for compiler to see that this path isn't used on a6xx: */
+      num_ubo_dp = 0;
+   } else {
+      num_ubo_dp = emit->prog->num_ubo_driver_params;
+   }
+
+   if (!num_dp && !num_ubo_dp) {
      fd6_ctx->has_dp_state = false;
      return NULL;
   }
@ -323,53 +436,104 @@ fd6_build_driver_params(struct fd6_emit *emit)
      ir3_build_driver_params_vs(ctx, emit->info, emit->draw, emit->draw_id, needs_ucp);

   unsigned size_dwords =
-      num_dp * (4 + dword_sizeof(p));  /* 4dw PKT7 header */
+      num_dp * (4 + dword_sizeof(p)) + /* 4dw PKT7 header */
+      num_ubo_dp * 6;                  /* 6dw per UBO descriptor */
+
   struct fd_ringbuffer *dpconstobj = fd_submit_new_ringbuffer(
         ctx->batch->submit, size_dwords * 4, FD_RINGBUFFER_STREAMING);

+   /* VS still works the old way */
   if (emit->vs->need_driver_params) {
      ir3_emit_driver_params(emit->vs, dpconstobj, ctx, emit->info, emit->indirect, &p);
   }

   if (PIPELINE == HAS_TESS_GS) {
      if (emit->gs && emit->gs->need_driver_params) {
-         ir3_emit_driver_params(emit->gs, dpconstobj, ctx, emit->info, emit->indirect, &p);
+         emit_driver_params<CHIP>(emit->gs, dpconstobj, ctx, emit->info, emit->indirect, &p);
      }

      if (emit->hs && emit->hs->need_driver_params) {
-         ir3_emit_hs_driver_params(emit->hs, dpconstobj, ctx);
+         emit_hs_driver_params<CHIP>(emit->hs, dpconstobj, ctx);
      }

      if (emit->ds && emit->ds->need_driver_params) {
-         ir3_emit_driver_params(emit->ds, dpconstobj, ctx, emit->info, emit->indirect, &p);
+         emit_driver_params<CHIP>(emit->ds, dpconstobj, ctx, emit->info, emit->indirect, &p);
      }
   }

+   if (emit->indirect)
+      wait_mem_writes(ctx);
+
   fd6_ctx->has_dp_state = true;

   return dpconstobj;
 }

-template struct fd_ringbuffer * fd6_build_driver_params<HAS_TESS_GS>(struct fd6_emit *emit);
-template struct fd_ringbuffer * fd6_build_driver_params<NO_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_driver_params<A6XX, HAS_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_driver_params<A7XX, HAS_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_driver_params<A6XX, NO_TESS_GS>(struct fd6_emit *emit);
+template struct fd_ringbuffer * fd6_build_driver_params<A7XX, NO_TESS_GS>(struct fd6_emit *emit);

+/* Emit the driver params for a compute dispatch.
+ *
+ * On a7xx with shader consts loaded via preamble, the params built by
+ * ir3_build_driver_params_cs() are uploaded and bound as a driver UBO (if the
+ * variant has a driver-params UBO slot).  For an indirect dispatch the
+ * indirect params are additionally copied into that UBO with mem_to_mem.
+ * Otherwise the params are emitted with ir3_emit_cs_driver_params().
+ *
+ * Both paths request a wait_mem_writes() barrier for indirect dispatches.
+ */
+template <chip CHIP>
 void
 fd6_emit_cs_driver_params(struct fd_context *ctx,
                          struct fd_ringbuffer *ring,
                          struct fd6_compute_state *cs,
                          const struct pipe_grid_info *info)
 {
-   ir3_emit_cs_driver_params(cs->v, ring, ctx, info);
-}
+   /* info->input not handled in the UBO path.  I believe this was only
+    * ever used by clover
+    */
+   assert(!info->input);

+   if (CHIP == A7XX && ctx->screen->info->a7xx.load_shader_consts_via_preamble) {
+      const struct ir3_const_state *const_state = ir3_const_state(cs->v);
+      struct ir3_driver_params_cs compute_params =
+         ir3_build_driver_params_cs(cs->v, info);
+      int base = const_state->driver_params_ubo.idx;
+
+      /* No driver-params UBO slot for this variant, nothing to emit: */
+      if (base < 0)
+         return;
+
+      struct pipe_resource *buffer = NULL;
+      unsigned buffer_offset;
+
+      u_upload_data(ctx->base.const_uploader, 0, sizeof(compute_params),
+                     16, &compute_params,  &buffer_offset, &buffer);
+
+      /* The upload can fail and leave buffer NULL.  Bail out the same way
+       * fd6_upload_emit_driver_ubo() does, rather than handing a NULL
+       * resource to mem_to_mem / fd_resource() below.
+       */
+      if (!buffer)
+         return;
+
+      if (info->indirect) {
+         /* Copy indirect params into UBO: */
+         ctx->screen->mem_to_mem(ring, buffer, buffer_offset, info->indirect,
+                                 info->indirect_offset, 3);
+
+         wait_mem_writes(ctx);
+      } else {
+         fd_ringbuffer_attach_bo(ring, fd_resource(buffer)->bo);
+      }
+
+      fd6_emit_driver_ubo(ring, cs->v, base, dword_sizeof(compute_params),
+                          buffer_offset, fd_resource(buffer)->bo);
+
+      pipe_resource_reference(&buffer, NULL);
+   } else {
+      ir3_emit_cs_driver_params(cs->v, ring, ctx, info);
+      if (info->indirect)
+         wait_mem_writes(ctx);
+   }
+}
+FD_GENX(fd6_emit_cs_driver_params);
+
+template <chip CHIP> /* Compute-stage wrapper around emit_user_consts<CHIP>(). */
 void
 fd6_emit_cs_user_consts(struct fd_context *ctx,
                        struct fd_ringbuffer *ring,
                        struct fd6_compute_state *cs)
 {
-   emit_user_consts(cs->v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]);
+   emit_user_consts<CHIP>(cs->v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]);
 }
+FD_GENX(fd6_emit_cs_user_consts);

+template <chip CHIP>
 void
 fd6_emit_immediates(const struct ir3_shader_variant *v,
                    struct fd_ringbuffer *ring)
@ -383,13 +547,28 @@ fd6_emit_immediates(const struct ir3_shader_variant *v,
                          v->info.constant_data_offset, v->bo);
   }

+   if (CHIP == A7XX && v->compiler->load_inline_uniforms_via_preamble_ldgk)
+      return;
+
   ir3_emit_immediates(v, ring);
 }
+FD_GENX(fd6_emit_immediates);

+template <chip CHIP> /* Emit the producer -> consumer link map for the consumer stage. */
 void
-fd6_emit_link_map(const struct ir3_shader_variant *producer,
+fd6_emit_link_map(struct fd_context *ctx,
+                  const struct ir3_shader_variant *producer,
                   const struct ir3_shader_variant *consumer,
                   struct fd_ringbuffer *ring)
 {
-   ir3_emit_link_map(producer, consumer, ring);
+   if (CHIP == A7XX && producer->compiler->load_shader_consts_via_preamble) {
+      const struct ir3_const_state *const_state = ir3_const_state(consumer);
+      int base = const_state->primitive_map_ubo.idx;  /* negative is skipped by the helper */
+      uint32_t size = ALIGN(consumer->input_size, 4);  /* passed on as the size in dwords */
+      /* The producer's output_loc table is the UBO payload: */
+      fd6_upload_emit_driver_ubo(ctx, ring, consumer, base, size, producer->output_loc);
+   } else {
+      ir3_emit_link_map(producer, consumer, ring);
+   }
 }
+FD_GENX(fd6_emit_link_map);
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.h
@ -9,26 +9,33 @@

 #include "fd6_emit.h"

+template <chip CHIP>
 struct fd_ringbuffer *fd6_build_tess_consts(struct fd6_emit *emit) assert_dt;
+template <chip CHIP>
 unsigned fd6_user_consts_cmdstream_size(const struct ir3_shader_variant *v);

-template <fd6_pipeline_type PIPELINE>
+template <chip CHIP, fd6_pipeline_type PIPELINE>
 struct fd_ringbuffer *fd6_build_user_consts(struct fd6_emit *emit) assert_dt;

-template <fd6_pipeline_type PIPELINE>
+template <chip CHIP, fd6_pipeline_type PIPELINE>
 struct fd_ringbuffer *
 fd6_build_driver_params(struct fd6_emit *emit) assert_dt;

+template <chip CHIP>
 void fd6_emit_cs_driver_params(struct fd_context *ctx,
                               struct fd_ringbuffer *ring,
                               struct fd6_compute_state *cs,
                               const struct pipe_grid_info *info) assert_dt;
+template <chip CHIP>
 void fd6_emit_cs_user_consts(struct fd_context *ctx,
                             struct fd_ringbuffer *ring,
                             struct fd6_compute_state *cs) assert_dt;
+template <chip CHIP>
 void fd6_emit_immediates(const struct ir3_shader_variant *v,
                         struct fd_ringbuffer *ring) assert_dt;
-void fd6_emit_link_map(const struct ir3_shader_variant *producer,
+template <chip CHIP>
+void fd6_emit_link_map(struct fd_context *ctx,
+                       const struct ir3_shader_variant *producer,
                       const struct ir3_shader_variant *consumer,
                       struct fd_ringbuffer *ring) assert_dt;

--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc
@ -679,16 +679,16 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
         fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_BINDLESS);
         break;
      case FD6_GROUP_CONST:
-         state = fd6_build_user_consts<PIPELINE>(emit);
+         state = fd6_build_user_consts<CHIP, PIPELINE>(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_CONST);
         break;
      case FD6_GROUP_DRIVER_PARAMS:
-         state = fd6_build_driver_params<PIPELINE>(emit);
+         state = fd6_build_driver_params<CHIP, PIPELINE>(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DRIVER_PARAMS);
         break;
      case FD6_GROUP_PRIMITIVE_PARAMS:
         if (PIPELINE == HAS_TESS_GS) {
-            state = fd6_build_tess_consts(emit);
+            state = fd6_build_tess_consts<CHIP>(emit);
            fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIMITIVE_PARAMS);
         }
         break;
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
@ -236,7 +236,7 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
      OUT_RELOC(ring, so->bo, 0, 0, 0);
   }

-   fd6_emit_immediates(so, ring);
+   fd6_emit_immediates<CHIP>(so, ring);
 }
 FD_GENX(fd6_emit_shader);

@ -839,8 +839,8 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
      OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
      OUT_RING(ring, b->hs->tess.tcs_vertices_out);

-      fd6_emit_link_map(b->vs, b->hs, ring);
-      fd6_emit_link_map(b->hs, b->ds, ring);
+      fd6_emit_link_map<CHIP>(b->ctx, b->vs, b->hs, ring);
+      fd6_emit_link_map<CHIP>(b->ctx, b->hs, b->ds, ring);
   }

   if (b->gs) {
@ -849,10 +849,11 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
         b->ds ? b->ds->output_size : b->vs->output_size;

      if (b->hs) {
-         fd6_emit_link_map(b->ds, b->gs, ring);
+         fd6_emit_link_map<CHIP>(b->ctx, b->ds, b->gs, ring);
      } else {
-         fd6_emit_link_map(b->vs, b->gs, ring);
+         fd6_emit_link_map<CHIP>(b->ctx, b->vs, b->gs, ring);
      }
+
      vertices_out = MAX2(1, b->gs->gs.vertices_out) - 1;
      enum a6xx_tess_output output =
         primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive);
@ -1451,23 +1452,33 @@ fd6_program_create(void *data, const struct ir3_shader_variant *bs,

   /* Note that binning pass uses same const state as draw pass: */
   state->user_consts_cmdstream_size =
-         fd6_user_consts_cmdstream_size(state->vs) +
-         fd6_user_consts_cmdstream_size(state->hs) +
-         fd6_user_consts_cmdstream_size(state->ds) +
-         fd6_user_consts_cmdstream_size(state->gs) +
-         fd6_user_consts_cmdstream_size(state->fs);
+         fd6_user_consts_cmdstream_size<CHIP>(state->vs) +
+         fd6_user_consts_cmdstream_size<CHIP>(state->hs) +
+         fd6_user_consts_cmdstream_size<CHIP>(state->ds) +
+         fd6_user_consts_cmdstream_size<CHIP>(state->gs) +
+         fd6_user_consts_cmdstream_size<CHIP>(state->fs);

   unsigned num_dp = 0;
+   unsigned num_ubo_dp = 0;
+
   if (vs->need_driver_params)
      num_dp++;
+
   if (gs && gs->need_driver_params)
-      num_dp++;
+      num_ubo_dp++;
   if (hs && hs->need_driver_params)
-      num_dp++;
+      num_ubo_dp++;
   if (ds && ds->need_driver_params)
-      num_dp++;
+      num_ubo_dp++;
+
+   if (!(CHIP == A7XX && vs->compiler->load_inline_uniforms_via_preamble_ldgk)) {
+      /* On a6xx all shader stages use driver params pushed in cmdstream: */
+      num_dp += num_ubo_dp;
+      num_ubo_dp = 0;
+   }

   state->num_driver_params = num_dp;
+   state->num_ubo_driver_params = num_ubo_dp;

   /* dual source blending has an extra fs output in the 2nd slot */
   if (fs->fs.color_is_dual_source) {
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.h
@ -38,13 +38,18 @@ struct fd6_program_state {
    * Whether multiple viewports are used is determined by whether
    * the last shader stage writes viewport id
    */
-   uint16_t num_viewports;
+   uint8_t num_viewports;

   /**
    * The # of shader stages that need driver params.
    */
   uint8_t num_driver_params;

+   /**
+    * The # of shader stages that need ubo driver params
+    */
+   uint8_t num_ubo_driver_params;
+
   /**
    * Output components from frag shader.  It is possible to have
    * a fragment shader that only writes a subset of the bound
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@ -574,6 +574,11 @@ ir3_screen_init(struct pipe_screen *pscreen)
   if (screen->gen >= 6) {
      options.lower_base_vertex = true;
   }
+
+   if (screen->gen >= 7) {
+      options.push_ubo_with_preamble = true;
+   }
+
   screen->compiler =
      ir3_compiler_create(screen->dev, screen->dev_id, screen->info, &options);