turnip,ir3: Implement A7XX push consts load via preamble

New push consts loading consists of:
- Push consts are set for the entire pipeline via HLSQ_SHARED_CONSTS_IMM
  array which can fit up to 256 bytes of push consts.
- For each shader stage that uses push consts READ_IMM_SHARED_CONSTS
  should be set in HLSQ_*_CNTL, otherwise push consts may get overwritten
  by new push consts that are set after the draw.
- Push consts are loaded into consts reg file in a shader preamble via
  stsc at the very start of the preamble.

OPC_PUSH_CONSTS_LOAD_MACRO is used instead of directly translating NIR
intrinsic into stsc because: we don't want to teach legalize pass how
to set (ss) between stores and loads of consts reg file, don't want
stsc to be reordered, etc.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25086>
This commit is contained in:
Danylo Piliaiev 2023-09-05 18:24:03 +02:00 committed by Marge Bot
parent e39b6e2b9b
commit a5f0f7d4b1
21 changed files with 215 additions and 48 deletions

View file

@ -1329,6 +1329,10 @@ store("uniform_ir3", [], indices=[BASE])
# vec4's.
intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
# IR3-specific intrinsic for stsc. Loads from push consts to constant file
# Should be used in the shader preamble.
intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE])
# Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
# within a blend shader to read/write the raw value from the tile buffer,
# without applying any format conversion in the process. If the shader needs

View file

@ -191,6 +191,10 @@ struct fd_dev_info {
} a6xx;
struct {
/* stsc may need to be done twice for the same range to workaround
* _something_, observed in blob's disassembly.
*/
bool stsc_duplication_quirk;
} a7xx;
};

View file

@ -704,7 +704,9 @@ add_gpus([
a7xx_730 = A7XXProps()
a7xx_740 = A7XXProps()
a7xx_740 = A7XXProps(
stsc_duplication_quirk = True,
)
add_gpus([
GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data

View file

@ -194,6 +194,7 @@ static const struct opc_info {
OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
OPC(1, OPC_SCAN_MACRO, scan.macro),
OPC(1, OPC_SHPS_MACRO, shps.macro),
OPC(1, OPC_PUSH_CONSTS_LOAD_MACRO, push_consts_load.macro),
/* category 2: */
OPC(2, OPC_ADD_F, add.f),

View file

@ -131,6 +131,11 @@ typedef enum {
/* Macros that expand to a loop */
OPC_SCAN_MACRO = _OPC(1, 58),
/* Macros that expand to an stsc at the start of the preamble.
* It loads into const file and should not be optimized in any way.
*/
OPC_PUSH_CONSTS_LOAD_MACRO = _OPC(1, 59),
/* category 2: */
OPC_ADD_F = _OPC(2, 0),
OPC_MIN_F = _OPC(2, 1),
@ -406,7 +411,7 @@ typedef enum {
/*
* A manually encoded opcode
*/
OPC_META_RAW = _OPC(OPC_META, 7)
OPC_META_RAW = _OPC(OPC_META, 7),
} opc_t;
/* clang-format on */

View file

@ -445,6 +445,10 @@ struct ir3_instruction {
*/
gl_system_value sysval;
} input;
struct {
unsigned src_base, src_size;
unsigned dst_base;
} push_consts;
struct {
uint64_t value;
} raw;
@ -2485,6 +2489,7 @@ INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
INSTR2NODST(STSC)
#ifndef GPU
#elif GPU >= 600
INSTR3NODST(STIB);

View file

@ -202,13 +202,13 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->shared_consts_size = 8;
compiler->geom_shared_consts_size_quirk = 16;
} else {
/* A7XX TODO: properly use new shared consts mechanism */
compiler->shared_consts_base_offset = -1;
compiler->shared_consts_size = 0;
compiler->geom_shared_consts_size_quirk = 0;
}
compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;

View file

@ -245,6 +245,8 @@ struct ir3_compiler {
uint64_t geom_shared_consts_size_quirk;
bool has_fs_tex_prefetch;
bool stsc_duplication_quirk;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);

View file

@ -2678,6 +2678,16 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
array_insert(b, b->keeps, stc);
break;
}
case nir_intrinsic_copy_push_const_to_uniform_ir3: {
struct ir3_instruction *load =
ir3_instr_create(ctx->block, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0);
array_insert(b, b->keeps, load);
load->push_consts.dst_base = nir_src_as_uint(intr->src[0]);
load->push_consts.src_base = nir_intrinsic_base(intr);
load->push_consts.src_size = nir_intrinsic_range(intr);
break;
}
default:
ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
nir_intrinsic_infos[intr->intrinsic].name);

View file

@ -58,6 +58,7 @@ struct ir3_legalize_state {
regmask_t needs_ss;
regmask_t needs_ss_war; /* write after read */
regmask_t needs_sy;
bool needs_ss_for_const;
};
struct ir3_legalize_block_data {
@ -65,6 +66,17 @@ struct ir3_legalize_block_data {
struct ir3_legalize_state state;
};
/* Set the (ss) sync bit on "instr" and clear all legalize state that the
 * sync satisfies: the RAW (needs_ss) and WAR (needs_ss_war) register masks
 * and the pending const-file sync flag.
 */
static inline void
apply_ss(struct ir3_instruction *instr,
         struct ir3_legalize_state *state,
         bool mergedregs)
{
   instr->flags |= IR3_INSTR_SS;

   /* An (ss) waits on everything tracked so far, so the accumulated
    * state can be reset wholesale.
    */
   state->needs_ss_for_const = false;
   regmask_init(&state->needs_ss, mergedregs);
   regmask_init(&state->needs_ss_war, mergedregs);
}
/* We want to evaluate each block from the position of any other
* predecessor block, in order that the flags set are the union of
* all possible program paths.
@ -109,6 +121,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
regmask_or(&state->needs_ss_war, &state->needs_ss_war,
&pstate->needs_ss_war);
regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
state->needs_ss_for_const |= pstate->needs_ss_for_const;
}
/* We need to take physical-only edges into account when tracking shared
@ -162,17 +175,15 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
}
if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
last_input_needs_ss = false;
regmask_init(&state->needs_ss_war, mergedregs);
regmask_init(&state->needs_ss, mergedregs);
apply_ss(n, state, mergedregs);
n->flags |= IR3_INSTR_SY;
regmask_init(&state->needs_sy, mergedregs);
last_input_needs_ss = false;
}
if (last_n && (last_n->opc == OPC_PREDT)) {
n->flags |= IR3_INSTR_SS;
regmask_init(&state->needs_ss_war, mergedregs);
regmask_init(&state->needs_ss, mergedregs);
apply_ss(n, state, mergedregs);
}
/* NOTE: consider dst register too.. it could happen that
@ -195,25 +206,24 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
* some tests for both this and (sy)..
*/
if (regmask_get(&state->needs_ss, reg)) {
n->flags |= IR3_INSTR_SS;
apply_ss(n, state, mergedregs);
last_input_needs_ss = false;
regmask_init(&state->needs_ss_war, mergedregs);
regmask_init(&state->needs_ss, mergedregs);
}
if (regmask_get(&state->needs_sy, reg)) {
n->flags |= IR3_INSTR_SY;
regmask_init(&state->needs_sy, mergedregs);
}
} else if ((reg->flags & IR3_REG_CONST) && state->needs_ss_for_const) {
apply_ss(n, state, mergedregs);
last_input_needs_ss = false;
}
}
foreach_dst (reg, n) {
if (regmask_get(&state->needs_ss_war, reg)) {
n->flags |= IR3_INSTR_SS;
apply_ss(n, state, mergedregs);
last_input_needs_ss = false;
regmask_init(&state->needs_ss_war, mergedregs);
regmask_init(&state->needs_ss, mergedregs);
}
}
@ -230,7 +240,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
}
/* need to be able to set (ss) on first instruction: */
if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n))
ir3_NOP(block);
if (ctx->compiler->samgq_workaround &&
@ -281,6 +291,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
} else {
regmask_set(&state->needs_ss, n->dsts[0]);
}
} else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
state->needs_ss_for_const = true;
}
if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
@ -324,9 +336,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
last_input->dsts[0]->flags |= IR3_REG_EI;
if (last_input_needs_ss) {
last_input->flags |= IR3_INSTR_SS;
regmask_init(&state->needs_ss_war, mergedregs);
regmask_init(&state->needs_ss, mergedregs);
apply_ss(last_input, state, mergedregs);
}
}
}
@ -407,6 +417,36 @@ apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
return true;
}
/* Expand OPC_PUSH_CONSTS_LOAD_MACRO into an actual stsc instruction.
 * The macro is only expected as the first non-meta instruction of the
 * preamble block; the scan stops at the first real instruction either way.
 */
static void
apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
                             struct ir3_block *block)
{
   foreach_instr (n, &block->instr_list) {
      if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
         /* stsc takes two immediate srcs: destination offset in the const
          * file, then source offset in the shared consts; the copy size is
          * carried in cat6.iim_val.
          */
         struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2);
         ir3_instr_move_after(stsc, n);
         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
            n->push_consts.dst_base;
         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
            n->push_consts.src_base;
         stsc->cat6.iim_val = n->push_consts.src_size;
         stsc->cat6.type = TYPE_U32;

         if (ctx->compiler->stsc_duplication_quirk) {
            /* Workaround observed in blob disassembly: emit the same stsc
             * twice with an (ss) nop in between (exact reason unknown, see
             * stsc_duplication_quirk).
             */
            struct ir3_instruction *nop = ir3_NOP(block);
            ir3_instr_move_after(nop, stsc);
            nop->flags |= IR3_INSTR_SS;
            ir3_instr_move_after(ir3_instr_clone(stsc), nop);
         }

         /* Drop the macro now that it has been expanded. */
         list_delinit(&n->node);
         break;
      } else if (!is_meta(n)) {
         break;
      }
   }
}
/* NOTE: branch instructions are always the last instruction(s)
* in the block. We take advantage of this as we resolve the
* branches, since "if (foo) break;" constructs turn into
@ -1180,6 +1220,13 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
progress |= apply_fine_deriv_macro(ctx, block);
}
foreach_block (block, &ir->block_list) {
if (block->brtype == IR3_BRANCH_GETONE) {
apply_push_consts_load_macro(ctx, block->successors[0]);
break;
}
}
nop_sched(ir, so);
while (opt_jump(ir))

View file

@ -747,6 +747,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
if (so->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE)
progress |= OPT(s, ir3_nir_lower_push_consts_to_preamble, so);
progress |= OPT(s, ir3_nir_lower_preamble, so);
OPT_V(s, nir_lower_amul, ir3_glsl_type_size);

View file

@ -40,6 +40,8 @@ bool ir3_nir_lower_imul(nir_shader *shader);
bool ir3_nir_lower_io_offsets(nir_shader *shader);
bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader);
bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader);
bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
struct ir3_shader_variant *v);
bool ir3_nir_move_varying_inputs(nir_shader *shader);
int ir3_nir_coord_offset(nir_def *ssa);
bool ir3_nir_lower_tex_prefetch(nir_shader *shader);

View file

@ -0,0 +1,28 @@
/*
* Copyright © 2023 Igalia S.L.
* SPDX-License-Identifier: MIT
*/
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"
/* Insert a copy_push_const_to_uniform_ir3 intrinsic at the very start of
 * the shader preamble, loading the variant's push-const range (base/dwords
 * from shader_options) into the const file at offset 0.  Later lowered to
 * OPC_PUSH_CONSTS_LOAD_MACRO and finally an stsc.
 * Always returns true (the preamble is always modified).
 */
bool
ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
                                      struct ir3_shader_variant *v)
{
   nir_function_impl *preamble = nir_shader_get_preamble(nir);
   nir_builder _b = nir_builder_at(nir_before_impl(preamble));
   nir_builder *b = &_b;

   /* src[0] is the destination offset in the const file (0 here);
    * base/range select which push-const dwords to copy.
    */
   nir_copy_push_const_to_uniform_ir3(
      b, nir_imm_int(b, 0), .base = v->shader_options.push_consts_base,
      .range = v->shader_options.push_consts_dwords);

   nir_foreach_function_impl(impl, nir) {
      nir_metadata_preserve(impl, nir_metadata_none);
   }
   return true;
}

View file

@ -691,6 +691,10 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
if (instr->opc == OPC_META_TEX_PREFETCH)
schedule(ctx, instr);
foreach_instr_safe (instr, &ctx->unscheduled_list)
if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
schedule(ctx, instr);
while (!list_is_empty(&ctx->unscheduled_list)) {
struct ir3_instruction *instr = choose_instr(ctx);

View file

@ -181,7 +181,8 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
}
}
if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO) {
if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO &&
instr->opc != OPC_PUSH_CONSTS_LOAD_MACRO) {
mesa_log_stream_printf(stream, ".%s%s",
type_name(instr->cat1.src_type),
type_name(instr->cat1.dst_type));
@ -405,6 +406,11 @@ print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl)
mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d",
instr->prefetch.tex, instr->prefetch.samp,
instr->prefetch.input_offset);
} else if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
mesa_log_stream_printf(
stream, " dst_offset=%d, src_offset = %d, src_size = %d",
instr->push_consts.dst_base, instr->push_consts.src_base,
instr->push_consts.src_size);
}
if (is_flow(instr) && instr->cat0.target) {

View file

@ -1235,6 +1235,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
if (instr->opc == OPC_META_TEX_PREFETCH)
schedule(ctx, instr);
foreach_instr_safe (instr, &ctx->unscheduled_list)
if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
schedule(ctx, instr);
while (!list_is_empty(&ctx->unscheduled_list)) {
struct ir3_sched_notes notes = {0};
struct ir3_instruction *instr;

View file

@ -151,6 +151,7 @@ enum ir3_push_consts_type {
IR3_PUSH_CONSTS_NONE,
IR3_PUSH_CONSTS_PER_STAGE,
IR3_PUSH_CONSTS_SHARED,
IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};
/**
@ -507,6 +508,9 @@ struct ir3_shader_options {
*/
enum ir3_wavesize_option real_wavesize;
enum ir3_push_consts_type push_consts_type;
uint32_t push_consts_base;
uint32_t push_consts_dwords;
};
/**

View file

@ -95,6 +95,7 @@ libfreedreno_ir3_files = files(
'ir3_nir_lower_64b.c',
'ir3_nir_lower_load_barycentric_at_sample.c',
'ir3_nir_lower_load_barycentric_at_offset.c',
'ir3_nir_lower_push_consts_to_preamble.c',
'ir3_nir_lower_io_offsets.c',
'ir3_nir_lower_tess.c',
'ir3_nir_lower_tex_prefetch.c',

View file

@ -4256,9 +4256,10 @@ tu6_user_consts_size(const struct tu_const_state *const_state,
{
uint32_t dwords = 0;
if (const_state->push_consts.dwords > 0) {
if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
unsigned num_units = const_state->push_consts.dwords;
dwords += 4 + num_units;
assert(num_units > 0);
}
dwords += 8 * const_state->num_inline_ubos;
@ -4267,12 +4268,10 @@ tu6_user_consts_size(const struct tu_const_state *const_state,
}
static void
tu6_emit_user_consts(struct tu_cs *cs,
const struct tu_const_state *const_state,
unsigned constlen,
gl_shader_stage type,
struct tu_descriptor_state *descriptors,
uint32_t *push_constants)
tu6_emit_per_stage_push_consts(struct tu_cs *cs,
const struct tu_const_state *const_state,
gl_shader_stage type,
uint32_t *push_constants)
{
if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
unsigned num_units = const_state->push_consts.dwords;
@ -4291,7 +4290,15 @@ tu6_emit_user_consts(struct tu_cs *cs,
for (unsigned i = 0; i < num_units; i++)
tu_cs_emit(cs, push_constants[i + offset]);
}
}
static void
tu6_emit_inline_ubo(struct tu_cs *cs,
const struct tu_const_state *const_state,
unsigned constlen,
gl_shader_stage type,
struct tu_descriptor_state *descriptors)
{
/* Emit loads of inline uniforms. These load directly from the uniform's
* storage space inside the descriptor set.
*/
@ -4349,6 +4356,18 @@ tu6_emit_shared_consts(struct tu_cs *cs,
}
}
/* A7XX: write the push constants directly into the HLSQ_SHARED_CONSTS_IMM
 * register array; each shader's preamble then copies them into its const
 * file (via stsc).  Only the dwords in the given range are emitted.
 */
static void
tu7_emit_shared_preamble_consts(
   struct tu_cs *cs,
   const struct tu_push_constant_range *shared_consts,
   uint32_t *push_constants)
{
   tu_cs_emit_pkt4(cs, REG_A7XX_HLSQ_SHARED_CONSTS_IMM(shared_consts->lo),
                   shared_consts->dwords);
   tu_cs_emit_array(cs, push_constants + shared_consts->lo,
                    shared_consts->dwords);
}
static uint32_t
tu6_const_size(struct tu_cmd_buffer *cmd,
const struct tu_push_constant_range *shared_consts,
@ -4358,6 +4377,8 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
dwords += shared_consts->dwords + 4;
} else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
dwords += shared_consts->dwords + 1;
}
if (compute) {
@ -4372,8 +4393,7 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
}
static struct tu_draw_state
tu6_emit_consts(struct tu_cmd_buffer *cmd,
bool compute)
tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
{
uint32_t dwords = 0;
const struct tu_push_constant_range *shared_consts =
@ -4390,24 +4410,30 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd,
if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute);
} else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants);
}
if (compute) {
tu6_emit_user_consts(&cs,
&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
MESA_SHADER_COMPUTE,
tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE),
cmd->push_constants);
tu6_emit_per_stage_push_consts(
&cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
MESA_SHADER_COMPUTE, cmd->push_constants);
tu6_emit_inline_ubo(
&cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
MESA_SHADER_COMPUTE,
tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE));
} else {
struct tu_descriptor_state *descriptors =
struct tu_descriptor_state *descriptors =
tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) {
const struct tu_program_descriptor_linkage *link =
&cmd->state.program.link[type];
tu6_emit_user_consts(&cs, &link->tu_const_state, link->constlen,
(gl_shader_stage) type,
descriptors, cmd->push_constants);
tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state,
(gl_shader_stage) type,
cmd->push_constants);
tu6_emit_inline_ubo(&cs, &link->tu_const_state, link->constlen,
(gl_shader_stage) type, descriptors);
}
}
@ -4751,7 +4777,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
}
if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
cmd->state.shader_const = tu6_emit_consts(cmd, false);
cmd->state.shader_const = tu_emit_consts(cmd, false);
if (dirty & TU_CMD_DIRTY_DESC_SETS)
tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
@ -5502,7 +5528,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
tu_emit_cache_flush<CHIP>(cmd);
/* note: no reason to have this in a separate IB */
tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, true));
tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true));
tu_emit_compute_driver_params<CHIP>(cmd, cs, info);

View file

@ -318,7 +318,11 @@ tu_push_consts_type(const struct tu_pipeline_layout *layout,
if (tu6_shared_constants_enable(layout, compiler)) {
return IR3_PUSH_CONSTS_SHARED;
} else {
return IR3_PUSH_CONSTS_PER_STAGE;
if (compiler->gen >= 7) {
return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
} else {
return IR3_PUSH_CONSTS_PER_STAGE;
}
}
}
@ -385,7 +389,9 @@ tu6_emit_xs_config(struct tu_cs *cs,
tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
A6XX_HLSQ_VS_CNTL_ENABLED);
A6XX_HLSQ_VS_CNTL_ENABLED |
COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
}
TU_GENX(tu6_emit_xs_config);
@ -2335,10 +2341,11 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
&pipeline->shaders[i]->const_state,
variants[i]);
if (pipeline->shaders[i]->const_state.push_consts.type ==
IR3_PUSH_CONSTS_SHARED) {
pipeline->program.shared_consts =
pipeline->shaders[i]->const_state.push_consts;
struct tu_push_constant_range *push_consts =
&pipeline->shaders[i]->const_state.push_consts;
if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
pipeline->program.shared_consts = *push_consts;
}
}

View file

@ -2286,6 +2286,8 @@ tu_shader_create(struct tu_device *dev,
.api_wavesize = key->api_wavesize,
.real_wavesize = key->real_wavesize,
.push_consts_type = shader->const_state.push_consts.type,
.push_consts_base = shader->const_state.push_consts.lo,
.push_consts_dwords = shader->const_state.push_consts.dwords,
};
struct ir3_shader *ir3_shader =