nir: Generate load_ubo_vec4 directly for !PIPE_CAP_NATIVE_INTEGERS

The prog_to_nir->NIR-to-TGSI change ended up causing regressions on r300
and on svga against r300-class hardware, because nir_lower_uniforms_to_ubo()
introduced shifts that nir_lower_ubo_vec4() then tried to reverse, but which
NIR couldn't prove to be no-ops (shifting up and back down may drop bits),
and the hardware can't do the integer ops.

Instead, make it so that nir_lower_uniforms_to_ubo can generate
nir_intrinsic_load_ubo_vec4 directly for !INTEGER hardware.

Fixes: cf3fc79cd0 ("st/mesa: Replace mesa_to_tgsi() with prog_to_nir() and nir_to_tgsi().")
Closes: #4602
Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10194>
Eric Anholt 2021-04-09 16:10:30 -07:00 committed by Marge Bot
parent 71d6d1b1ab
commit 5de3cbbb2e
11 changed files with 67 additions and 63 deletions
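
As a reading aid, here is a minimal sketch, not part of the commit, of how a gallium caller would invoke the pass after this change, mirroring the nir_to_tgsi hunk below; the wrapper name and the native_integers parameter are hypothetical:

#include "nir.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Hypothetical wrapper: dword_packed follows PIPE_CAP_PACKED_UNIFORMS, and
 * load_vec4 is requested for hardware without native integers, so the pass
 * emits nir_intrinsic_load_ubo_vec4 directly instead of byte-addressed
 * load_ubo plus shift math that such hardware cannot execute.
 */
static void
lower_uniforms_for_screen(nir_shader *s, struct pipe_screen *screen,
                          bool native_integers)
{
   NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
              screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS) != 0,
              !native_integers);
}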


@@ -481,7 +481,8 @@ The integer capabilities:
those bits set, pipe_context::set_constant_buffer(.., 0, ..) is ignored
by the driver, and the driver can throw assertion failures.
* ``PIPE_CAP_PACKED_UNIFORMS``: True if the driver supports packed uniforms
as opposed to padding to vec4s.
as opposed to padding to vec4s. Requires ``PIPE_SHADER_CAP_INTEGERS`` if
``lower_uniforms_to_ubo`` is set.
* ``PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES``: Whether the
``PIPE_CONSERVATIVE_RASTER_POST_SNAP`` mode is supported for triangles.
The post-snap mode means the conservative rasterization occurs after
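
To illustrate the documentation constraint added above, a hypothetical driver-side sanity check (not part of this commit) could look like the following; the function name is made up, and it assumes a screen that uses nir_lower_uniforms_to_ubo:

#include <assert.h>
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Packed (dword) uniform addressing relies on integer offset math, so a
 * screen that packs uniforms and lowers them to UBO 0 must also report
 * integer support. */
static void
check_packed_uniforms_requirement(struct pipe_screen *screen)
{
   if (screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS))
      assert(screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
                                      PIPE_SHADER_CAP_INTEGERS));
}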


@@ -3435,8 +3435,6 @@ typedef struct nir_shader_compiler_options {
unsigned max_unroll_iterations;
unsigned max_unroll_iterations_aggressive;
/* For the non-zero value of the enum corresponds multiplier when
* calling lower_uniforms_to_ubo */
bool lower_uniforms_to_ubo;
nir_lower_int64_options lower_int64_options;
@@ -4633,7 +4631,7 @@ bool nir_vectorize_tess_levels(nir_shader *shader);
bool nir_lower_fragcolor(nir_shader *shader);
bool nir_lower_fragcoord_wtrans(nir_shader *shader);
void nir_lower_viewport_transform(nir_shader *shader);
bool nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier);
bool nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4);
typedef struct nir_lower_subgroups_options {
uint8_t subgroup_size;


@@ -22,27 +22,24 @@
*/
/*
* Remap load_uniform intrinsics to UBO accesses of UBO binding point 0.
* Simultaneously, remap existing UBO accesses by increasing their binding
* point by 1.
* Remap load_uniform intrinsics to nir_load_ubo or nir_load_ubo_vec4 accesses
* of UBO binding point 0. Simultaneously, remap existing UBO accesses by
* increasing their binding point by 1.
*
* Note that nir_intrinsic_load_uniform base/ranges can be set in different
* units, and the multiplier argument caters to supporting these different
* units.
* For PIPE_CAP_PACKED_UNIFORMS, dword_packed should be set to indicate that
* nir_intrinsic_load_uniform is in increments of dwords instead of vec4s.
*
* For example:
* - st_glsl_to_nir for PIPE_CAP_PACKED_UNIFORMS uses dwords (4 bytes) so the
* multiplier should be 4
* - st_glsl_to_nir for !PIPE_CAP_PACKED_UNIFORMS uses vec4s so the
* multiplier should be 16
* - tgsi_to_nir uses vec4s, so the multiplier should be 16
* If load_vec4 is set, then nir_intrinsic_load_ubo_vec4 will be generated
instead of nir_intrinsic_load_ubo, saving addressing math for hardware
* needing aligned vec4 loads in increments of vec4s (such as TGSI CONST file
* loads).
*/
#include "nir.h"
#include "nir_builder.h"
static bool
lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
lower_instr(nir_intrinsic_instr *instr, nir_builder *b, bool dword_packed, bool load_vec4)
{
b->cursor = nir_before_instr(&instr->instr);
@@ -58,43 +55,51 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
if (instr->intrinsic == nir_intrinsic_load_uniform) {
nir_ssa_def *ubo_idx = nir_imm_int(b, 0);
nir_ssa_def *ubo_offset =
nir_iadd(b, nir_imm_int(b, multiplier * nir_intrinsic_base(instr)),
nir_imul(b, nir_imm_int(b, multiplier),
nir_ssa_for_src(b, instr->src[0], 1)));
nir_ssa_def *uniform_offset = nir_ssa_for_src(b, instr->src[0], 1);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
load->num_components = instr->num_components;
load->src[0] = nir_src_for_ssa(ubo_idx);
load->src[1] = nir_src_for_ssa(ubo_offset);
assert(instr->dest.ssa.bit_size >= 8);
/* If it's const, set the alignment to our known constant offset. If
* not, set it to a pessimistic value based on the multiplier (or the
* scalar size, for qword loads).
*
* We could potentially set up stricter alignments for indirects by
* knowing what features are enabled in the APIs (see comment in
* nir_lower_ubo_vec4.c)
*/
if (nir_src_is_const(instr->src[0])) {
nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX,
(nir_src_as_uint(instr->src[0]) +
nir_intrinsic_base(instr) * multiplier) %
NIR_ALIGN_MUL_MAX);
nir_ssa_def *load_result;
if (load_vec4) {
/* No asking us to generate load_vec4 when you've packed your uniforms
* as dwords instead of vec4s.
*/
assert(!dword_packed);
load_result = nir_load_ubo_vec4(b, instr->num_components, instr->dest.ssa.bit_size,
ubo_idx,
nir_iadd_imm(b, uniform_offset, nir_intrinsic_base(instr)));
} else {
nir_intrinsic_set_align(load, MAX2(multiplier,
instr->dest.ssa.bit_size / 8), 0);
}
nir_ssa_dest_init(&load->instr, &load->dest,
load->num_components, instr->dest.ssa.bit_size,
instr->dest.ssa.name);
nir_builder_instr_insert(b, &load->instr);
nir_ssa_def_rewrite_uses(&instr->dest.ssa, &load->dest.ssa);
/* For PIPE_CAP_PACKED_UNIFORMS, the uniforms are packed with the
* base/offset in dword units instead of vec4 units.
*/
int multiplier = dword_packed ? 4 : 16;
load_result = nir_load_ubo(b, instr->num_components, instr->dest.ssa.bit_size,
ubo_idx,
nir_iadd_imm(b, nir_imul_imm(b, uniform_offset, multiplier),
nir_intrinsic_base(instr) * multiplier));
nir_intrinsic_instr *load = nir_instr_as_intrinsic(load_result->parent_instr);
nir_intrinsic_set_range_base(load, nir_intrinsic_base(instr) * multiplier);
nir_intrinsic_set_range(load, nir_intrinsic_range(instr) * multiplier);
/* If it's const, set the alignment to our known constant offset. If
* not, set it to a pessimistic value based on the multiplier (or the
* scalar size, for qword loads).
*
* We could potentially set up stricter alignments for indirects by
* knowing what features are enabled in the APIs (see comment in
* nir_lower_ubo_vec4.c)
*/
if (nir_src_is_const(instr->src[0])) {
nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX,
(nir_src_as_uint(instr->src[0]) +
nir_intrinsic_base(instr) * multiplier) %
NIR_ALIGN_MUL_MAX);
} else {
nir_intrinsic_set_align(load, MAX2(multiplier,
instr->dest.ssa.bit_size / 8), 0);
}
nir_intrinsic_set_range_base(load, nir_intrinsic_base(instr) * multiplier);
nir_intrinsic_set_range(load, nir_intrinsic_range(instr) * multiplier);
}
nir_ssa_def_rewrite_uses(&instr->dest.ssa, load_result);
nir_instr_remove(&instr->instr);
return true;
@@ -104,7 +109,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
}
bool
nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier)
nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4)
{
bool progress = false;
@@ -117,7 +122,7 @@ nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier)
if (instr->type == nir_instr_type_intrinsic)
progress |= lower_instr(nir_instr_as_intrinsic(instr),
&builder,
multiplier);
dword_packed, load_vec4);
}
}
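
For clarity, a minimal sketch, not part of the commit, of what the new load_vec4 path produces, written in the style of the unit test further down; it assumes an existing nir_builder *b positioned inside a shader whose uniforms use vec4-unit addressing:

#include "nir_builder.h"

/* The uniform load below uses vec4 units (dword_packed = false). */
static void
example_load_vec4_lowering(nir_builder *b)
{
   /* vec4 uniform read at indirect vec4 offset 2 (plus the intrinsic's base). */
   nir_ssa_def *offset = nir_imm_int(b, 2);
   nir_load_uniform(b, 4, 32, offset);

   /* With load_vec4 = true, the pass rewrites this to
    * load_ubo_vec4(ubo = 0, offset = base + 2): no imul/ishr pairs are
    * emitted, so there is nothing for nir_lower_ubo_vec4() to undo and
    * no integer math the non-integer hardware has to execute. */
   nir_lower_uniforms_to_ubo(b->shader, false /* dword_packed */,
                             true /* load_vec4 */);
}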


@@ -100,7 +100,7 @@ draw_create_vs_llvm(struct draw_context *draw,
vs->base.state.ir.nir = state->ir.nir;
nir_shader *nir = (nir_shader *)state->ir.nir;
if (!nir->options->lower_uniforms_to_ubo)
NIR_PASS_V(state->ir.nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(state->ir.nir, nir_lower_uniforms_to_ubo, false, false);
nir_tgsi_scan_shader(state->ir.nir, &vs->base.info, true);
} else {
/* we make a private copy of the tokens */


@@ -2718,8 +2718,8 @@ nir_to_tgsi(struct nir_shader *s,
if (!original_options->lower_uniforms_to_ubo) {
NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS) ?
4 : 16);
screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
!native_integers);
}
/* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --


@@ -2496,7 +2496,7 @@ ttn_finalize_nir(struct ttn_compile *c, struct pipe_screen *screen)
}
if (nir->options->lower_uniforms_to_ubo)
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
if (!c->cap_samplers_as_deref)
NIR_PASS_V(nir, nir_lower_samplers);


@@ -147,7 +147,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
uint32_t num_ubos_before_lower_to_ubo = nir->info.num_ubos;
uint32_t num_uniforms_before_lower_to_ubo = nir->num_uniforms;
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
shader->has_default_ubo0 = num_uniforms_before_lower_to_ubo > 0 &&
nir->info.num_ubos > num_ubos_before_lower_to_ubo;


@@ -123,7 +123,7 @@ TEST_F(nir_lower_ubo_test, basic)
nir_ssa_def *offset = nir_imm_int(&b, 4);
nir_load_uniform(&b, 1, 32, offset);
nir_lower_uniforms_to_ubo(b.shader, 16);
nir_lower_uniforms_to_ubo(b.shader, false, false);
nir_opt_constant_folding(b.shader);
ASSERT_TRUE(etna_nir_lower_ubo_to_uniform(b.shader));


@@ -914,7 +914,7 @@ zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr, bool optimize)
tex_opts.lower_tg4_offsets = true;
NIR_PASS_V(nir, nir_lower_tex, &tex_opts);
}
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
if (nir->info.stage == MESA_SHADER_GEOMETRY)
NIR_PASS_V(nir, nir_lower_gs_intrinsics, nir_lower_gs_intrinsics_per_stream);
optimize_nir(nir);


@@ -1005,12 +1005,10 @@ st_unpacked_uniforms_type_size(const struct glsl_type *type, bool bindless)
void
st_nir_lower_uniforms(struct st_context *st, nir_shader *nir)
{
unsigned multiplier = 16;
if (st->ctx->Const.PackedDriverUniformStorage) {
NIR_PASS_V(nir, nir_lower_io, nir_var_uniform,
st_packed_uniforms_type_size,
(nir_lower_io_options)0);
multiplier = 4;
} else {
NIR_PASS_V(nir, nir_lower_io, nir_var_uniform,
st_unpacked_uniforms_type_size,
@@ -1018,7 +1016,9 @@ st_nir_lower_uniforms(struct st_context *st, nir_shader *nir)
}
if (nir->options->lower_uniforms_to_ubo)
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, multiplier);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo,
st->ctx->Const.PackedDriverUniformStorage,
!st->ctx->Const.NativeIntegers);
}
/* Last third of preparing nir from glsl, which happens after shader


@@ -338,7 +338,7 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
* to UBO ordinarily, but it isn't as aggressive as we need. */
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
do {
progress = false;