dxil: Delete load_ubo_dxil intrinsic

Instead of splitting unaligned UBO loads while still using derefs
and then lowering load_ubo to load_ubo_dxil in lower_loads_stores_to_dxil,
use lower_mem_access_bit_sizes and lower_ubo_vec4 to handle the load size
and alignment restrictions and convert to load_ubo_vec4, which has the
same semantics as load_ubo_dxil.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3842
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23173>
Jesse Natalie 2023-05-22 10:03:36 -07:00 committed by Marge Bot
parent 42877c8b63
commit ecfbc16f61
6 changed files with 70 additions and 120 deletions
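
As context for the change above: load_ubo_vec4, like the deleted load_ubo_dxil, addresses the UBO in 16-byte vec4 slots (the granularity of DXIL's CBufferLoadLegacy) rather than in bytes. A minimal standalone sketch of that addressing, not code from this commit and with made-up helper names:

#include <stdint.h>
#include <stdio.h>

/* Map a byte offset within a UBO to the 16-byte slot index and the 32-bit
 * component that a load_ubo_vec4 / load_ubo_dxil style load reads. */
struct ubo_vec4_addr {
   uint32_t slot; /* 16-byte slot index passed to the intrinsic */
   uint32_t comp; /* 32-bit component within the returned vec4 */
};

static struct ubo_vec4_addr
byte_offset_to_vec4_addr(uint32_t byte_offset)
{
   struct ubo_vec4_addr a;
   a.slot = byte_offset >> 4;       /* offset / 16, as in the deleted lowering */
   a.comp = (byte_offset >> 2) & 3; /* which dword of the slot */
   return a;
}

int main(void)
{
   /* A 32-bit scalar load at byte offset 20 reads component 1 of slot 1. */
   struct ubo_vec4_addr a = byte_offset_to_vec4_addr(20);
   printf("slot=%u comp=%u\n", a.slot, a.comp);
   return 0;
}

nir_lower_ubo_vec4 performs this conversion on real NIR, including selecting components out of the loaded vec4 and handling loads that straddle two slots.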


@@ -1259,8 +1259,6 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
# DXIL specific intrinsics
# src[] = { value, mask, index, offset }.
intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1])
# src[] = { index, 16-byte-based-offset }
load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE, CAN_REORDER])
# Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
# within a blend shader to read/write the raw value from the tile buffer,


@@ -909,7 +909,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~nir_var_mem_constant);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~(nir_var_mem_constant | nir_var_mem_ubo));
assert(nir->info.cs.ptr_size == 64);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
@@ -969,7 +969,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
}
NIR_PASS_V(nir, clc_nir_lower_kernel_input_loads, inputs_var);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ubo);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
nir_address_format_32bit_index_offset);
NIR_PASS_V(nir, clc_nir_lower_system_values, work_properties_var);


@@ -67,89 +67,6 @@ load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
return nir_vec(b, dst_comps, num_dst_comps);
}
static nir_ssa_def *
ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
nir_ssa_def *offset, unsigned alignment)
{
assert(alignment >= 16 || alignment == 8 ||
alignment == 4 || alignment == 2 ||
alignment == 1);
assert(vec32->num_components == 4);
if (alignment > 8)
return vec32;
nir_ssa_def *comps[4];
nir_ssa_def *cond;
for (unsigned i = 0; i < 4; i++)
comps[i] = nir_channel(b, vec32, i);
/* If we have 8-byte alignment or less, select which half of the vec4 should
* be used.
*/
cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)),
nir_imm_int(b, 0));
comps[0] = nir_bcsel(b, cond, comps[2], comps[0]);
comps[1] = nir_bcsel(b, cond, comps[3], comps[1]);
if (alignment == 8)
return nir_vec(b, comps, 2);
/* With 4-byte alignment or less, select which 32-bit component should be
* used and return it. The sub-32-bit split is handled in nir_extract_bits().
*/
cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)),
nir_imm_int(b, 0));
return nir_bcsel(b, cond, comps[1], comps[0]);
}
nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
nir_ssa_def *offset, unsigned num_components,
unsigned bit_size, unsigned alignment)
{
nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4));
nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
unsigned num_bits = num_components * bit_size;
unsigned comp_idx = 0;
/* We need to split loads in 16byte chunks because that's the
* granularity of cBufferLoadLegacy().
*/
for (unsigned i = 0; i < num_bits; i += (16 * 8)) {
/* For each 16byte chunk (or smaller) we generate a 32bit ubo vec
* load.
*/
unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8);
nir_ssa_def *vec32 =
nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8))));
/* First re-arrange the vec32 to account for intra 16-byte offset. */
assert(subload_num_bits / 8 <= alignment);
vec32 = ubo_load_select_32b_comps(b, vec32, offset, alignment);
/* If we have 2 bytes or less to load we need to adjust the u32 value so
* we can always extract the LSB.
*/
if (alignment <= 2) {
nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset,
nir_imm_int(b, 3)),
nir_imm_int(b, 8));
vec32 = nir_ushr(b, vec32, shift);
}
/* And now comes the pack/unpack step to match the original type. */
nir_ssa_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, subload_num_bits / bit_size, bit_size);
for (unsigned comp = 0; comp < subload_num_bits / bit_size; ++comp, ++comp_idx)
comps[comp_idx] = nir_channel(b, temp_vec, comp);
}
assert(comp_idx == num_components);
return nir_vec(b, comps, num_components);
}
static bool
lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
{
@@ -888,26 +805,6 @@ dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes,
return true;
}
static bool
lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
{
assert(intr->dest.is_ssa);
assert(intr->src[0].is_ssa);
assert(intr->src[1].is_ssa);
b->cursor = nir_before_instr(&intr->instr);
nir_ssa_def *result =
build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
nir_dest_num_components(intr->dest),
nir_dest_bit_size(intr->dest),
nir_intrinsic_align(intr));
nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
nir_instr_remove(&intr->instr);
return true;
}
static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
@@ -978,9 +875,6 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
case nir_intrinsic_load_ssbo:
progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
break;
case nir_intrinsic_load_ubo:
progress |= lower_load_ubo(&b, intr);
break;
case nir_intrinsic_store_shared:
progress |= lower_32b_offset_store(&b, intr, shared_var);
break;
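
For comparison with the new pass-based path, the deleted build_load_ubo_dxil and ubo_load_select_32b_comps above did this selection by hand: load the containing 16-byte slot as a vec4 of 32-bit words, pick the right word, then shift sub-32-bit values down to the LSBs. A rough standalone restatement of that scalar math (illustrative only; the function name is made up and the two nir_bcsel steps are collapsed into a single index):

#include <stdint.h>
#include <stdio.h>

/* Given the vec4 of 32-bit words loaded for the 16-byte slot containing
 * byte_offset, extract a value of bit_size (8, 16 or 32) at that offset. */
static uint32_t
extract_from_slot(const uint32_t vec32[4], uint32_t byte_offset, unsigned bit_size)
{
   /* The deleted code picked a half with (offset & 0x8) and a word within
    * that half with (offset & 0x4); both steps collapse to this index. */
   uint32_t word = vec32[(byte_offset >> 2) & 3];
   /* For sub-32-bit loads, shift the wanted bits into the LSBs, like the
    * (offset & 3) * 8 shift in the deleted code. */
   uint32_t value = word >> ((byte_offset & 3) * 8);
   return bit_size == 32 ? value : value & ((1u << bit_size) - 1);
}

int main(void)
{
   const uint32_t slot[4] = { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c };
   /* The 16-bit value at byte offset 6 of this slot is 0x0706. */
   printf("0x%04x\n", extract_from_slot(slot, 6, 16));
   return 0;
}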


@@ -58,11 +58,6 @@ bool dxil_nir_split_typed_samplers(nir_shader *shader);
bool dxil_nir_lower_sysval_to_load_input(nir_shader *s, nir_variable **sysval_vars);
bool dxil_nir_lower_vs_vertex_conversion(nir_shader *s, enum pipe_format target_formats[]);
nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
nir_ssa_def *offset, unsigned num_components,
unsigned bit_size, unsigned alignment);
uint64_t
dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes);


@@ -4843,7 +4843,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
return emit_atomic_deref(ctx, intr);
case nir_intrinsic_deref_atomic_swap:
return emit_atomic_deref_swap(ctx, intr);
case nir_intrinsic_load_ubo_dxil:
case nir_intrinsic_load_ubo_vec4:
return emit_load_ubo_vec4(ctx, intr);
case nir_intrinsic_load_primitive_id:
@@ -6188,6 +6187,48 @@ lower_bit_size_callback(const nir_instr* instr, void *data)
return ret;
}
static bool
vectorize_filter(
unsigned align_mul,
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
return util_is_power_of_two_nonzero(num_components);
}
struct lower_mem_bit_sizes_data {
const nir_shader_compiler_options *nir_options;
const struct nir_to_dxil_options *dxil_options;
};
static nir_mem_access_size_align
lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
uint8_t bytes,
uint8_t bit_size_in,
uint32_t align_mul,
uint32_t align_offset,
bool offset_is_const,
const void *cb_data)
{
const struct lower_mem_bit_sizes_data *data = cb_data;
unsigned max_bit_size = data->nir_options->lower_int64_options ? 32 : 64;
unsigned min_bit_size = data->dxil_options->lower_int16 ? 32 : 16;
unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));
/* UBO loads can be done at whatever (supported) bit size, but require 16-byte
* alignment and can load up to 16 bytes per instruction. However, this pass requires
* loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4,
* which can deal with unaligned vec4s, so for this pass let's just deal with bit size
* and total size restrictions. */
return (nir_mem_access_size_align) {
.align = closest_bit_size / 8,
.bit_size = closest_bit_size,
.num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size),
};
}
static void
optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
{
@@ -6222,6 +6263,7 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
NIR_PASS(progress, s, nir_lower_64bit_phis);
NIR_PASS(progress, s, nir_lower_phis_to_scalar, true);
NIR_PASS(progress, s, nir_opt_loop_unroll);
NIR_PASS(progress, s, nir_lower_pack);
NIR_PASS_V(s, nir_lower_system_values);
} while (progress);
@@ -6488,10 +6530,34 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
NIR_PASS_V(s, nir_lower_flrp, 16 | 32 | 64, true);
NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, nir_lower_io_lower_64bit_to_32);
NIR_PASS_V(s, dxil_nir_ensure_position_writes);
NIR_PASS_V(s, nir_lower_pack);
NIR_PASS_V(s, dxil_nir_lower_system_values);
NIR_PASS_V(s, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_system_value | nir_var_shader_out);
/* Do a round of optimization to try to vectorize loads/stores. Otherwise the addresses used for loads
* might be too opaque for the pass to see that they're next to each other. */
optimize_nir(s, opts);
/* Vectorize UBO accesses aggressively. This can help increase alignment to enable us to do better
* chunking of loads and stores after lowering bit sizes. Ignore load/store size limitations here;
* we'll address them with lower_mem_access_bit_sizes. */
nir_load_store_vectorize_options vectorize_opts = {
.callback = vectorize_filter,
.modes = nir_var_mem_ubo,
};
NIR_PASS_V(s, nir_opt_load_store_vectorize, &vectorize_opts);
/* Now that they're bloated to the max, address bit size restrictions and overall size limitations for
* a single load/store op. */
struct lower_mem_bit_sizes_data mem_size_data = { s->options, opts };
nir_lower_mem_access_bit_sizes_options mem_size_options = {
.modes = nir_var_mem_ubo,
.callback = lower_mem_access_bit_sizes_cb,
.cb_data = &mem_size_data
};
NIR_PASS_V(s, nir_lower_mem_access_bit_sizes, &mem_size_options);
/* Lastly, convert byte-addressed UBO loads to vec4-addressed. This pass can also deal with selecting sub-
* components from the load and with vec-straddling loads. */
NIR_PASS_V(s, nir_lower_ubo_vec4);
if (opts->shader_model_max < SHADER_MODEL_6_6) {
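
To make the new lower_mem_access_bit_sizes_cb above concrete, here is a small standalone rerun of its arithmetic (illustrative only; MIN2/MAX2/DIV_ROUND_UP are redefined locally and report() is a made-up helper). The examples assume lower_int16 is set and 64-bit ops are lowered, so the minimum and maximum bit size are both 32:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Same clamping the callback does: clamp the access bit size into the
 * supported range, then emit at most 16 bytes per load. */
static void
report(unsigned bytes, unsigned bit_size_in, unsigned min_bits, unsigned max_bits)
{
   unsigned closest = MAX2(min_bits, MIN2(max_bits, bit_size_in));
   unsigned comps = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest);
   printf("%u bytes of %u-bit data -> %ux%u-bit load, %u-byte align\n",
          bytes, bit_size_in, comps, closest, closest / 8);
}

int main(void)
{
   report(16, 16, 32, 32); /* a 16-byte 16-bit vector becomes one 4x32-bit load */
   report(12, 32, 32, 32); /* 12 bytes of dwords stay a 3x32-bit load */
   report(32, 32, 32, 32); /* larger accesses are capped at 16 bytes (4x32) each */
   return 0;
}

nir_lower_ubo_vec4 then handles the remaining alignment work when it converts these loads to vec4 addressing.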


@@ -1048,9 +1048,7 @@ dxil_spirv_nir_passes(nir_shader *nir,
}
NIR_PASS_V(nir, nir_opt_deref);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores,
nir_var_mem_ubo | nir_var_mem_push_const |
nir_var_mem_ssbo);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ssbo);
if (conf->inferred_read_only_images_as_srvs) {
const nir_opt_access_options opt_access_options = {