zink: move all 64-32bit shader load rewriting to nir pass

this also enables natural 64bit loads on drivers that support it

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13484>
Author: Mike Blumenkrantz, 2021-10-20 10:02:08 -04:00 (committed by Marge Bot)
parent 3a1ecd1e8c
commit 150d6ee97e
2 changed files with 57 additions and 58 deletions
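The core of the change: instead of nir_to_spirv open-coding 64bit loads as pairs of 32bit loads, a NIR pass now does the split up front. Conceptually it reduces to the following (a minimal sketch using the same builder helpers as the pass below; lower_64bit_ubo_load is an illustrative name, not a function from this patch):

/* illustrative only: a scalar 64bit UBO load becomes a 2x32 load at the
 * same (already 32bit-scaled) offset, packed back into one 64bit value;
 * the real pass below also handles SSBO and shared-memory loads
 */
static nir_ssa_def *
lower_64bit_ubo_load(nir_builder *b, nir_intrinsic_instr *intr)
{
   b->cursor = nir_before_instr(&intr->instr);
   /* load two 32bit words instead of one 64bit word */
   nir_ssa_def *load = nir_load_ubo(b, 2, 32, intr->src[0].ssa, intr->src[1].ssa,
                                    .align_mul = 4, .align_offset = 0, .range = 4);
   /* fuse the low/high words back into a single 64bit value */
   return nir_pack_64_2x32(b, load);
}

On drivers exposing shaderInt64 the pass leaves 64bit loads untouched, so the SPIR-V backend can emit them natively.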

src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c

@@ -49,10 +49,10 @@ struct ntv_context {
    gl_shader_stage stage;
    const struct zink_so_info *so_info;
 
-   SpvId ubos[PIPE_MAX_CONSTANT_BUFFERS][3]; //8, 16, 32
+   SpvId ubos[PIPE_MAX_CONSTANT_BUFFERS][5]; //8, 16, 32, unused, 64
    nir_variable *ubo_vars[PIPE_MAX_CONSTANT_BUFFERS];
 
-   SpvId ssbos[PIPE_MAX_SHADER_BUFFERS][3]; //8, 16, 32
+   SpvId ssbos[PIPE_MAX_SHADER_BUFFERS][5]; //8, 16, 32, unused, 64
    nir_variable *ssbo_vars[PIPE_MAX_SHADER_BUFFERS];
    SpvId image_types[PIPE_MAX_SAMPLERS];
    SpvId images[PIPE_MAX_SAMPLERS];
@@ -1915,9 +1915,9 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    bool ssbo = intr->intrinsic == nir_intrinsic_load_ssbo;
    assert(const_block_index); // no dynamic indexing for now
 
-   unsigned idx = 0;
    unsigned bit_size = nir_dest_bit_size(intr->dest);
-   idx = MIN2(bit_size, 32) >> 4;
+   assert(bit_size <= 64);
+   unsigned idx = bit_size >> 4;
    if (ssbo) {
       assert(idx < ARRAY_SIZE(ctx->ssbos[0]));
       if (!ctx->ssbos[const_block_index->u32][idx])
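Why the alias arrays above grew from [3] to [5]: the slot index is now derived directly from the bit size, which leaves one slot empty (a note, not code from the patch):

/* idx = bit_size >> 4 maps the supported sizes onto the [5] alias arrays:
 *    8 >> 4 == 0,  16 >> 4 == 1,  32 >> 4 == 2,  64 >> 4 == 4
 * slot 3 (a would-be 48bit type) is the "unused" entry in the comment above
 */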
@@ -1928,15 +1928,12 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
          emit_bo(ctx, ctx->ubo_vars[const_block_index->u32], nir_dest_bit_size(intr->dest));
    }
    SpvId bo = ssbo ? ctx->ssbos[const_block_index->u32][idx] : ctx->ubos[const_block_index->u32][idx];
-   SpvId uint_type = get_uvec_type(ctx, MIN2(bit_size, 32), 1);
+   SpvId uint_type = get_uvec_type(ctx, bit_size, 1);
    SpvId one = emit_uint_const(ctx, 32, 1);
 
    /* number of components being loaded */
    unsigned num_components = nir_dest_num_components(intr->dest);
-   /* we need to grab 2x32 to fill the 64bit value */
-   if (bit_size == 64)
-      num_components *= 2;
-   SpvId constituents[NIR_MAX_VEC_COMPONENTS * 2];
+   SpvId constituents[NIR_MAX_VEC_COMPONENTS];
    SpvId result;
 
    /* destination type for the load */
@@ -1950,7 +1947,7 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    /* our generated uniform has a memory layout like
     *
     * struct {
-    *    uint base[array_size];
+    *    uintN base[array_size];
     * };
     *
     * first, access 'base'
@@ -1983,18 +1980,6 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
       offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, one);
    }
 
-   /* if we're loading a 64bit value, we have to reassemble all the u32 values we've loaded into u64 values
-    * by creating uvec2 composites and bitcasting them to u64 values
-    */
-   if (bit_size == 64) {
-      num_components /= 2;
-      type = get_uvec_type(ctx, 64, num_components);
-      SpvId u64_type = get_uvec_type(ctx, 64, 1);
-      for (unsigned i = 0; i < num_components; i++) {
-         constituents[i] = spirv_builder_emit_composite_construct(&ctx->builder, get_uvec_type(ctx, 32, 2), constituents + i * 2, 2);
-         constituents[i] = emit_bitcast(ctx, u64_type, constituents[i]);
-      }
-   }
    /* if loading more than 1 value, reassemble the results into the desired type,
     * otherwise just use the loaded result
     */
@@ -2194,7 +2179,6 @@ emit_load_shared(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    SpvId dest_type = get_dest_type(ctx, &intr->dest, nir_type_uint);
    unsigned num_components = nir_dest_num_components(intr->dest);
    unsigned bit_size = nir_dest_bit_size(intr->dest);
-   bool qword = bit_size == 64;
    SpvId uint_type = get_uvec_type(ctx, 32, 1);
    SpvId ptr_type = spirv_builder_type_pointer(&ctx->builder,
                                                SpvStorageClassWorkgroup,
@@ -2203,17 +2187,10 @@ emit_load_shared(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    SpvId constituents[NIR_MAX_VEC_COMPONENTS];
    /* need to convert array -> vec */
    for (unsigned i = 0; i < num_components; i++) {
-      SpvId parts[2];
-      for (unsigned j = 0; j < 1 + !!qword; j++) {
-         SpvId member = spirv_builder_emit_access_chain(&ctx->builder, ptr_type,
-                                                        ctx->shared_block_var, &offset, 1);
-         parts[j] = spirv_builder_emit_load(&ctx->builder, uint_type, member);
-         offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, emit_uint_const(ctx, 32, 1));
-      }
-      if (qword)
-         constituents[i] = spirv_builder_emit_composite_construct(&ctx->builder, get_uvec_type(ctx, 64, 1), parts, 2);
-      else
-         constituents[i] = parts[0];
+      SpvId member = spirv_builder_emit_access_chain(&ctx->builder, ptr_type,
+                                                     ctx->shared_block_var, &offset, 1);
+      constituents[i] = spirv_builder_emit_load(&ctx->builder, uint_type, member);
+      offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, emit_uint_const(ctx, 32, 1));
    }
    SpvId result;
    if (num_components > 1)
@@ -2258,15 +2235,11 @@ emit_store_shared(struct ntv_context *ctx, nir_intrinsic_instr *intr)
 static void
 emit_load_push_const(struct ntv_context *ctx, nir_intrinsic_instr *intr)
 {
-   unsigned bit_size = nir_dest_bit_size(intr->dest);
    SpvId uint_type = get_uvec_type(ctx, 32, 1);
    SpvId load_type = get_uvec_type(ctx, 32, 1);
 
    /* number of components being loaded */
    unsigned num_components = nir_dest_num_components(intr->dest);
-   /* we need to grab 2x32 to fill the 64bit value */
-   if (bit_size == 64)
-      num_components *= 2;
    SpvId constituents[NIR_MAX_VEC_COMPONENTS * 2];
    SpvId result;
@@ -2298,18 +2271,6 @@ emit_load_push_const(struct ntv_context *ctx, nir_intrinsic_instr *intr)
       offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, one);
    }
 
-   /* if we're loading a 64bit value, we have to reassemble all the u32 values we've loaded into u64 values
-    * by creating uvec2 composites and bitcasting them to u64 values
-    */
-   if (bit_size == 64) {
-      num_components /= 2;
-      type = get_uvec_type(ctx, 64, num_components);
-      SpvId u64_type = get_uvec_type(ctx, 64, 1);
-      for (unsigned i = 0; i < num_components; i++) {
-         constituents[i] = spirv_builder_emit_composite_construct(&ctx->builder, get_uvec_type(ctx, 32, 2), constituents + i * 2, 2);
-         constituents[i] = emit_bitcast(ctx, u64_type, constituents[i]);
-      }
-   }
    /* if loading more than 1 value, reassemble the results into the desired type,
     * otherwise just use the loaded result
     */
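With the reassembly blocks gone, a 64bit load on a shaderInt64-capable driver just runs the existing per-component loop at 64bit width, since uint_type is now built from the real bit_size. Roughly (a sketch; bo_ptr_type and indices stand in for the surrounding setup code):

/* each 64bit component is a single direct load -- no more
 * uvec2 OpCompositeConstruct + OpBitcast reassembly step */
SpvId member = spirv_builder_emit_access_chain(&ctx->builder, bo_ptr_type,
                                               bo, indices, 2);
constituents[i] = spirv_builder_emit_load(&ctx->builder, uint_type, member);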

src/gallium/drivers/zink/zink_compiler.c

@@ -640,16 +640,54 @@ decompose_attribs(nir_shader *nir, uint32_t decomposed_attrs, uint32_t decompose
 static bool
 rewrite_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
 {
+   struct zink_screen *screen = data;
+   const bool has_int64 = screen->info.feats.features.shaderInt64;
    if (instr->type != nir_instr_type_intrinsic)
       return false;
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   b->cursor = nir_before_instr(instr);
    switch (intr->intrinsic) {
    case nir_intrinsic_load_ssbo:
-   case nir_intrinsic_load_ubo:
-   case nir_intrinsic_load_ubo_vec4:
-      b->cursor = nir_before_instr(instr);
-      nir_instr_rewrite_src_ssa(instr, &intr->src[1], nir_udiv_imm(b, intr->src[1].ssa, MIN2(nir_dest_bit_size(intr->dest), 32) / 8));
+   case nir_intrinsic_load_ubo: {
+      /* ubo0 can have unaligned 64bit loads, particularly for bindless texture ids */
+      bool force_2x32 = intr->intrinsic == nir_intrinsic_load_ubo &&
+                        nir_src_as_uint(intr->src[0]) == 0 &&
+                        nir_dest_bit_size(intr->dest) == 64 &&
+                        nir_intrinsic_align_offset(intr) % 8 != 0;
+      nir_instr_rewrite_src_ssa(instr, &intr->src[1], nir_udiv_imm(b, intr->src[1].ssa,
+                                (force_2x32 ? 32 : nir_dest_bit_size(intr->dest)) / 8));
+      /* if 64bit isn't supported, 64bit loads definitely aren't supported, so rewrite as 2x32 with cast and pray */
+      if (force_2x32 || (nir_dest_bit_size(intr->dest) == 64 && !has_int64)) {
+         /* this is always scalarized */
+         assert(intr->dest.ssa.num_components == 1);
+         /* rewrite as 2x32 */
+         nir_ssa_def *load;
+         if (intr->intrinsic == nir_intrinsic_load_ssbo)
+            load = nir_load_ssbo(b, 2, 32, intr->src[0].ssa, intr->src[1].ssa, .align_mul = 4, .align_offset = 0);
+         else
+            load = nir_load_ubo(b, 2, 32, intr->src[0].ssa, intr->src[1].ssa, .align_mul = 4, .align_offset = 0, .range = 4);
+         nir_intrinsic_set_access(nir_instr_as_intrinsic(load->parent_instr), nir_intrinsic_access(intr));
+         /* cast back to 64bit */
+         nir_ssa_def *casted = nir_pack_64_2x32(b, load);
+         nir_ssa_def_rewrite_uses(&intr->dest.ssa, casted);
+         nir_instr_remove(instr);
+      }
       return true;
+   }
+   case nir_intrinsic_load_shared:
+      /* if 64bit isn't supported, 64bit loads definitely aren't supported, so rewrite as 2x32 with cast and pray */
+      if (nir_dest_bit_size(intr->dest) == 64 && !has_int64) {
+         /* this is always scalarized */
+         assert(intr->dest.ssa.num_components == 1);
+         /* rewrite as 2x32 */
+         nir_ssa_def *load = nir_load_shared(b, 2, 32, intr->src[0].ssa, .align_mul = 4, .align_offset = 0);
+         /* cast back to 64bit */
+         nir_ssa_def *casted = nir_pack_64_2x32(b, load);
+         nir_ssa_def_rewrite_uses(&intr->dest.ssa, casted);
+         nir_instr_remove(instr);
+         return true;
+      }
+      break;
    case nir_intrinsic_store_ssbo:
    default:
       break;
@@ -658,9 +696,9 @@ rewrite_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
 }
 
 static bool
-rewrite_bo_access(nir_shader *shader)
+rewrite_bo_access(nir_shader *shader, struct zink_screen *screen)
 {
-   return nir_shader_instructions_pass(shader, rewrite_bo_access_instr, nir_metadata_dominance, NULL);
+   return nir_shader_instructions_pass(shader, rewrite_bo_access_instr, nir_metadata_dominance, screen);
 }
 
 static void
@@ -900,7 +938,7 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shad
    }
    if (screen->driconf.inline_uniforms) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared);
-      NIR_PASS_V(nir, rewrite_bo_access);
+      NIR_PASS_V(nir, rewrite_bo_access, screen);
    }
    if (inlined_uniforms) {
       optimize_nir(nir);
@@ -1417,7 +1455,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
    /* run in compile if there could be inlined uniforms */
    if (!screen->driconf.inline_uniforms) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared);
-      NIR_PASS_V(nir, rewrite_bo_access);
+      NIR_PASS_V(nir, rewrite_bo_access, screen);
    }
 
    if (zink_debug & ZINK_DEBUG_NIR) {