From 196dc72838806dea963e247da618b2a9505d3e61 Mon Sep 17 00:00:00 2001
From: Jesse Natalie <jenatali@microsoft.com>
Date: Wed, 1 Feb 2023 09:06:55 -0800
Subject: [PATCH] microsoft/compiler: Handle 48-bit stores to SSBO/shared

Part-of:
---
 src/microsoft/compiler/dxil_nir.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index 3eb710e8675..ab6f1497a8f 100644
--- a/src/microsoft/compiler/dxil_nir.c
+++ b/src/microsoft/compiler/dxil_nir.c
@@ -323,6 +323,14 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
          ++num_src_comps_stored;
          substore_num_bits += bit_size;
       }
+      if (substore_num_bits == 48) {
+         /* Split this into two, one unmasked store of the first 32 bits,
+          * and then the second loop iteration will handle a masked store
+          * for the other 16. */
+         assert(num_src_comps_stored == 3);
+         --num_src_comps_stored;
+         substore_num_bits = 32;
+      }
       nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
       nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
                                                num_src_comps_stored);
@@ -331,11 +339,8 @@
       if (substore_num_bits < 32) {
          nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1);
 
-         /* If we have 16 bits or less to store we need to place them
-          * correctly in the u32 component. Anything greater than 16 bits
-          * (including uchar3) is naturally aligned on 32bits.
-          */
-         if (substore_num_bits <= 16) {
+         /* If we have small alignments we need to place them correctly in the u32 component. */
+         if (nir_intrinsic_align(intr) <= 2) {
            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3));
            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
 
@@ -451,7 +456,6 @@ lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
 static void
 lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op)
 {
-
    for (unsigned i = 0; i < vec32->num_components; i++) {
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, op);
@@ -465,15 +469,12 @@ lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_in
 static void
 lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index,
-                         nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op)
+                         nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op, unsigned alignment)
 {
    nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1);
 
-   /* If we have 16 bits or less to store we need to place them correctly in
-    * the u32 component. Anything greater than 16 bits (including uchar3) is
-    * naturally aligned on 32bits.
-    */
-   if (num_bits <= 16) {
+   /* If we have small alignments, we need to place them correctly in the u32 component. */
+   if (alignment <= 2) {
       nir_ssa_def *shift = nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)),
                                         8);
 
       vec32 = nir_ishl(b, vec32, shift);
@@ -522,20 +523,19 @@ lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr)
    for (unsigned i = 0; i < num_components; i++)
       comps[i] = nir_channel(b, intr->src[0].ssa, i);
 
-   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
-      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store.
-       */
-      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
+   unsigned step = MAX2(bit_size, 32);
+   for (unsigned i = 0; i < num_bits; i += step) {
+      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. */
+      unsigned substore_num_bits = MIN2(num_bits - i, step);
       nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
       nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
                                                substore_num_bits / bit_size);
       nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2));
 
       /* For anything less than 32bits we need to use the masked version of the
-       * intrinsic to preserve data living in the same 32bit slot.
-       */
-      if (num_bits < 32) {
-         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op);
+       * intrinsic to preserve data living in the same 32bit slot. */
+      if (substore_num_bits < 32) {
+         lower_masked_store_vec32(b, local_offset, index, vec32, substore_num_bits, op, nir_intrinsic_align(intr));
       } else {
          lower_store_vec32(b, index, vec32, op);
       }
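
As a reference for reviewers, the substore split this patch introduces can be
modeled outside of NIR in plain C. This is a hypothetical, self-contained
sketch, not Mesa code: masked_store_u32 is a made-up helper approximating what
lower_masked_store_vec32 emits (a shifted mask plus a read-modify-write of one
32-bit word), and main walks a 48-bit (3 x 16-bit component) store the way the
patched loops now do: one unmasked 32-bit store followed by one masked 16-bit
store.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Store num_bits (<= 32) of data at byte offset into a buffer of 32-bit
 * words, preserving the other bits of the word, like the masked path. */
static void
masked_store_u32(uint32_t *words, uint32_t offset, uint32_t data,
                 unsigned num_bits, unsigned alignment)
{
   uint32_t mask = num_bits == 32 ? ~0u : (1u << num_bits) - 1;

   /* Alignments of 2 bytes or less mean the data may start mid-word, so
    * shift value and mask into position first (mirrors the alignment <= 2
    * path added by this patch). */
   if (alignment <= 2) {
      unsigned shift = (offset & 3) * 8;
      data <<= shift;
      mask <<= shift;
   }
   /* Read-modify-write one 32-bit word, keeping the unmasked bits. */
   words[offset / 4] = (words[offset / 4] & ~mask) | (data & mask);
}

int
main(void)
{
   uint32_t words[2] = { 0xdeadbeefu, 0xcafebabeu };
   uint16_t src[3]   = { 0x1111, 0x2222, 0x3333 }; /* 48 bits total */

   /* First substore: 32 bits (two 16-bit components), plain word store. */
   words[0] = (uint32_t)src[0] | ((uint32_t)src[1] << 16);

   /* Second substore: the remaining 16 bits go through the masked path so
    * the upper half of the second word survives. */
   masked_store_u32(words, 4, src[2], 16, 4);

   printf("%08x %08x\n", words[0], words[1]); /* 22221111 cafe3333 */
   assert(words[1] == 0xcafe3333u);
   return 0;
}

Compiled and run, this prints "22221111 cafe3333": the first word is fully
overwritten, while the upper 16 bits of the second word survive, which is
exactly the data the masked path exists to preserve.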