From 6a5ed9e2e9d158c17e4e6d678cda06efdb2fc6be Mon Sep 17 00:00:00 2001
From: Jesse Natalie
Date: Mon, 22 May 2023 09:56:27 -0700
Subject: [PATCH] microsoft/compiler: Support load_ubo_vec4

Add support for 16-bit UBO loads, delete handling of byte-addressed UBO
loads (which I think was never used anyway) and add handling for the
component const index to optimize out unneeded extractResults.

Part-of:
---
 src/microsoft/compiler/dxil_module.c | 12 ++++--
 src/microsoft/compiler/nir_to_dxil.c | 55 ++++++----------------------
 2 files changed, 20 insertions(+), 47 deletions(-)

diff --git a/src/microsoft/compiler/dxil_module.c b/src/microsoft/compiler/dxil_module.c
index 8b13c04c24a..be309828eb8 100644
--- a/src/microsoft/compiler/dxil_module.c
+++ b/src/microsoft/compiler/dxil_module.c
@@ -705,12 +705,12 @@ const struct dxil_type *
 dxil_module_get_cbuf_ret_type(struct dxil_module *mod, enum overload_type overload)
 {
    const struct dxil_type *overload_type = dxil_get_overload_type(mod, overload);
-   const struct dxil_type *fields[4] = { overload_type, overload_type, overload_type, overload_type };
+   const struct dxil_type *fields[8] = { overload_type, overload_type, overload_type, overload_type,
+                                         overload_type, overload_type, overload_type, overload_type };
    unsigned num_fields;
 
    char name[64];
-   snprintf(name, sizeof(name), "dx.types.CBufRet.%s", dxil_overload_suffix(overload));
-
+   const char *additional = "";
    switch (overload) {
    case DXIL_I32:
    case DXIL_F32:
@@ -720,9 +720,15 @@ dxil_module_get_cbuf_ret_type(struct dxil_module *mod, enum overload_type overlo
    case DXIL_F64:
       num_fields = 2;
       break;
+   case DXIL_I16:
+   case DXIL_F16:
+      num_fields = 8;
+      additional = ".8";
+      break;
    default:
       unreachable("unexpected overload type");
    }
+   snprintf(name, sizeof(name), "dx.types.CBufRet.%s%s", dxil_overload_suffix(overload), additional);
 
    return dxil_module_get_struct_type(mod, name, fields, num_fields);
 }
diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c
index 204e40345de..d585d29122b 100644
--- a/src/microsoft/compiler/nir_to_dxil.c
+++ b/src/microsoft/compiler/nir_to_dxil.c
@@ -126,6 +126,7 @@ nir_options = {
    .lower_pack_unorm_2x16 = true,
    .lower_pack_64_2x32_split = true,
    .lower_pack_32_2x16_split = true,
+   .lower_pack_64_4x16 = true,
    .lower_unpack_64_2x32_split = true,
    .lower_unpack_32_2x16_split = true,
    .lower_unpack_half_2x16 = true,
@@ -3557,47 +3558,9 @@ emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr)
 }
 
 static bool
-emit_load_ubo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+emit_load_ubo_vec4(struct ntd_context *ctx, nir_intrinsic_instr *intr)
 {
-   const struct dxil_value* handle = get_resource_handle(ctx, &intr->src[0], DXIL_RESOURCE_CLASS_CBV, DXIL_RESOURCE_KIND_CBUFFER);
-   if (!handle)
-      return false;
-
-   const struct dxil_value *offset;
-   nir_const_value *const_offset = nir_src_as_const_value(intr->src[1]);
-   if (const_offset) {
-      offset = dxil_module_get_int32_const(&ctx->mod, const_offset->i32 >> 4);
-   } else {
-      const struct dxil_value *offset_src = get_src(ctx, &intr->src[1], 0, nir_type_uint);
-      const struct dxil_value *c4 = dxil_module_get_int32_const(&ctx->mod, 4);
-      if (!offset_src || !c4)
-         return false;
-
-      offset = dxil_emit_binop(&ctx->mod, DXIL_BINOP_ASHR, offset_src, c4, 0);
-   }
-
-   enum overload_type overload = get_ambiguous_overload_alu_type(ctx, intr, nir_type_float);
-   const struct dxil_value *agg = load_ubo(ctx, handle, offset, overload);
-
-   if (!agg)
-      return false;
-
-   for (unsigned i = 0; i < nir_dest_num_components(intr->dest); ++i) {
-      const struct dxil_value *retval = dxil_emit_extractval(&ctx->mod, agg, i);
-      store_dest(ctx, &intr->dest, i, retval);
-   }
-   if (nir_dest_bit_size(intr->dest) == 16)
-      ctx->mod.feats.native_low_precision = true;
-   return true;
-}
-
-static bool
-emit_load_ubo_dxil(struct ntd_context *ctx, nir_intrinsic_instr *intr)
-{
-   assert(nir_dest_num_components(intr->dest) <= 4);
-   assert(nir_dest_bit_size(intr->dest) == 32);
-
-   const struct dxil_value* handle = get_resource_handle(ctx, &intr->src[0], DXIL_RESOURCE_CLASS_CBV, DXIL_RESOURCE_KIND_CBUFFER);
+   const struct dxil_value *handle = get_resource_handle(ctx, &intr->src[0], DXIL_RESOURCE_CLASS_CBV, DXIL_RESOURCE_KIND_CBUFFER);
 
    const struct dxil_value *offset =
       get_src(ctx, &intr->src[1], 0, nir_type_uint);
@@ -3609,9 +3572,11 @@ emit_load_ubo_dxil(struct ntd_context *ctx, nir_intrinsic_instr *intr)
    if (!agg)
      return false;
 
+   unsigned first_component = nir_intrinsic_has_component(intr) ?
+      nir_intrinsic_component(intr) : 0;
    for (unsigned i = 0; i < nir_dest_num_components(intr->dest); i++)
       store_dest(ctx, &intr->dest, i,
-                 dxil_emit_extractval(&ctx->mod, agg, i));
+                 dxil_emit_extractval(&ctx->mod, agg, i + first_component));
 
    if (nir_dest_bit_size(intr->dest) == 16)
       ctx->mod.feats.native_low_precision = true;
@@ -4878,10 +4843,9 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
       return emit_atomic_deref(ctx, intr);
    case nir_intrinsic_deref_atomic_swap:
       return emit_atomic_deref_swap(ctx, intr);
-   case nir_intrinsic_load_ubo:
-      return emit_load_ubo(ctx, intr);
    case nir_intrinsic_load_ubo_dxil:
-      return emit_load_ubo_dxil(ctx, intr);
+   case nir_intrinsic_load_ubo_vec4:
+      return emit_load_ubo_vec4(ctx, intr);
    case nir_intrinsic_load_primitive_id:
       return emit_load_unary_external_function(ctx, intr, "dx.op.primitiveID",
                                                DXIL_INTR_PRIMITIVE_ID, nir_type_int);
@@ -6527,6 +6491,9 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
    NIR_PASS_V(s, nir_lower_pack);
    NIR_PASS_V(s, dxil_nir_lower_system_values);
    NIR_PASS_V(s, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_system_value | nir_var_shader_out);
+
+   NIR_PASS_V(s, nir_lower_ubo_vec4);
+
    if (opts->shader_model_max < SHADER_MODEL_6_6) {
       /* In a later pass, load_helper_invocation will be lowered to sample mask based fallback,
        * so both load- and is- will be emulated eventually.
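
Note (illustration only, not part of the patch): the change relies on the vec4
addressing model of legacy cbuffer loads. nir_lower_ubo_vec4 rewrites
byte-addressed UBO loads into 16-byte row loads, and the intrinsic's component
index selects the first field to extract from the dx.types.CBufRet struct
(4 x 32-bit, 2 x 64-bit, or, with this patch, 8 x 16-bit fields per row).
A minimal standalone C sketch of that mapping follows; the struct and function
names are hypothetical and do not exist in Mesa.

#include <assert.h>
#include <stdio.h>

/* Hypothetical illustration of vec4 cbuffer addressing: a byte offset maps
 * to a 16-byte row (the index passed to cbufferLoadLegacy) plus the first
 * field to extract from the returned dx.types.CBufRet struct. */
struct ubo_vec4_addr {
   unsigned row;             /* vec4 row index for the legacy cbuffer load */
   unsigned first_component; /* first extractvalue index within the row */
};

static struct ubo_vec4_addr
ubo_vec4_addr_from_byte_offset(unsigned byte_offset, unsigned bit_size)
{
   /* 32-bit rows hold 4 fields, 16-bit rows hold 8, 64-bit rows hold 2. */
   unsigned field_bytes = bit_size / 8;
   assert(16 % field_bytes == 0 && byte_offset % field_bytes == 0);
   return (struct ubo_vec4_addr){
      .row = byte_offset / 16,
      .first_component = (byte_offset % 16) / field_bytes,
   };
}

int main(void)
{
   /* A 16-bit load at byte offset 36 resolves to row 2, field 2 of 8. */
   struct ubo_vec4_addr a = ubo_vec4_addr_from_byte_offset(36, 16);
   printf("row %u, first component %u\n", a.row, a.first_component);
   return 0;
}

That starting field is what nir_lower_ubo_vec4 records as the component index,
and it is why emit_load_ubo_vec4 can begin extracting at i + first_component
rather than always at field 0, avoiding the unneeded extractvalues the commit
message mentions.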