From 16a94b1b7f5fc82154657178eec16666401b7403 Mon Sep 17 00:00:00 2001
From: Aitor Camacho <aitor@lunarg.com>
Date: Sun, 7 Dec 2025 22:21:24 +0900
Subject: [PATCH] kk: Enable float16 and int8

Metal does not seem to respect memory coherency for threads. Workaround 6
enforces device coherency for global loads/stores even if it should not
be needed.

Reviewed-by: Arcady Goldmints-Orlov <arcady@lunarg.com>
Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38847>
---
 docs/drivers/kosmickrisp/workarounds.rst    | 31 +++++++++++++++++++++
 docs/features.txt                           |  2 +-
 src/kosmickrisp/compiler/nir_to_msl.c       | 14 +++++++---
 src/kosmickrisp/vulkan/kk_physical_device.c | 22 ++-------------
 4 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/docs/drivers/kosmickrisp/workarounds.rst b/docs/drivers/kosmickrisp/workarounds.rst
index d391a664eb5..6e556704cf1 100644
--- a/docs/drivers/kosmickrisp/workarounds.rst
+++ b/docs/drivers/kosmickrisp/workarounds.rst
@@ -49,6 +49,37 @@ info on what was updated.
 Workarounds
 ===========
 
+KK_WORKAROUND_6
+---------------
+| macOS version: 26.0.1
+| Metal ticket: Not reported
+| Metal ticket status:
+| CTS test failure: ``dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*``
+| Comments:
+
+Metal does not respect its own Memory Coherency Model (MSL spec 4.8). From
+the spec:
+``By default, memory in the device address space has threadgroup coherence.``
+
+If we have a single thread compute dispatch so that we do (simplified version):
+
+.. code-block:: c
+
+   for (...) {
+      value = ssbo_data[0]; // ssbo_data is a device buffer
+      ...
+      ssbo_data[0] = new_value;
+   }
+
+``ssbo_data[0]`` will not correctly store/load the values so the value
+written in iteration 0, will not be available in iteration 1. The workaround
+to this issue is marking the device memory pointer through which the memory
+is accessed as coherent so that the value is stored and loaded correctly.
+Hopefully this does not affect performance much.
+
+| Log:
+| 2025-12-08: Workaround implemented and reported to Apple
+
 KK_WORKAROUND_5
 ---------------
 | macOS version: 26.0.1
diff --git a/docs/features.txt b/docs/features.txt
index 760fb5798a0..8ad9e3662b8 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -475,7 +475,7 @@ Vulkan 1.2 -- all DONE: anv, hk, nvk, panvk/v10+, pvr, tu, vn
   VK_KHR_sampler_mirror_clamp_to_edge                   DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
   VK_KHR_separate_depth_stencil_layouts                 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
   VK_KHR_shader_atomic_int64                            DONE (anv, lvp, nvk, panvk/v10+, radv, vn, tu/a740+)
-  VK_KHR_shader_float16_int8                            DONE (anv, dzn, hasvk, lvp, nvk, panvk, radv, tu, vn)
+  VK_KHR_shader_float16_int8                            DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, radv, tu, vn)
   VK_KHR_shader_float_controls                          DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
   VK_KHR_shader_subgroup_extended_types                 DONE (anv, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, vn)
   VK_KHR_spirv_1_4                                      DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, v3dv, vn)
diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c
index 152256c0388..05ae9eca9a3 100644
--- a/src/kosmickrisp/compiler/nir_to_msl.c
+++ b/src/kosmickrisp/compiler/nir_to_msl.c
@@ -1063,8 +1063,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_global: {
       enum gl_access_qualifier access = nir_intrinsic_access(instr);
       const char *type = msl_type_for_def(ctx->types, &instr->def);
-      const char *addressing =
-         access & ACCESS_COHERENT ? "coherent device" : "device";
+      const bool apply_workaround =
+         !(ctx->disabled_workarounds & BITFIELD64_BIT(6));
+      const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
+                                  ? "coherent device"
+                                  : "device";
       if (access & ACCESS_ATOMIC) {
          assert(instr->num_components == 1u &&
                 "We can only do single component with atomics");
@@ -1133,8 +1136,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
    case nir_intrinsic_store_global: {
       enum gl_access_qualifier access = nir_intrinsic_access(instr);
       const char *type = msl_type_for_src(ctx->types, &instr->src[0]);
-      const char *addressing =
-         access & ACCESS_COHERENT ? "coherent device" : "device";
+      const bool apply_workaround =
+         !(ctx->disabled_workarounds & BITFIELD64_BIT(6));
+      const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
+                                  ? "coherent device"
+                                  : "device";
       if (access & ACCESS_ATOMIC) {
          assert(instr->num_components == 1u &&
                 "We can only do single component with atomics");
diff --git a/src/kosmickrisp/vulkan/kk_physical_device.c b/src/kosmickrisp/vulkan/kk_physical_device.c
index abecafdf507..37defbf0e3a 100644
--- a/src/kosmickrisp/vulkan/kk_physical_device.c
+++ b/src/kosmickrisp/vulkan/kk_physical_device.c
@@ -74,8 +74,7 @@ kk_get_device_extensions(const struct kk_instance *instance,
       .KHR_separate_depth_stencil_layouts = true,
       .KHR_shader_atomic_int64 = false,
       .KHR_shader_float_controls = true,
-      .KHR_shader_float16_int8 =
-         false, /* TODO_KOSMICKRISP shaderInt8 shaderFloat16 */
+      .KHR_shader_float16_int8 = true,
       .KHR_shader_subgroup_extended_types = true,
       .KHR_spirv_1_4 = true,
       .KHR_timeline_semaphore = true,
@@ -214,25 +213,10 @@ kk_get_device_features(
       .samplerMirrorClampToEdge = true,
       .scalarBlockLayout = true,
       .separateDepthStencilLayouts = true,
-      /* TODO_KOSMICKRISP shaderFloat16
-       * Failing:
-       * dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*
-       * dEQP-VK.memory_model.shared.16bit.nested_structs_arrays.*
-       */
-      .shaderFloat16 = false,
+      .shaderFloat16 = true,
       .shaderInputAttachmentArrayDynamicIndexing = true,
       .shaderInputAttachmentArrayNonUniformIndexing = true,
-      /* TODO_KOSMICKRISP shaderInt8
-       * Multiple MSL compiler crashes if we enable shaderInt8, need to
-       * understand why and a workaround:
-       * dEQP-VK.memory_model.shared.8bit.vector_types.9
-       * dEQP-VK.memory_model.shared.8bit.basic_types.8
-       * dEQP-VK.memory_model.shared.8bit.basic_arrays.2
-       * dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.1
-       * dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.8
-       * Probably more
-       */
-      .shaderInt8 = false,
+      .shaderInt8 = true,
       .shaderOutputLayer = true,
       .shaderOutputViewportIndex = true,
       .shaderSampledImageArrayNonUniformIndexing = true,