kk: Enable float16 and int8

Metal does not seem to respect memory coherency for threads. Workaround 6 enforces device coherency for global loads/stores even if it should not be needed. Reviewed-by: Arcady Goldmints-Orlov <arcady@lunarg.com> Signed-off-by: Aitor Camacho <aitor@lunarg.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38847>
2025-12-20 07:20:10 +01:00 · 2025-12-07 22:21:24 +09:00 · 2025-12-07 22:21:24 +09:00 · 16a94b1b7f
commit 16a94b1b7f
parent 1e0e7d4296
4 changed files with 45 additions and 24 deletions
--- a/docs/drivers/kosmickrisp/workarounds.rst
+++ b/docs/drivers/kosmickrisp/workarounds.rst
@ -49,6 +49,37 @@ info on what was updated.
 Workarounds
 ===========
 KK_WORKAROUND_6
 ---------------
 | macOS version: 26.0.1
 | Metal ticket: Not reported
 | Metal ticket status:
 | CTS test failure: ``dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*``
 | Comments:
 Metal does not respect its own Memory Coherency Model (MSL spec 4.8). From
 the spec:
 ``By default, memory in the device address space has threadgroup coherence.``
 If we have a single thread compute dispatch so that we do (simplified version):
 .. code-block:: c
   for (...) {
      value = ssbo_data[0]; // ssbo_data is a device buffer
      ...
      ssbo_data[0] = new_value;
   }
 ``ssbo_data[0]`` will not correctly store/load the values so the value
 written in iteration 0, will not be available in iteration 1. The workaround
 to this issue is marking the device memory pointer through which the memory
 is accessed as coherent so that the value is stored and loaded correctly.
 Hopefully this does not affect performance much.
 | Log:
 | 2025-12-08: Workaround implemented and reported to Apple
 KK_WORKAROUND_5
 ---------------
 | macOS version: 26.0.1
--- a/docs/features.txt
+++ b/docs/features.txt
@ -475,7 +475,7 @@ Vulkan 1.2 -- all DONE: anv, hk, nvk, panvk/v10+, pvr, tu, vn
  VK_KHR_sampler_mirror_clamp_to_edge                   DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
  VK_KHR_separate_depth_stencil_layouts                 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
  VK_KHR_shader_atomic_int64                            DONE (anv, lvp, nvk, panvk/v10+, radv, vn, tu/a740+)
-  VK_KHR_shader_float16_int8                            DONE (anv, dzn, hasvk, lvp, nvk, panvk, radv, tu, vn)
+  VK_KHR_shader_float16_int8                            DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, radv, tu, vn)
  VK_KHR_shader_float_controls                          DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
  VK_KHR_shader_subgroup_extended_types                 DONE (anv, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, vn)
  VK_KHR_spirv_1_4                                      DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, v3dv, vn)
--- a/src/kosmickrisp/compiler/nir_to_msl.c
+++ b/src/kosmickrisp/compiler/nir_to_msl.c
@ -1063,8 +1063,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
   case nir_intrinsic_load_global: {
      enum gl_access_qualifier access = nir_intrinsic_access(instr);
      const char *type = msl_type_for_def(ctx->types, &instr->def);
-      const char *addressing =
+      const bool apply_workaround =
-         access & ACCESS_COHERENT ? "coherent device" : "device";
+         !(ctx->disabled_workarounds & BITFIELD64_BIT(6));
      const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
                                  ? "coherent device"
                                  : "device";
      if (access & ACCESS_ATOMIC) {
         assert(instr->num_components == 1u &&
                "We can only do single component with atomics");
@ -1133,8 +1136,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
   case nir_intrinsic_store_global: {
      enum gl_access_qualifier access = nir_intrinsic_access(instr);
      const char *type = msl_type_for_src(ctx->types, &instr->src[0]);
-      const char *addressing =
+      const bool apply_workaround =
-         access & ACCESS_COHERENT ? "coherent device" : "device";
+         !(ctx->disabled_workarounds & BITFIELD64_BIT(6));
      const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
                                  ? "coherent device"
                                  : "device";
      if (access & ACCESS_ATOMIC) {
         assert(instr->num_components == 1u &&
                "We can only do single component with atomics");
--- a/src/kosmickrisp/vulkan/kk_physical_device.c
+++ b/src/kosmickrisp/vulkan/kk_physical_device.c
@ -74,8 +74,7 @@ kk_get_device_extensions(const struct kk_instance *instance,
      .KHR_separate_depth_stencil_layouts = true,
      .KHR_shader_atomic_int64 = false,
      .KHR_shader_float_controls = true,
-      .KHR_shader_float16_int8 =
+      .KHR_shader_float16_int8 = true,
         false, /* TODO_KOSMICKRISP shaderInt8 shaderFloat16 */
      .KHR_shader_subgroup_extended_types = true,
      .KHR_spirv_1_4 = true,
      .KHR_timeline_semaphore = true,
@ -214,25 +213,10 @@ kk_get_device_features(
      .samplerMirrorClampToEdge = true,
      .scalarBlockLayout = true,
      .separateDepthStencilLayouts = true,
-      /* TODO_KOSMICKRISP shaderFloat16
+      .shaderFloat16 = true,
       * Failing:
       * dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*
       * dEQP-VK.memory_model.shared.16bit.nested_structs_arrays.*
       */
      .shaderFloat16 = false,
      .shaderInputAttachmentArrayDynamicIndexing = true,
      .shaderInputAttachmentArrayNonUniformIndexing = true,
-      /* TODO_KOSMICKRISP shaderInt8
+      .shaderInt8 = true,
       * Multiple MSL compiler crashes if we enable shaderInt8, need to
       * understand why and a workaround:
       * dEQP-VK.memory_model.shared.8bit.vector_types.9
       * dEQP-VK.memory_model.shared.8bit.basic_types.8
       * dEQP-VK.memory_model.shared.8bit.basic_arrays.2
       * dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.1
       * dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.8
       * Probably more
       */
      .shaderInt8 = false,
      .shaderOutputLayer = true,
      .shaderOutputViewportIndex = true,
      .shaderSampledImageArrayNonUniformIndexing = true,