From 16a94b1b7f5fc82154657178eec16666401b7403 Mon Sep 17 00:00:00 2001 From: Aitor Camacho Date: Sun, 7 Dec 2025 22:21:24 +0900 Subject: [PATCH] kk: Enable float16 and int8 Metal does not seem to respect memory coherency for threads. Workaround 6 enforces device coherency for global loads/stores even if it should not be needed. Reviewed-by: Arcady Goldmints-Orlov Signed-off-by: Aitor Camacho Part-of: --- docs/drivers/kosmickrisp/workarounds.rst | 31 +++++++++++++++++++++ docs/features.txt | 2 +- src/kosmickrisp/compiler/nir_to_msl.c | 14 +++++++--- src/kosmickrisp/vulkan/kk_physical_device.c | 22 ++------------- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/docs/drivers/kosmickrisp/workarounds.rst b/docs/drivers/kosmickrisp/workarounds.rst index d391a664eb5..6e556704cf1 100644 --- a/docs/drivers/kosmickrisp/workarounds.rst +++ b/docs/drivers/kosmickrisp/workarounds.rst @@ -49,6 +49,37 @@ info on what was updated. Workarounds =========== +KK_WORKAROUND_6 +--------------- +| macOS version: 26.0.1 +| Metal ticket: Not reported +| Metal ticket status: +| CTS test failure: ``dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*`` +| Comments: + +Metal does not respect its own Memory Coherency Model (MSL spec 4.8). From +the spec: +``By default, memory in the device address space has threadgroup coherence.`` + +If we have a single thread compute dispatch so that we do (simplified version): + +.. code-block:: c + + for (...) { + value = ssbo_data[0]; // ssbo_data is a device buffer + ... + ssbo_data[0] = new_value; + } + +``ssbo_data[0]`` will not correctly store/load the values so the value +written in iteration 0, will not be available in iteration 1. The workaround +to this issue is marking the device memory pointer through which the memory +is accessed as coherent so that the value is stored and loaded correctly. +Hopefully this does not affect performance much. + +| Log: +| 2025-12-08: Workaround implemented and reported to Apple + KK_WORKAROUND_5 --------------- | macOS version: 26.0.1 diff --git a/docs/features.txt b/docs/features.txt index 760fb5798a0..8ad9e3662b8 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -475,7 +475,7 @@ Vulkan 1.2 -- all DONE: anv, hk, nvk, panvk/v10+, pvr, tu, vn VK_KHR_sampler_mirror_clamp_to_edge DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn) VK_KHR_separate_depth_stencil_layouts DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn) VK_KHR_shader_atomic_int64 DONE (anv, lvp, nvk, panvk/v10+, radv, vn, tu/a740+) - VK_KHR_shader_float16_int8 DONE (anv, dzn, hasvk, lvp, nvk, panvk, radv, tu, vn) + VK_KHR_shader_float16_int8 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, radv, tu, vn) VK_KHR_shader_float_controls DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn) VK_KHR_shader_subgroup_extended_types DONE (anv, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, vn) VK_KHR_spirv_1_4 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, v3dv, vn) diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c index 152256c0388..05ae9eca9a3 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.c +++ b/src/kosmickrisp/compiler/nir_to_msl.c @@ -1063,8 +1063,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_load_global: { enum gl_access_qualifier access = nir_intrinsic_access(instr); const char *type = msl_type_for_def(ctx->types, &instr->def); - const char *addressing = - access & ACCESS_COHERENT ? "coherent device" : "device"; + const bool apply_workaround = + !(ctx->disabled_workarounds & BITFIELD64_BIT(6)); + const char *addressing = apply_workaround || (access & ACCESS_COHERENT) + ? "coherent device" + : "device"; if (access & ACCESS_ATOMIC) { assert(instr->num_components == 1u && "We can only do single component with atomics"); @@ -1133,8 +1136,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_store_global: { enum gl_access_qualifier access = nir_intrinsic_access(instr); const char *type = msl_type_for_src(ctx->types, &instr->src[0]); - const char *addressing = - access & ACCESS_COHERENT ? "coherent device" : "device"; + const bool apply_workaround = + !(ctx->disabled_workarounds & BITFIELD64_BIT(6)); + const char *addressing = apply_workaround || (access & ACCESS_COHERENT) + ? "coherent device" + : "device"; if (access & ACCESS_ATOMIC) { assert(instr->num_components == 1u && "We can only do single component with atomics"); diff --git a/src/kosmickrisp/vulkan/kk_physical_device.c b/src/kosmickrisp/vulkan/kk_physical_device.c index abecafdf507..37defbf0e3a 100644 --- a/src/kosmickrisp/vulkan/kk_physical_device.c +++ b/src/kosmickrisp/vulkan/kk_physical_device.c @@ -74,8 +74,7 @@ kk_get_device_extensions(const struct kk_instance *instance, .KHR_separate_depth_stencil_layouts = true, .KHR_shader_atomic_int64 = false, .KHR_shader_float_controls = true, - .KHR_shader_float16_int8 = - false, /* TODO_KOSMICKRISP shaderInt8 shaderFloat16 */ + .KHR_shader_float16_int8 = true, .KHR_shader_subgroup_extended_types = true, .KHR_spirv_1_4 = true, .KHR_timeline_semaphore = true, @@ -214,25 +213,10 @@ kk_get_device_features( .samplerMirrorClampToEdge = true, .scalarBlockLayout = true, .separateDepthStencilLayouts = true, - /* TODO_KOSMICKRISP shaderFloat16 - * Failing: - * dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.* - * dEQP-VK.memory_model.shared.16bit.nested_structs_arrays.* - */ - .shaderFloat16 = false, + .shaderFloat16 = true, .shaderInputAttachmentArrayDynamicIndexing = true, .shaderInputAttachmentArrayNonUniformIndexing = true, - /* TODO_KOSMICKRISP shaderInt8 - * Multiple MSL compiler crashes if we enable shaderInt8, need to - * understand why and a workaround: - * dEQP-VK.memory_model.shared.8bit.vector_types.9 - * dEQP-VK.memory_model.shared.8bit.basic_types.8 - * dEQP-VK.memory_model.shared.8bit.basic_arrays.2 - * dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.1 - * dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.8 - * Probably more - */ - .shaderInt8 = false, + .shaderInt8 = true, .shaderOutputLayer = true, .shaderOutputViewportIndex = true, .shaderSampledImageArrayNonUniformIndexing = true,