kk: Enable float16 and int8

Metal does not seem to respect memory coherency for threads. Workaround 6
enforces device coherency for global loads/stores even if it should not
be needed.

Reviewed-by: Arcady Goldmints-Orlov <arcady@lunarg.com>
Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38847>
This commit is contained in:
Aitor Camacho 2025-12-07 22:21:24 +09:00 committed by Marge Bot
parent 1e0e7d4296
commit 16a94b1b7f
4 changed files with 45 additions and 24 deletions

View file

@ -49,6 +49,37 @@ info on what was updated.
Workarounds
===========
KK_WORKAROUND_6
---------------
| macOS version: 26.0.1
| Metal ticket: Not reported
| Metal ticket status:
| CTS test failure: ``dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*``
| Comments:
Metal does not respect its own Memory Coherency Model (MSL spec 4.8). From
the spec:
``By default, memory in the device address space has threadgroup coherence.``
Consider a compute dispatch with a single thread that runs the following
(simplified) loop:
.. code-block:: c

   for (...) {
      value = ssbo_data[0]; // ssbo_data is a device buffer
      ...
      ssbo_data[0] = new_value;
   }

``ssbo_data[0]`` is not stored/loaded correctly, so the value written in
iteration 0 is not visible in iteration 1. The workaround is to mark the
device memory pointer through which the memory is accessed as coherent, so
that the value is stored and loaded correctly. Hopefully this does not affect
performance much.
| Log:
| 2025-12-08: Workaround implemented and reported to Apple
KK_WORKAROUND_5
---------------
| macOS version: 26.0.1

View file

@ -475,7 +475,7 @@ Vulkan 1.2 -- all DONE: anv, hk, nvk, panvk/v10+, pvr, tu, vn
VK_KHR_sampler_mirror_clamp_to_edge DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
VK_KHR_separate_depth_stencil_layouts DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
VK_KHR_shader_atomic_int64 DONE (anv, lvp, nvk, panvk/v10+, radv, vn, tu/a740+)
VK_KHR_shader_float16_int8 DONE (anv, dzn, hasvk, lvp, nvk, panvk, radv, tu, vn)
VK_KHR_shader_float16_int8 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, radv, tu, vn)
VK_KHR_shader_float_controls DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
VK_KHR_shader_subgroup_extended_types DONE (anv, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, vn)
VK_KHR_spirv_1_4 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, v3dv, vn)

View file

@ -1063,8 +1063,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
case nir_intrinsic_load_global: {
enum gl_access_qualifier access = nir_intrinsic_access(instr);
const char *type = msl_type_for_def(ctx->types, &instr->def);
const char *addressing =
access & ACCESS_COHERENT ? "coherent device" : "device";
const bool apply_workaround =
!(ctx->disabled_workarounds & BITFIELD64_BIT(6));
const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
? "coherent device"
: "device";
if (access & ACCESS_ATOMIC) {
assert(instr->num_components == 1u &&
"We can only do single component with atomics");
@ -1133,8 +1136,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
case nir_intrinsic_store_global: {
enum gl_access_qualifier access = nir_intrinsic_access(instr);
const char *type = msl_type_for_src(ctx->types, &instr->src[0]);
const char *addressing =
access & ACCESS_COHERENT ? "coherent device" : "device";
const bool apply_workaround =
!(ctx->disabled_workarounds & BITFIELD64_BIT(6));
const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
? "coherent device"
: "device";
if (access & ACCESS_ATOMIC) {
assert(instr->num_components == 1u &&
"We can only do single component with atomics");

View file

@ -74,8 +74,7 @@ kk_get_device_extensions(const struct kk_instance *instance,
.KHR_separate_depth_stencil_layouts = true,
.KHR_shader_atomic_int64 = false,
.KHR_shader_float_controls = true,
.KHR_shader_float16_int8 =
false, /* TODO_KOSMICKRISP shaderInt8 shaderFloat16 */
.KHR_shader_float16_int8 = true,
.KHR_shader_subgroup_extended_types = true,
.KHR_spirv_1_4 = true,
.KHR_timeline_semaphore = true,
@ -214,25 +213,10 @@ kk_get_device_features(
.samplerMirrorClampToEdge = true,
.scalarBlockLayout = true,
.separateDepthStencilLayouts = true,
/* TODO_KOSMICKRISP shaderFloat16
* Failing:
* dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*
* dEQP-VK.memory_model.shared.16bit.nested_structs_arrays.*
*/
.shaderFloat16 = false,
.shaderFloat16 = true,
.shaderInputAttachmentArrayDynamicIndexing = true,
.shaderInputAttachmentArrayNonUniformIndexing = true,
/* TODO_KOSMICKRISP shaderInt8
* Multiple MSL compiler crashes if we enable shaderInt8, need to
* understand why and a workaround:
* dEQP-VK.memory_model.shared.8bit.vector_types.9
* dEQP-VK.memory_model.shared.8bit.basic_types.8
* dEQP-VK.memory_model.shared.8bit.basic_arrays.2
* dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.1
* dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.8
* Probably more
*/
.shaderInt8 = false,
.shaderInt8 = true,
.shaderOutputLayer = true,
.shaderOutputViewportIndex = true,
.shaderSampledImageArrayNonUniformIndexing = true,