mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 00:50:10 +01:00
kk: Enable float16 and int8
Metal does not seem to respect memory coherency for threads. Workaround 6 enforces device coherency for global loads/stores even if it should not be needed. Reviewed-by: Arcady Goldmints-Orlov <arcady@lunarg.com> Signed-off-by: Aitor Camacho <aitor@lunarg.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38847>
This commit is contained in:
parent
1e0e7d4296
commit
16a94b1b7f
4 changed files with 45 additions and 24 deletions
|
|
@ -49,6 +49,37 @@ info on what was updated.
|
|||
Workarounds
|
||||
===========
|
||||
|
||||
KK_WORKAROUND_6
|
||||
---------------
|
||||
| macOS version: 26.0.1
|
||||
| Metal ticket: Not reported
|
||||
| Metal ticket status:
|
||||
| CTS test failure: ``dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*``
|
||||
| Comments:
|
||||
|
||||
Metal does not respect its own Memory Coherency Model (MSL spec 4.8). From
|
||||
the spec:
|
||||
``By default, memory in the device address space has threadgroup coherence.``
|
||||
|
||||
If we have a single thread compute dispatch so that we do (simplified version):
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
for (...) {
|
||||
value = ssbo_data[0]; // ssbo_data is a device buffer
|
||||
...
|
||||
ssbo_data[0] = new_value;
|
||||
}
|
||||
|
||||
``ssbo_data[0]`` will not correctly store/load the values so the value
|
||||
written in iteration 0, will not be available in iteration 1. The workaround
|
||||
to this issue is marking the device memory pointer through which the memory
|
||||
is accessed as coherent so that the value is stored and loaded correctly.
|
||||
Hopefully this does not affect performance much.
|
||||
|
||||
| Log:
|
||||
| 2025-12-08: Workaround implemented and reported to Apple
|
||||
|
||||
KK_WORKAROUND_5
|
||||
---------------
|
||||
| macOS version: 26.0.1
|
||||
|
|
|
|||
|
|
@ -475,7 +475,7 @@ Vulkan 1.2 -- all DONE: anv, hk, nvk, panvk/v10+, pvr, tu, vn
|
|||
VK_KHR_sampler_mirror_clamp_to_edge DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
|
||||
VK_KHR_separate_depth_stencil_layouts DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
|
||||
VK_KHR_shader_atomic_int64 DONE (anv, lvp, nvk, panvk/v10+, radv, vn, tu/a740+)
|
||||
VK_KHR_shader_float16_int8 DONE (anv, dzn, hasvk, lvp, nvk, panvk, radv, tu, vn)
|
||||
VK_KHR_shader_float16_int8 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, radv, tu, vn)
|
||||
VK_KHR_shader_float_controls DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
|
||||
VK_KHR_shader_subgroup_extended_types DONE (anv, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, vn)
|
||||
VK_KHR_spirv_1_4 DONE (anv, dzn, hasvk, kk, lvp, nvk, panvk/v10+, pvr, radv, tu, v3dv, vn)
|
||||
|
|
|
|||
|
|
@ -1063,8 +1063,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
|
|||
case nir_intrinsic_load_global: {
|
||||
enum gl_access_qualifier access = nir_intrinsic_access(instr);
|
||||
const char *type = msl_type_for_def(ctx->types, &instr->def);
|
||||
const char *addressing =
|
||||
access & ACCESS_COHERENT ? "coherent device" : "device";
|
||||
const bool apply_workaround =
|
||||
!(ctx->disabled_workarounds & BITFIELD64_BIT(6));
|
||||
const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
|
||||
? "coherent device"
|
||||
: "device";
|
||||
if (access & ACCESS_ATOMIC) {
|
||||
assert(instr->num_components == 1u &&
|
||||
"We can only do single component with atomics");
|
||||
|
|
@ -1133,8 +1136,11 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
|
|||
case nir_intrinsic_store_global: {
|
||||
enum gl_access_qualifier access = nir_intrinsic_access(instr);
|
||||
const char *type = msl_type_for_src(ctx->types, &instr->src[0]);
|
||||
const char *addressing =
|
||||
access & ACCESS_COHERENT ? "coherent device" : "device";
|
||||
const bool apply_workaround =
|
||||
!(ctx->disabled_workarounds & BITFIELD64_BIT(6));
|
||||
const char *addressing = apply_workaround || (access & ACCESS_COHERENT)
|
||||
? "coherent device"
|
||||
: "device";
|
||||
if (access & ACCESS_ATOMIC) {
|
||||
assert(instr->num_components == 1u &&
|
||||
"We can only do single component with atomics");
|
||||
|
|
|
|||
|
|
@ -74,8 +74,7 @@ kk_get_device_extensions(const struct kk_instance *instance,
|
|||
.KHR_separate_depth_stencil_layouts = true,
|
||||
.KHR_shader_atomic_int64 = false,
|
||||
.KHR_shader_float_controls = true,
|
||||
.KHR_shader_float16_int8 =
|
||||
false, /* TODO_KOSMICKRISP shaderInt8 shaderFloat16 */
|
||||
.KHR_shader_float16_int8 = true,
|
||||
.KHR_shader_subgroup_extended_types = true,
|
||||
.KHR_spirv_1_4 = true,
|
||||
.KHR_timeline_semaphore = true,
|
||||
|
|
@ -214,25 +213,10 @@ kk_get_device_features(
|
|||
.samplerMirrorClampToEdge = true,
|
||||
.scalarBlockLayout = true,
|
||||
.separateDepthStencilLayouts = true,
|
||||
/* TODO_KOSMICKRISP shaderFloat16
|
||||
* Failing:
|
||||
* dEQP-VK.spirv_assembly.instruction.*.float16.opcompositeinsert.*
|
||||
* dEQP-VK.memory_model.shared.16bit.nested_structs_arrays.*
|
||||
*/
|
||||
.shaderFloat16 = false,
|
||||
.shaderFloat16 = true,
|
||||
.shaderInputAttachmentArrayDynamicIndexing = true,
|
||||
.shaderInputAttachmentArrayNonUniformIndexing = true,
|
||||
/* TODO_KOSMICKRISP shaderInt8
|
||||
* Multiple MSL compiler crashes if we enable shaderInt8, need to
|
||||
* understand why and a workaround:
|
||||
* dEQP-VK.memory_model.shared.8bit.vector_types.9
|
||||
* dEQP-VK.memory_model.shared.8bit.basic_types.8
|
||||
* dEQP-VK.memory_model.shared.8bit.basic_arrays.2
|
||||
* dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.1
|
||||
* dEQP-VK.memory_model.shared.8bit.arrays_of_arrays.8
|
||||
* Probably more
|
||||
*/
|
||||
.shaderInt8 = false,
|
||||
.shaderInt8 = true,
|
||||
.shaderOutputLayer = true,
|
||||
.shaderOutputViewportIndex = true,
|
||||
.shaderSampledImageArrayNonUniformIndexing = true,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue