kk: Handle memory coherency for textures and buffers
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

M1 chips are more restrictive than M2 and above. We need to enforce memory
coherency when needed through "coherent" for buffer memory and
"memory_coherence_device" for textures. Without these the memory operations
are not visible to other threads.

Reviewed-by: Arcady Goldmints-Orlov <arcady@lunarg.com>
Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38595>
This commit is contained in:
Aitor Camacho 2025-11-23 15:37:54 +09:00 committed by Marge Bot
parent 3aa4be52f3
commit bdaff0b457
3 changed files with 71 additions and 19 deletions

View file

@ -2881,7 +2881,7 @@ intrinsic("is_null_descriptor", src_comp=[-1], dest_comp=1, flags=[CAN_ELIMINATE
# given binding
load("buffer_ptr_kk", [], [BINDING], [CAN_ELIMINATE, CAN_REORDER])
# Opaque texture<T> handle, with DEST_TYPE representing T
load("texture_handle_kk", [1], [DEST_TYPE, IMAGE_DIM, IMAGE_ARRAY, FLAGS], [CAN_ELIMINATE])
load("texture_handle_kk", [1], [DEST_TYPE, IMAGE_DIM, IMAGE_ARRAY, ACCESS, FLAGS], [CAN_ELIMINATE])
# Same as above but for depth<T> textures, T is always float
load("depth_texture_kk", [1], [IMAGE_DIM, IMAGE_ARRAY], [CAN_ELIMINATE])
# Load a bindless sampler handle mapping a binding table sampler.

View file

@ -1069,10 +1069,21 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
P(ctx, "(ulong)&buf%d.contents[0];\n", nir_intrinsic_binding(instr));
break;
case nir_intrinsic_load_global: {
src_to_packed_load(ctx, &instr->src[0], "device",
msl_type_for_def(ctx->types, &instr->def),
enum gl_access_qualifier access = nir_intrinsic_access(instr);
const char *type = msl_type_for_def(ctx->types, &instr->def);
const char *addressing =
access & ACCESS_COHERENT ? "coherent device" : "device";
if (access & ACCESS_ATOMIC) {
assert(instr->num_components == 1u &&
"We can only do single component with atomics");
P(ctx, "atomic_load_explicit((%s atomic_%s*)", addressing, type);
src_to_msl(ctx, &instr->src[0]);
P(ctx, ", memory_order_relaxed);\n");
} else {
src_to_packed_load(ctx, &instr->src[0], addressing, type,
instr->def.num_components);
P(ctx, ";\n");
}
break;
}
case nir_intrinsic_load_global_constant: {
@ -1094,9 +1105,24 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_load_global_constant_offset: {
src_to_packed_load_offset(ctx, &instr->src[0], &instr->src[1], "device",
enum gl_access_qualifier access = nir_intrinsic_access(instr);
const char *type = msl_type_for_def(ctx->types, &instr->def);
const char *addressing =
access & ACCESS_COHERENT ? "coherent device" : "device";
if (access & ACCESS_ATOMIC) {
assert(instr->num_components == 1u &&
"We can only do single component with atomics");
P(ctx, "atomic_load_explicit((%s atomic_%s*)(", addressing, type);
src_to_msl(ctx, &instr->src[0]);
P(ctx, "+");
src_to_msl(ctx, &instr->src[1]);
P(ctx, ", memory_order_relaxed);\n");
} else {
src_to_packed_load_offset(ctx, &instr->src[0], &instr->src[1],
addressing,
msl_type_for_def(ctx->types, &instr->def),
instr->def.num_components);
}
P(ctx, ";\n");
break;
}
@ -1113,8 +1139,21 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
atomic_swap_to_msl(ctx, instr, "threadgroup", true);
break;
case nir_intrinsic_store_global: {
enum gl_access_qualifier access = nir_intrinsic_access(instr);
const char *type = msl_type_for_src(ctx->types, &instr->src[0]);
src_to_packed_store(ctx, &instr->src[1], "device", type,
const char *addressing =
access & ACCESS_COHERENT ? "coherent device" : "device";
if (access & ACCESS_ATOMIC) {
assert(instr->num_components == 1u &&
"We can only do single component with atomics");
P_IND(ctx, "atomic_store_explicit((%s atomic_%s*)", addressing, type);
src_to_msl(ctx, &instr->src[1]);
P(ctx, ", ")
src_to_packed(ctx, &instr->src[0], type,
instr->src[0].ssa->num_components);
P(ctx, ", memory_order_relaxed);\n");
} else {
src_to_packed_store(ctx, &instr->src[1], addressing, type,
instr->src[0].ssa->num_components);
writemask_to_msl(ctx, nir_intrinsic_write_mask(instr),
instr->num_components);
@ -1122,6 +1161,7 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
src_to_packed(ctx, &instr->src[0], type,
instr->src[0].ssa->num_components);
P(ctx, ";\n");
}
break;
}
case nir_intrinsic_barrier: {
@ -1243,6 +1283,9 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_texture_handle_kk: {
const char *access = "";
const char *coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT
? ", memory_coherence_device"
: "";
switch (nir_intrinsic_flags(instr)) {
case MSL_ACCESS_READ:
access = ", access::read";
@ -1252,15 +1295,23 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
break;
case MSL_ACCESS_READ_WRITE:
access = ", access::read_write";
/* TODO_KOSMICKRISP We shouldn't need this line below but it doesn't
* seem we get the correct access values for what should be device
* coherent textures from NIR. Example test:
* dEQP-VK.memory_model.message_passing.ext.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_local.buffer.comp
* This test declares the texture as devicecoherent, but in NIR it
* appears as resctrict only with no coherent.
*/
coherent = ", memory_coherence_device";
break;
}
P_IND(ctx, "texture%s%s<%s%s> t%d = *(constant texture%s%s<%s%s>*)",
P_IND(ctx, "texture%s%s<%s%s%s> t%d = *(constant texture%s%s<%s%s%s>*)",
texture_dim(nir_intrinsic_image_dim(instr)),
nir_intrinsic_image_array(instr) ? "_array" : "",
tex_type_name(nir_intrinsic_dest_type(instr)), access,
tex_type_name(nir_intrinsic_dest_type(instr)), access, coherent,
instr->def.index, texture_dim(nir_intrinsic_image_dim(instr)),
nir_intrinsic_image_array(instr) ? "_array" : "",
tex_type_name(nir_intrinsic_dest_type(instr)), access);
tex_type_name(nir_intrinsic_dest_type(instr)), access, coherent);
src_to_msl(ctx, &instr->src[0]);
P(ctx, ";\n");
break;

View file

@ -379,6 +379,7 @@ lower_image_intrin(nir_builder *b, nir_intrinsic_instr *intr,
b, 1, 64, resource_addr, .dest_type = type,
.image_dim = nir_intrinsic_image_dim(intr),
.image_array = nir_intrinsic_image_array(intr),
.access = nir_intrinsic_access(intr) | var->data.access,
.flags = msl_convert_access_flag(var->data.access));
nir_rewrite_image_intrinsic(intr, handle, true);