diff --git a/docs/drivers/kosmickrisp/workarounds.rst b/docs/drivers/kosmickrisp/workarounds.rst index 6e556704cf1..7a5f725ff81 100644 --- a/docs/drivers/kosmickrisp/workarounds.rst +++ b/docs/drivers/kosmickrisp/workarounds.rst @@ -49,6 +49,36 @@ info on what was updated. Workarounds =========== +KK_WORKAROUND_7 +--------------- +| macOS version: 26.0.1 +| Metal ticket: Not reported +| Metal ticket status: +| CTS test failure: ``dEQP-VK.renderpasses.renderpass2.depth_stencil_resolve.image_2d_*testing_stencil_samplemask`` +| Comments: + +Metal seems to ignore the sample_mask output for the stencil attachment in +cases where we have no color attachments, a multisample depth_stencil attachment +with at least 2 samples and a fragment shader that only writes the depth and +a static sample_mask out. + +My conclusion is that they may try to prematurely optimize by doing early +fragment testing, completely disregarding the sample_mask output and applying +the value to all stencil samples. + +The failing tests do something along the lines of: start a render pass with +depth_stencil cleared to 0.0f and 0 respectively; if the depth test passes, set +stencil to 1. Sample mask out is 1 (first sample). Draw a framebuffer-sized square +with 0.5f depth. End the render pass, storing values. Start a render pass loading the +previous output, but if depth passes, stencil will be set to 255. Sample mask out +is 2 (second sample). Draw a framebuffer-sized square with 0.5f depth. + +In a similar fashion to the workaround 2 entries below this one, we do a conditional +discard at the end, discarding fragments not covered by the coverage_mask. 
+ +| Log: +| 2026-02-05: Workaround implemented + KK_WORKAROUND_6 --------------- | macOS version: 26.0.1 diff --git a/src/kosmickrisp/compiler/msl_iomap.c b/src/kosmickrisp/compiler/msl_iomap.c index 9182023a34b..9885c3157da 100644 --- a/src/kosmickrisp/compiler/msl_iomap.c +++ b/src/kosmickrisp/compiler/msl_iomap.c @@ -184,11 +184,78 @@ vs_output_block(nir_shader *shader, struct nir_to_msl_ctx *ctx) if (shader->info.clip_distance_array_size) P_IND(ctx, "float gl_ClipDistance [[clip_distance]] [%d];", - shader->info.clip_distance_array_size); + shader->info.clip_distance_array_size); ctx->indentlevel--; P(ctx, "};\n"); } +static void +fs_input_interpolant(struct nir_to_msl_ctx *ctx, uint64_t location) +{ + struct io_slot_info info = ctx->inputs_info[location]; + bool scalarized = VARYING_SLOT_INFO[location].scalarized; + const char *type = alu_type_to_string(info.type); + const char *vector_suffix = + scalarized ? "" : vector_suffixes[info.num_components]; + unsigned components = scalarized ? info.num_components : 1; + const char *interp = ""; + switch (info.interpolation) { + case INTERP_MODE_SMOOTH: + interp = "interpolation::perspective"; + break; + case INTERP_MODE_NOPERSPECTIVE: + interp = "interpolation::no_perspective"; + break; + default: + UNREACHABLE("Only perspective and no-perspective are allowed"); + } + for (int c = 0; c < components; c++) { + P_IND(ctx, "interpolant<%s%s, %s> ", type, vector_suffix, interp); + varying_slot_name(ctx, location, c); + P(ctx, " "); + varying_slot_semantic(ctx, location, c); + P(ctx, ";\n"); + } +} + +static void +fs_input_no_interpolant(struct nir_to_msl_ctx *ctx, uint64_t location) +{ + struct io_slot_info info = ctx->inputs_info[location]; + bool scalarized = VARYING_SLOT_INFO[location].scalarized; + const char *type = alu_type_to_string(info.type); + const char *vector_suffix = + scalarized ? "" : vector_suffixes[info.num_components]; + unsigned components = scalarized ? 
info.num_components : 1; + const char *interp = ""; + switch (info.interpolation) { + case INTERP_MODE_NOPERSPECTIVE: + if (info.centroid) + interp = "[[centroid_no_perspective]]"; + else if (info.sample) + interp = "[[sample_no_perspective]]"; + else + interp = "[[center_no_perspective]]"; + break; + case INTERP_MODE_FLAT: + interp = "[[flat]]"; + break; + default: + if (info.centroid) + interp = "[[centroid_perspective]]"; + else if (info.sample) + interp = "[[sample_perspective]]"; + break; + } + for (int c = 0; c < components; c++) { + P_IND(ctx, "%s%s ", type, vector_suffix); + varying_slot_name(ctx, location, c); + P(ctx, " "); + varying_slot_semantic(ctx, location, c); + P(ctx, " %s;\n", interp); + } +} + /* Generate the struct definition for the fragment shader input argument */ static void fs_input_block(nir_shader *shader, struct nir_to_msl_ctx *ctx) @@ -197,38 +264,10 @@ fs_input_block(nir_shader *shader, struct nir_to_msl_ctx *ctx) ctx->indentlevel++; u_foreach_bit64(location, shader->info.inputs_read) { struct io_slot_info info = ctx->inputs_info[location]; - bool scalarized = VARYING_SLOT_INFO[location].scalarized; - const char *type = alu_type_to_string(info.type); - const char *vector_suffix = - scalarized ? "" : vector_suffixes[info.num_components]; - const char *interp = ""; - switch (info.interpolation) { - case INTERP_MODE_NOPERSPECTIVE: - if (info.centroid) - interp = "[[centroid_no_perspective]]"; - else if (info.sample) - interp = "[[sample_no_perspective]]"; - else - interp = "[[center_no_perspective]]"; - break; - case INTERP_MODE_FLAT: - interp = "[[flat]]"; - break; - default: - if (info.centroid) - interp = "[[centroid_perspective]]"; - else if (info.sample) - interp = "[[sample_perspective]]"; - break; - } - unsigned components = scalarized ? 
info.num_components : 1; - for (int c = 0; c < components; c++) { - P_IND(ctx, "%s%s ", type, vector_suffix); - varying_slot_name(ctx, location, c); - P(ctx, " "); - varying_slot_semantic(ctx, location, c); - P(ctx, " %s;\n", interp); - } + if (info.uses_interpolant) + fs_input_interpolant(ctx, location); + else + fs_input_no_interpolant(ctx, location); } /* Enable reading from framebuffer */ @@ -302,6 +341,9 @@ msl_nir_gather_io_info(nir_builder *b, nir_intrinsic_instr *intrin, void *data) interp_intrin->intrinsic == nir_intrinsic_load_barycentric_centroid; ctx->input[location].sample = interp_intrin->intrinsic == nir_intrinsic_load_barycentric_sample; + ctx->input[location].uses_interpolant |= + interp_intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample || + interp_intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset; break; } case nir_intrinsic_load_input: { diff --git a/src/kosmickrisp/compiler/msl_nir_lower_common.c b/src/kosmickrisp/compiler/msl_nir_lower_common.c index 12a9582ed1e..0f715849305 100644 --- a/src/kosmickrisp/compiler/msl_nir_lower_common.c +++ b/src/kosmickrisp/compiler/msl_nir_lower_common.c @@ -131,8 +131,10 @@ msl_replace_load_sample_mask_in_for_static_sample_mask( if (intr->intrinsic != nir_intrinsic_load_sample_mask_in) return false; - nir_def *sample_mask = (nir_def *)data; - nir_def_rewrite_uses(&intr->def, sample_mask); + b->cursor = nir_after_instr(&intr->instr); + nir_def *static_sample_mask = (nir_def *)data; + nir_def *sample_mask = nir_iand(b, &intr->def, static_sample_mask); + nir_def_rewrite_uses_after(&intr->def, sample_mask); return true; } @@ -150,14 +152,15 @@ msl_lower_static_sample_mask(nir_shader *nir, uint32_t sample_mask) .location = FRAG_RESULT_SAMPLE_MASK, .num_slots = 1u, }; - nir_def *sample_mask_def = nir_imm_int(&b, sample_mask); - nir_store_output(&b, sample_mask_def, nir_imm_int(&b, 0u), .base = 0u, - .range = 1u, .write_mask = 0x1, .component = 0u, + nir_def *static_sample_mask = 
nir_imm_int(&b, sample_mask); + nir_store_output(&b, nir_load_sample_mask_in(&b), nir_imm_int(&b, 0u), + .base = 0u, .range = 1u, .write_mask = 0x1, .component = 0u, .src_type = nir_type_uint32, .io_semantics = io_semantics); + BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); - return nir_shader_intrinsics_pass( + nir_shader_intrinsics_pass( nir, msl_replace_load_sample_mask_in_for_static_sample_mask, - nir_metadata_control_flow, sample_mask_def); + nir_metadata_control_flow, static_sample_mask); return true; } @@ -309,6 +312,73 @@ msl_nir_fake_guard_for_discards(struct nir_shader *nir) nir_metadata_control_flow, NULL); } +/* Returns true if gl_SampleID is required. */ +static bool +gather_fs_input_interpolant_usage(nir_builder *b, nir_intrinsic_instr *intr, + void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_interpolated_input) + return false; + + bool *uses_interpolant = (bool *)data; + struct nir_io_semantics io = nir_intrinsic_io_semantics(intr); + nir_intrinsic_instr *interpolation = nir_src_as_intrinsic(intr->src[0]); + bool at_sample = + interpolation->intrinsic == nir_intrinsic_load_barycentric_at_sample; + uses_interpolant[io.location] |= + at_sample || + interpolation->intrinsic == nir_intrinsic_load_barycentric_at_offset; + return at_sample; +} + +static bool +lower_sample_shading(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic == nir_intrinsic_load_frag_coord) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *offset = + nir_fadd(b, nir_load_sample_pos_from_id(b, 32u, nir_load_sample_id(b)), + nir_imm_vec2(b, -0.5f, -0.5f)); + nir_def *sample_position = + nir_fadd(b, &intr->def, nir_pad_vector_imm_int(b, offset, 0u, 4u)); + nir_def_rewrite_uses_after(&intr->def, sample_position); + return true; + } + + if (intr->intrinsic != nir_intrinsic_load_interpolated_input) + return false; + + bool *uses_interpolant = (bool *)data; + struct nir_io_semantics io = nir_intrinsic_io_semantics(intr); 
+ nir_intrinsic_instr *interpolation = nir_src_as_intrinsic(intr->src[0]); + if (!uses_interpolant[io.location] || + interpolation->intrinsic != nir_intrinsic_load_barycentric_sample) + return false; + + b->cursor = nir_before_instr(&intr->instr); + nir_def *def = nir_load_barycentric_at_sample( + b, intr->def.bit_size, nir_load_sample_id(b), + .interp_mode = nir_intrinsic_interp_mode(interpolation)); + nir_def_rewrite_uses_after(&interpolation->def, def); + nir_instr_remove(&interpolation->instr); + return false; +} + +bool +msl_nir_lower_sample_shading(nir_shader *nir) +{ + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + + bool uses_interpolant[NUM_TOTAL_VARYING_SLOTS] = {}; + + if (nir_shader_intrinsics_pass(nir, gather_fs_input_interpolant_usage, + nir_metadata_all, uses_interpolant)) + BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID); + + return nir_shader_intrinsics_pass( + nir, lower_sample_shading, nir_metadata_control_flow, uses_interpolant); +} + static bool lower_clip_distance(nir_builder *b, nir_intrinsic_instr *intr, void *data) { diff --git a/src/kosmickrisp/compiler/msl_private.h b/src/kosmickrisp/compiler/msl_private.h index b33ae95db8f..e9588f7c3cb 100644 --- a/src/kosmickrisp/compiler/msl_private.h +++ b/src/kosmickrisp/compiler/msl_private.h @@ -15,6 +15,7 @@ struct io_slot_info { unsigned num_components; bool centroid; bool sample; + bool uses_interpolant; }; struct nir_to_msl_ctx { diff --git a/src/kosmickrisp/compiler/msl_type_inference.c b/src/kosmickrisp/compiler/msl_type_inference.c index 6258ea5f593..809c818f8c8 100644 --- a/src/kosmickrisp/compiler/msl_type_inference.c +++ b/src/kosmickrisp/compiler/msl_type_inference.c @@ -276,6 +276,10 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr) set_type(types, &instr->def, ty); break; } + case nir_intrinsic_load_sample_pos_from_id: + set_type(types, &instr->def, TYPE_FLOAT); + set_type(types, &instr->src[0], TYPE_UINT); + break; case 
nir_intrinsic_load_global_constant: set_type(types, &instr->def, TYPE_GENERIC_DATA); set_type(types, &instr->src[0], TYPE_UINT); @@ -356,6 +360,7 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr) set_type(types, &instr->def, TYPE_UINT); break; case nir_intrinsic_load_vulkan_descriptor: + case nir_intrinsic_load_barycentric_at_sample: set_type(types, &instr->src[0], TYPE_UINT); set_type(types, &instr->def, TYPE_UINT); break; @@ -380,10 +385,12 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr) case nir_intrinsic_ddy_coarse: case nir_intrinsic_ddx_fine: case nir_intrinsic_ddy_fine: + case nir_intrinsic_load_barycentric_at_offset: set_type(types, &instr->src[0], TYPE_FLOAT); set_type(types, &instr->def, TYPE_FLOAT); break; case nir_intrinsic_load_point_coord: + case nir_intrinsic_load_sample_pos: set_type(types, &instr->def, TYPE_FLOAT); break; case nir_intrinsic_load_front_face: diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c index bc753ade1bf..7502bb8c96a 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.c +++ b/src/kosmickrisp/compiler/nir_to_msl.c @@ -852,6 +852,34 @@ memory_modes_to_msl(struct nir_to_msl_ctx *ctx, nir_variable_mode modes) P(ctx, "mem_flags::mem_none"); } +static void +msl_interpolant_method(struct nir_to_msl_ctx *ctx, nir_src *src) +{ + nir_intrinsic_instr *interpolation_intrin = nir_src_as_intrinsic(*src); + switch (interpolation_intrin->intrinsic) { + case nir_intrinsic_load_barycentric_at_offset: + P(ctx, ".interpolate_at_offset(") + src_to_msl(ctx, &interpolation_intrin->src[0u]); + /* Need to add 0.5f since GLSL.std.450 marks the pixel center as the + * coordinate while Metal is at top-left. 
*/ + P(ctx, " + 0.5f)"); + break; + case nir_intrinsic_load_barycentric_centroid: + P(ctx, ".interpolate_at_centroid()"); + break; + case nir_intrinsic_load_barycentric_at_sample: + P(ctx, ".interpolate_at_sample("); + src_to_msl(ctx, &interpolation_intrin->src[0u]); + P(ctx, ")"); + break; + case nir_intrinsic_load_barycentric_pixel: + P(ctx, ".interpolate_at_center()"); + break; + default: + break; + } +} + static void intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) { @@ -859,7 +887,9 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) * don't generate any code. */ if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel || instr->intrinsic == nir_intrinsic_load_barycentric_centroid || - instr->intrinsic == nir_intrinsic_load_barycentric_sample) + instr->intrinsic == nir_intrinsic_load_barycentric_sample || + instr->intrinsic == nir_intrinsic_load_barycentric_at_sample || + instr->intrinsic == nir_intrinsic_load_barycentric_at_offset) return; const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; @@ -978,7 +1008,10 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) P(ctx, "gl_SampleID;\n"); break; case nir_intrinsic_load_sample_mask_in: - P(ctx, "gl_SampleMask;\n"); + P(ctx, "gl_SampleMask & (1u << gl_SampleID);\n"); + break; + case nir_intrinsic_load_sample_pos: + P(ctx, "get_sample_position(gl_SampleID);\n"); break; case nir_intrinsic_load_amplification_id_kk: P(ctx, "mtl_AmplificationID;\n"); @@ -990,6 +1023,8 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) uint32_t location = io.location + idx; msl_input_name(ctx, location, component); + if (ctx->inputs_info[location].uses_interpolant) + msl_interpolant_method(ctx, &instr->src[0u]); if (instr->num_components < msl_input_num_components(ctx, location)) { P(ctx, "."); for (unsigned i = 0; i < instr->num_components; i++) @@ -1013,6 +1048,12 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, 
nir_intrinsic_instr *instr) P(ctx, ";\n"); break; } + case nir_intrinsic_load_sample_pos_from_id: { + P(ctx, "get_sample_position("); + src_to_msl(ctx, &instr->src[0]); + P(ctx, ");\n"); + break; + } case nir_intrinsic_load_output: { unsigned idx = nir_src_as_uint(instr->src[0]); nir_io_semantics io = nir_intrinsic_io_semantics(instr); @@ -2043,6 +2084,25 @@ msl_gather_info(struct nir_to_msl_ctx *ctx, struct nir_to_msl_options *options) msl_gather_io_info(ctx, ctx->inputs_info, ctx->outputs_info); if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) { + /* Metal does not have a system value for sample position, only an + * entrypoint "get_sample_position" that takes the sample index, aka + * sample id. Ensure we have the required system values present. */ + if (BITSET_TEST(ctx->shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS)) { + BITSET_CLEAR(ctx->shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS); + BITSET_SET(ctx->shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_ID); + } + + /* Metal's sample mask has all the covered bits set, while we only care + * about the bit the fragment was generated for. This is why we need to + * compute it using sample id. 
*/ + if (BITSET_TEST(ctx->shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_MASK_IN)) + BITSET_SET(ctx->shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_ID); + for (uint32_t i = 0u; i < MAX_DRAW_BUFFERS; ++i) { uint32_t location = i + FRAG_RESULT_DATA0; ctx->outputs_info[location].num_components = diff --git a/src/kosmickrisp/compiler/nir_to_msl.h b/src/kosmickrisp/compiler/nir_to_msl.h index fc82f2e95fd..12fe6727468 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.h +++ b/src/kosmickrisp/compiler/nir_to_msl.h @@ -66,4 +66,5 @@ bool msl_ensure_vertex_position_output(nir_shader *nir); bool msl_nir_fs_io_types(nir_shader *nir); bool msl_nir_vs_io_types(nir_shader *nir); bool msl_nir_fake_guard_for_discards(struct nir_shader *nir); +bool msl_nir_lower_sample_shading(nir_shader *nir); void msl_lower_nir_late(nir_shader *nir); diff --git a/src/kosmickrisp/vulkan/kk_physical_device.c b/src/kosmickrisp/vulkan/kk_physical_device.c index 7aa05ba6da9..3cbae3a0ea8 100644 --- a/src/kosmickrisp/vulkan/kk_physical_device.c +++ b/src/kosmickrisp/vulkan/kk_physical_device.c @@ -171,6 +171,7 @@ kk_get_device_features( .occlusionQueryPrecise = true, .robustBufferAccess = true, .samplerAnisotropy = true, + .sampleRateShading = true, .shaderClipDistance = true, .shaderImageGatherExtended = true, .shaderInt16 = true, diff --git a/src/kosmickrisp/vulkan/kk_shader.c b/src/kosmickrisp/vulkan/kk_shader.c index 6da9933e8ce..0b662f5fd83 100644 --- a/src/kosmickrisp/vulkan/kk_shader.c +++ b/src/kosmickrisp/vulkan/kk_shader.c @@ -406,6 +406,13 @@ static void kk_lower_fs(struct kk_device *dev, nir_shader *nir, const struct vk_graphics_pipeline_state *state) { + /* msl_nir_lower_sample_shading needs to go before blending since + * nir_lower_blend will always set uses_sample_shading to true if there's any + * output read. I believe we do not need to lower it always, that is why it + * goes here but need to double check. 
*/ + if (nir->info.fs.uses_sample_shading) + NIR_PASS(_, nir, msl_nir_lower_sample_shading); + if (state->cb) kk_lower_fs_blend(nir, state); @@ -425,8 +432,19 @@ kk_lower_fs(struct kk_device *dev, nir_shader *nir, NIR_PASS(_, nir, msl_nir_fs_io_types); if (state->ms && state->ms->rasterization_samples && - state->ms->sample_mask != UINT16_MAX) + state->ms->sample_mask != UINT16_MAX) { + + /* KK_WORKAROUND_7 */ + if (!(dev->disabled_workarounds & BITFIELD64_BIT(7))) { + if (!nir->info.fs.early_fragment_tests) { + nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); + nir_builder b = nir_builder_at(nir_after_impl(entrypoint)); + nir_discard_if(&b, + nir_ieq_imm(&b, nir_load_sample_mask_in(&b), 0u)); + } + } NIR_PASS(_, nir, msl_lower_static_sample_mask, state->ms->sample_mask); + } /* Check https://github.com/KhronosGroup/Vulkan-Portability/issues/54 for * explanation on why we need this. */ else if (nir->info.fs.needs_full_quad_helper_invocations ||