kk: Support shaderCullDistance
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Uses an approach based on HoneyKrisp. In the vertex shader, an
extra output is written for each cull distance: 1 if the cull distance
is >= 0, otherwise 0. In the fragment shader, if any of these extra
outputs interpolates to zero, the corresponding cull distance is < 0 at
every vertex, so the primitive is culled by discarding its fragments.

Reviewed-by: Arcady Goldmints-Orlov <arcady@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41463>
This commit is contained in:
squidbus 2026-05-09 23:21:25 -07:00 committed by Marge Bot
parent e81a292165
commit 2f63d09270
9 changed files with 150 additions and 41 deletions

View file

@ -163,7 +163,6 @@ compile(void *memctx, const uint32_t *spirv, size_t spirv_size)
NIR_PASS(_, nir, nir_opt_idiv_const, 16);
msl_lower_textures(nir);
msl_lower_nir_late(nir);
optimize(nir);

View file

@ -68,6 +68,12 @@ static const struct {
[VARYING_SLOT_VIEWPORT] = {"viewport_array_index"},
[VARYING_SLOT_CLIP_DIST0] = {"clip_0", .user = true, .scalarized = true},
[VARYING_SLOT_CLIP_DIST1] = {"clip_1", .user = true, .scalarized = true},
[VARYING_SLOT_CULL_DIST0] = {"cull_0", .user = true, .scalarized = true},
[VARYING_SLOT_CULL_DIST1] = {"cull_1", .user = true, .scalarized = true},
[VARYING_SLOT_CULL_PRIMITIVE] = {"cull_primitive_0", .user = true, .scalarized = true},
/* The cull primitive slots are used to emulate cull distances in the
 * fragment shader. More than four cull distances need a second slot, so the
 * emulation may extend into the next varying slot (otherwise unused). */
[VARYING_SLOT_CULL_PRIMITIVE + 1] = {"cull_primitive_1", .user = true, .scalarized = true},
[VARYING_SLOT_VAR0] = {"vary_00", .user = true},
[VARYING_SLOT_VAR1] = {"vary_01", .user = true},
[VARYING_SLOT_VAR2] = {"vary_02", .user = true},
@ -183,7 +189,7 @@ vs_output_block(nir_shader *shader, struct nir_to_msl_ctx *ctx)
}
if (shader->info.clip_distance_array_size)
P_IND(ctx, "float gl_ClipDistance [[clip_distance]] [%d];",
P_IND(ctx, "float gl_ClipDistance [[clip_distance]] [%d];\n",
shader->info.clip_distance_array_size);
ctx->indentlevel--;
P(ctx, "};\n");

View file

@ -413,28 +413,141 @@ msl_nir_lower_sample_shading(nir_shader *nir)
}
static bool
lower_clip_distance(nir_builder *b, nir_intrinsic_instr *intr, void *data)
lower_clip_cull_distance_write(nir_builder *b, nir_intrinsic_instr *intr,
UNUSED void *data)
{
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
nir_io_semantics io = nir_intrinsic_io_semantics(intr);
unsigned component = nir_intrinsic_component(intr);
if (io.location != VARYING_SLOT_CLIP_DIST0 &&
io.location != VARYING_SLOT_CLIP_DIST1)
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (sem.location != VARYING_SLOT_CLIP_DIST0 &&
sem.location != VARYING_SLOT_CLIP_DIST1 &&
sem.location != VARYING_SLOT_CULL_DIST0 &&
sem.location != VARYING_SLOT_CULL_DIST1)
return false;
unsigned base = (io.location - VARYING_SLOT_CLIP_DIST0) * 4 + component;
if (intr->intrinsic == nir_intrinsic_store_output) {
assert(nir_src_num_components(intr->src[0]) == 1 && "must be scalarized");
signed location = sem.location + nir_src_as_uint(intr->src[1]);
if (sem.location == VARYING_SLOT_CLIP_DIST0 ||
sem.location == VARYING_SLOT_CLIP_DIST1) {
/* Clip distance, add write to MSL clip_distance output */
unsigned component =
(location - VARYING_SLOT_CLIP_DIST0) * 4 +
nir_intrinsic_component(intr);
b->cursor = nir_after_instr(&intr->instr);
nir_store_clip_distance_kk(b, intr->src[0].ssa, .base = base);
nir_store_clip_distance_kk(b, intr->src[0].ssa, .base = component);
return true;
}
if (sem.location == VARYING_SLOT_CULL_DIST0 ||
sem.location == VARYING_SLOT_CULL_DIST1) {
/* Cull distance, add write to cull primitive output */
unsigned component =
(location - VARYING_SLOT_CULL_DIST0) * 4 +
nir_intrinsic_component(intr);
b->cursor = nir_before_instr(&intr->instr);
nir_def *offs = nir_imm_int(b, component / 4);
nir_def *v = nir_b2f32(b, nir_fge_imm(b, intr->src[0].ssa, 0.0));
nir_store_output(b, v, offs, .component = component % 4,
.src_type = nir_type_float32,
.io_semantics.location = VARYING_SLOT_CULL_PRIMITIVE,
.io_semantics.num_slots = 2);
return true;
}
return false;
}
/* Output-side half of the clip/cull distance lowering.
 *
 * Runs lower_clip_cull_distance_write over every intrinsic in the shader and,
 * when the shader declares cull distances, marks the VARYING_SLOT_CULL_PRIMITIVE
 * slot(s) that carry the emulated cull values as written (one slot per four
 * cull distances).
 *
 * Returns false without touching the shader when it declares neither clip nor
 * cull distances; otherwise returns true. */
static bool
msl_nir_lower_clip_cull_distance_vs(nir_shader *s)
{
   const unsigned num_clip = s->info.clip_distance_array_size;
   const unsigned num_cull = s->info.cull_distance_array_size;

   /* Nothing to lower when neither array is present. */
   if (!num_clip && !num_cull)
      return false;

   nir_shader_intrinsics_pass(s, lower_clip_cull_distance_write,
                              nir_metadata_control_flow, NULL);

   /* The lowering emits extra stores to the cull primitive slots, so record
    * them in the shader info as well. */
   if (num_cull > 0) {
      s->info.outputs_written |=
         BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE,
                          DIV_ROUND_UP(num_cull, 4));
   }

   return true;
}
bool
msl_nir_lower_clip_distance(nir_shader *nir)
static bool
msl_nir_lower_cull_distance_fs(nir_shader *s, unsigned nr_distances)
{
return nir_shader_intrinsics_pass(nir, lower_clip_distance, nir_metadata_all,
NULL);
assert(s->info.stage == MESA_SHADER_FRAGMENT);
if (nr_distances == 0)
return false;
nir_builder b_ =
nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
nir_builder *b = &b_;
/* Test each half-space */
nir_def *culled = nir_imm_false(b);
for (unsigned i = 0; i < nr_distances; ++i) {
/* Load the cull primitive input for this cull distance */
nir_def *baryc = nir_load_barycentric_pixel(
b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
nir_def *cull = nir_load_interpolated_input(
b, 1, 32, baryc, nir_imm_int(b, 0),
.component = i & 3,
.io_semantics.location = VARYING_SLOT_CULL_PRIMITIVE + (i / 4),
.io_semantics.num_slots = nr_distances / 4);
/* When the cull distance is negative in the vertex shader, the resulting
* cull primitive output is zero, otherwise it is one. Thus, the
* interpolated value will be zero only if all of its vertices had
* negative cull distances, indicating the primitive should be culled.
* Note that, since the value is interpolated at the pixel center, we
* don't have to worry about corner values. */
culled = nir_ior(b, culled, nir_ball(b, nir_feq_imm(b, cull, 0)));
}
/* Emulate primitive culling by discarding fragments */
nir_demote_if(b, culled);
s->info.inputs_read |= BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE,
DIV_ROUND_UP(nr_distances, 4));
s->info.fs.uses_discard = true;
return nir_progress(true, b->impl, nir_metadata_control_flow);
}
/* Filter for nir_lower_io_to_scalar: selects the stores to CLIP_DIST* and
 * CULL_DIST* varyings that must be scalarized.
 *
 * Returns true only for store_output intrinsics whose I/O semantics location
 * is one of the clip or cull distance slots. The data pointer is unused. */
static bool
scalarize_clip_cull_distance_filter(const nir_intrinsic_instr *intrin,
                                    UNUSED const void *_data)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   switch (nir_intrinsic_io_semantics(intrin).location) {
   case VARYING_SLOT_CLIP_DIST0:
   case VARYING_SLOT_CLIP_DIST1:
   case VARYING_SLOT_CULL_DIST0:
   case VARYING_SLOT_CULL_DIST1:
      return true;
   default:
      return false;
   }
}
/* Entry point for clip/cull distance lowering.
 *
 * First scalarizes the output stores selected by
 * scalarize_clip_cull_distance_filter and runs
 * nir_separate_merged_clip_cull_io. Then it dispatches on the shader stage:
 * fragment shaders get msl_nir_lower_cull_distance_fs, every other stage gets
 * msl_nir_lower_clip_cull_distance_vs.
 *
 * num_cull_distances is only forwarded to the fragment-shader path. */
void
msl_nir_lower_clip_cull_distance(nir_shader *nir, unsigned num_cull_distances)
{
   const bool is_fragment = nir->info.stage == MESA_SHADER_FRAGMENT;

   NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out,
            scalarize_clip_cull_distance_filter, NULL);
   NIR_PASS(_, nir, nir_separate_merged_clip_cull_io);

   if (!is_fragment) {
      NIR_PASS(_, nir, msl_nir_lower_clip_cull_distance_vs);
   } else {
      NIR_PASS(_, nir, msl_nir_lower_cull_distance_fs, num_cull_distances);
   }
}

View file

@ -86,5 +86,3 @@ bool msl_def_is_sampler(struct nir_to_msl_ctx *ctx, nir_def *def);
void msl_nir_lower_subgroups(nir_shader *nir);
bool msl_nir_lower_algebraic_late(nir_shader *shader);
bool msl_nir_lower_clip_distance(nir_shader *nir);

View file

@ -2125,26 +2125,6 @@ void msl_preprocess_nir_workarounds(struct nir_shader *nir,
}
}
/* Scalarize stores to CLIP_DIST* varyings.
 *
 * Filter callback passed to nir_lower_io_to_scalar by msl_lower_nir_late():
 * returns true only for store_output intrinsics whose I/O semantics location
 * is VARYING_SLOT_CLIP_DIST0 or VARYING_SLOT_CLIP_DIST1. _data is unused. */
static bool
scalarize_clip_distance_filter(const nir_intrinsic_instr *intrin,
UNUSED const void *_data)
{
/* Only output stores are candidates. */
if (intrin->intrinsic != nir_intrinsic_store_output)
return false;
nir_io_semantics semantics = nir_intrinsic_io_semantics(intrin);
return semantics.location == VARYING_SLOT_CLIP_DIST0 ||
semantics.location == VARYING_SLOT_CLIP_DIST1;
}
/* Late NIR lowering step: scalarizes the output stores selected by
 * scalarize_clip_distance_filter, then runs msl_nir_lower_clip_distance on
 * the shader. */
void
msl_lower_nir_late(nir_shader *nir)
{
NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out,
scalarize_clip_distance_filter, NULL);
NIR_PASS(_, nir, msl_nir_lower_clip_distance);
}
static void
msl_gather_info(struct nir_to_msl_ctx *ctx, struct nir_to_msl_options *options)
{

View file

@ -76,7 +76,8 @@ bool msl_nir_fs_io_types(nir_shader *nir);
bool msl_nir_vs_io_types(nir_shader *nir);
bool msl_nir_fake_guard_for_discards(struct nir_shader *nir);
bool msl_nir_lower_sample_shading(nir_shader *nir);
void msl_lower_nir_late(nir_shader *nir);
void msl_nir_lower_clip_cull_distance(nir_shader *nir,
unsigned num_cull_distances);
bool msl_gather_uses_per_draw_data(nir_shader *nir);

View file

@ -193,6 +193,7 @@ kk_get_device_features(
.samplerAnisotropy = true,
.sampleRateShading = true,
.shaderClipDistance = true,
.shaderCullDistance = true,
.shaderImageGatherExtended = true,
.shaderInt16 = true,
.shaderInt64 = true,

View file

@ -643,6 +643,7 @@ gather_shader_info(struct kk_shader *shader, nir_shader *nir,
if (nir->info.stage == MESA_SHADER_VERTEX) {
nir_shader_intrinsics_pass(nir, gather_vs_inputs, nir_metadata_all,
&shader->info.vs.attribs_read);
shader->info.vs.num_cull_distances = nir->info.cull_distance_array_size;
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
/* Some meta shaders like vk-meta-resolve will have depth_layout as NONE
* which is not a valid Metal layout */
@ -687,6 +688,7 @@ modify_nir_info(nir_shader *nir)
static VkResult
kk_compile_shader(struct kk_device *dev, struct vk_shader_compile_info *info,
struct kk_shader *prev_stage,
const struct vk_graphics_pipeline_state *state,
const VkAllocationCallbacks *pAllocator,
struct vk_shader **shader_out)
@ -715,7 +717,9 @@ kk_compile_shader(struct kk_device *dev, struct vk_shader_compile_info *info,
if (info->stage == MESA_SHADER_VERTEX) {
kk_lower_vs_vbo(nir, state, info->robustness);
}
msl_lower_nir_late(nir);
unsigned num_cull_distances =
prev_stage ? prev_stage->info.vs.num_cull_distances : 0;
msl_nir_lower_clip_cull_distance(nir, num_cull_distances);
msl_optimize_nir(nir);
modify_nir_info(nir);
@ -780,7 +784,7 @@ kk_compile_nir_shader(struct kk_device *dev, nir_shader *nir,
struct vk_shader *shader = NULL;
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
VkResult result = kk_compile_shader(dev, &info, NULL, alloc, &shader);
VkResult result = kk_compile_shader(dev, &info, NULL, NULL, alloc, &shader);
if (result != VK_SUCCESS)
return result;
@ -1162,8 +1166,11 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count,
nir_opts, NULL);
for (uint32_t i = 0; i < shader_count; i++) {
struct kk_shader *prev_stage = i > 0 ?
container_of(shaders_out[i - 1], struct kk_shader, vk) : NULL;
result =
kk_compile_shader(dev, &infos[i], state, pAllocator, &shaders_out[i]);
kk_compile_shader(dev, &infos[i], prev_stage, state, pAllocator,
&shaders_out[i]);
if (result != VK_SUCCESS) {
/* Clean up all the shaders before this point */
for (uint32_t j = 0; j < i; j++)
@ -1203,7 +1210,8 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count,
};
struct vk_shader *frag_shader;
result =
kk_compile_shader(dev, &info, state, &dev->vk.alloc, &frag_shader);
kk_compile_shader(dev, &info, fs, state, &dev->vk.alloc,
&frag_shader);
if (result != VK_SUCCESS) {
for (uint32_t i = 0; i < shader_count; i++)

View file

@ -33,6 +33,9 @@ struct kk_shader_info {
uint32_t attribs_read;
uint32_t sample_count;
/* Required for fragment shader cull distance discards. */
uint8_t num_cull_distances;
/* Data needed for serialization. */
enum mtl_primitive_topology_class topology;
enum mtl_pixel_format rt_formats[MAX_DRAW_BUFFERS];