microsoft/compiler, d3d12: preserve TCS outputs and pad TES inputs for cross-stage signature matching

Four linked D3D12 pipeline-validation problems with GLSL TCS on DXIL:

1) dxil_nir_kill_unused_outputs killed TCS outputs read back by the
   patch-constant function after a barrier, zeroing the tess factors.
   Keep shader_out locations with any intra-shader load_deref live
   regardless of next_stage_read_mask.

2) is_dead_in_variable dropped TES padding placeholders (no local
   uses) in nir_remove_dead_variables. Also honor
   prev_stage_written_mask so padded TES inputs stay alive.

3) Preserving (1) leaves HS with outputs the DS doesn't declare,
   breaking pipeline validation (e.g. piglit's barrier.shader_test).
   Add dxil_nir_pad_tes_input_signature, called from both link paths,
   to synthesize matching TES inputs (reusing each TCS output's type
   so sig shape and stride match byte-for-byte) plus the tess-level
   inputs -- subsuming the tess-level-only block previously in
   dxil_spirv_nir_link. Scope the per-variable padding to TCS
   outputs that TCS itself reads back via load_deref: outputs that
   neither TES nor TCS consumes get killed from the HS signature,
   so padding them into DS would make the DS input signature longer
   than HS output and break validation for SSO pipelines whose TCS
   declares unused per-patch writes (arb_separate_shader_objects/
   mix-and-match-tcs-tes).

4) remove_hs_intrinsics rewrote load_output but not
   load_per_vertex_output in HS main. With (1) keeping outputs alive,
   GLSL reads of outputs in main whose result survives DCE (UAV
   atomics, non-tess per-vertex output writes) left
   LoadOutputControlPoint in the control-point function, which dxil.dll
   rejects outside the PCF (CreatePipelineState then fails with
   E_INVALIDARG). Treat load_per_vertex_output like load_output.

Validated on piglit arb_tessellation_shader/execution (WARP + DXC
1.8.2403): barrier now passes; the previously-crashing
tcs-output-unmatched and variable-indexing/tcs-output-array-* fail
gracefully matching baseline; isoline/isoline-no-tcs remain flakes
(pre-existing canary corruption, unrelated).

d3d12-quick_shader.txt drops barrier; d3d12-flakes.txt adds
isoline-no-tcs alongside isoline.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41028>
This commit is contained in:
Virgile Bello 2026-04-20 10:13:29 +09:00 committed by Marge Bot
parent 1d923fdd2b
commit 50ab52f135
7 changed files with 160 additions and 17 deletions

View file

@ -1,5 +1,6 @@
# Tessellation isoline tests are flaky on WARP, probably a WARP bug
isoline
isoline-no-tcs
# If the test machine are busy, these can spuriously fail
arb_timer_query

View file

@ -15,7 +15,6 @@ spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash
# These tests use a TCS output variable only as temporary storage. Since the output
# is unused by the TES, we remove it.
spec@arb_tessellation_shader@execution@barrier,Fail
spec@arb_tessellation_shader@execution@barrier-patch,Crash
spec@arb_tessellation_shader@execution@variable-indexing@tcs-output-array-float-index-wr-before-barrier,Fail
spec@arb_tessellation_shader@execution@variable-indexing@tcs-output-array-vec2-index-rd-after-barrier,Fail

View file

@ -1173,6 +1173,13 @@ select_shader_variant(struct d3d12_selection_context *sel_ctx, d3d12_shader_sele
if (prev) {
NIR_PASS(_, new_nir_variant, dxil_nir_kill_undefined_varyings, key.prev_varying_outputs,
prev->initial->info.patch_outputs_written, key.prev_varying_frac_outputs);
/* Pad TES input signature so HS<->DS match for D3D12 pipeline validation. */
if (new_nir_variant->info.stage == MESA_SHADER_TESS_EVAL &&
prev->initial->info.stage == MESA_SHADER_TESS_CTRL) {
dxil_nir_pad_tes_input_signature(new_nir_variant, prev->initial);
}
dxil_reassign_driver_locations(new_nir_variant, nir_var_shader_in, key.prev_varying_outputs,
key.prev_varying_frac_outputs);
}

View file

@ -2699,11 +2699,38 @@ struct undefined_varying_masks {
uint64_t io_mask;
uint32_t patch_io_mask;
const BITSET_WORD *frac_io_mask;
/* Stage of the shader being processed; used to scope cross-stage
* preservation heuristics to TCS<->TES linkage where they apply. */
mesa_shader_stage stage;
/* For kill_unused_outputs: locations of shader_out variables that have at
* least one load_deref use in the same shader (e.g. TCS patch-constant
* function reading back per-vertex outputs). These must be kept alive even
* if the next stage doesn't read them. */
uint64_t intra_shader_load_mask;
uint32_t intra_shader_patch_load_mask;
};
static bool
is_dead_in_variable(nir_variable *var, void *data)
{
const struct undefined_varying_masks *masks = data;
/* Only TES needs this: placeholder tess-level inputs (synthesized by
* spirv_to_dxil for signature matching with HS) have no local uses but
* must be preserved. Other stages don't synthesize such placeholders,
* and it's perfectly valid for a prev stage to write outputs the next
* stage doesn't read. */
if (masks && masks->stage == MESA_SHADER_TESS_EVAL) {
unsigned loc = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
var->data.location - VARYING_SLOT_PATCH0 : var->data.location;
uint64_t written = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
masks->patch_io_mask : masks->io_mask;
if (BITFIELD64_RANGE(loc, glsl_varying_count(var->type)) & written)
return false;
}
switch (var->data.location) {
/* Only these values can be system generated values in addition to varyings */
case VARYING_SLOT_PRIMITIVE_ID:
@ -2774,7 +2801,8 @@ dxil_nir_kill_undefined_varyings(nir_shader *shader, uint64_t prev_stage_written
struct undefined_varying_masks masks = {
.io_mask = prev_stage_written_mask,
.patch_io_mask = prev_stage_patch_written_mask,
.frac_io_mask = prev_stage_frac_output_mask
.frac_io_mask = prev_stage_frac_output_mask,
.stage = shader->info.stage,
};
bool progress = nir_shader_instructions_pass(shader,
kill_undefined_varyings,
@ -2831,6 +2859,15 @@ kill_unused_outputs(struct nir_builder *b,
unsigned loc = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
var->data.location - VARYING_SLOT_PATCH0 :
var->data.location;
/* Outputs read back by load_deref in the same shader (e.g. TCS patch-
* constant function reading per-vertex outputs after a barrier) must
* stay alive even if the next stage doesn't consume them. */
uint64_t intra = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
masks->intra_shader_patch_load_mask : masks->intra_shader_load_mask;
if (BITFIELD64_RANGE(loc, glsl_varying_count(var->type)) & intra)
return false;
uint64_t read = var->data.patch && var->data.location >= VARYING_SLOT_PATCH0 ?
masks->patch_io_mask : masks->io_mask;
if (BITFIELD64_RANGE(loc, glsl_varying_count(var->type)) & read) {
@ -2856,9 +2893,36 @@ dxil_nir_kill_unused_outputs(nir_shader *shader, uint64_t next_stage_read_mask,
struct undefined_varying_masks masks = {
.io_mask = next_stage_read_mask,
.patch_io_mask = next_stage_patch_read_mask,
.frac_io_mask = next_stage_frac_input_mask
.frac_io_mask = next_stage_frac_input_mask,
.stage = shader->info.stage,
};
/* In TCS, the patch-constant function reads per-vertex outputs via
* output[i].x after a barrier; such outputs must be preserved even if
* the next stage doesn't consume them. load_deref on shader_out is only
* valid in TCS, so scope the scan to that stage. */
if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
nir_foreach_block(block, entrypoint) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_deref)
continue;
nir_variable *var = nir_intrinsic_get_var(intr, 0);
if (!var || var->data.mode != nir_var_shader_out)
continue;
if (var->data.patch && var->data.location >= VARYING_SLOT_PATCH0) {
unsigned loc = var->data.location - VARYING_SLOT_PATCH0;
masks.intra_shader_patch_load_mask |= BITFIELD_RANGE(loc, glsl_varying_count(var->type));
} else {
masks.intra_shader_load_mask |= BITFIELD64_RANGE(var->data.location, glsl_varying_count(var->type));
}
}
}
}
bool progress = nir_shader_instructions_pass(shader,
kill_unused_outputs,
nir_metadata_control_flow |
@ -2906,3 +2970,76 @@ dxil_nir_propagate_interp_to_outputs(nir_shader *prev_stage_nir,
}
}
}
/* Without this, an HS output that survives the kill pass (e.g. via the
* intra-shader scratchpad pattern) but has no matching DS input makes
* D3D12 CreatePipelineState fail with E_INVALIDARG. */
bool
dxil_nir_pad_tes_input_signature(nir_shader *tes, const nir_shader *tcs)
{
if (!tcs || !tes ||
tes->info.stage != MESA_SHADER_TESS_EVAL ||
tcs->info.stage != MESA_SHADER_TESS_CTRL)
return false;
bool added = false;
/* Pad only for outputs that TCS reads back itself -- the ones
* kill_unused_outputs keeps alive when TES doesn't read them. Padding
* DS for anything else produces a longer input sig than HS output. */
const uint64_t tcs_intra_read = tcs->info.outputs_read;
const uint32_t tcs_intra_patch_read = tcs->info.patch_outputs_read;
nir_foreach_variable_with_modes(out_var, tcs, nir_var_shader_out) {
unsigned loc = out_var->data.location;
switch (loc) {
case VARYING_SLOT_POS:
case VARYING_SLOT_PSIZ:
case VARYING_SLOT_PRIMITIVE_ID:
case VARYING_SLOT_LAYER:
case VARYING_SLOT_VIEWPORT:
continue;
}
bool intra_accessed;
if (out_var->data.patch && loc >= VARYING_SLOT_PATCH0)
intra_accessed = tcs_intra_patch_read & BITFIELD_BIT(loc - VARYING_SLOT_PATCH0);
else
intra_accessed = tcs_intra_read & BITFIELD64_BIT(loc);
if (!intra_accessed)
continue;
if (nir_find_variable_with_location(tes, nir_var_shader_in, loc))
continue;
nir_variable *v = nir_variable_create(tes, nir_var_shader_in, out_var->type,
out_var->name ? out_var->name : "tes_pad");
v->data = out_var->data;
v->data.mode = nir_var_shader_in;
v->data.read_only = 1;
added = true;
}
/* Tess levels come from the fixed-function tessellator, not TCS code. */
const struct {
gl_varying_slot loc;
const char *name;
uint8_t arr_len;
} tess_levels[] = {
{ VARYING_SLOT_TESS_LEVEL_OUTER, "gl_TessLevelOuter", 4 },
{ VARYING_SLOT_TESS_LEVEL_INNER, "gl_TessLevelInner", 2 },
};
for (unsigned k = 0; k < ARRAY_SIZE(tess_levels); ++k) {
if (nir_find_variable_with_location(tes, nir_var_shader_in, tess_levels[k].loc))
continue;
const struct glsl_type *t = glsl_array_type(glsl_float_type(), tess_levels[k].arr_len, 0);
nir_variable *v = nir_variable_create(tes, nir_var_shader_in, t, tess_levels[k].name);
v->data.location = tess_levels[k].loc;
v->data.patch = true;
v->data.compact = true;
v->data.always_active_io = 1;
added = true;
}
return added;
}

View file

@ -92,6 +92,10 @@ bool dxil_nir_kill_unused_outputs(nir_shader *shader, uint64_t next_stage_read_m
void dxil_nir_propagate_interp_to_outputs(nir_shader *prev_stage_nir,
const nir_shader *fs_nir);
/* Synthesize placeholder TES inputs mirroring TCS outputs the consumer
* doesn't already declare, so DXIL signature shapes match. */
bool dxil_nir_pad_tes_input_signature(nir_shader *tes, const nir_shader *tcs);
#ifdef __cplusplus
}
#endif

View file

@ -48,7 +48,8 @@ remove_hs_intrinsics(nir_function_impl *impl)
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_load_output) {
if (intr->intrinsic == nir_intrinsic_load_output ||
intr->intrinsic == nir_intrinsic_load_per_vertex_output) {
nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
nir_def_rewrite_uses(&intr->def, nir_undef(&b, intr->def.num_components, intr->def.bit_size));
} else if (intr->intrinsic != nir_intrinsic_store_output &&

View file

@ -811,25 +811,19 @@ dxil_spirv_nir_link(nir_shader *nir, nir_shader *prev_stage_nir,
NIR_PASS(_, nir, dxil_nir_kill_undefined_varyings, prev_stage_nir->info.outputs_written, prev_stage_nir->info.patch_outputs_written, NULL);
NIR_PASS(_, prev_stage_nir, dxil_nir_kill_unused_outputs, nir->info.inputs_read, nir->info.patch_inputs_read, NULL);
dxil_reassign_driver_locations(nir, nir_var_shader_in, prev_stage_nir->info.outputs_written, NULL);
dxil_reassign_driver_locations(prev_stage_nir, nir_var_shader_out, nir->info.inputs_read, NULL);
/* Tess metadata flows TCS->TES so the placeholder helper below knows
* the per-vertex input array length, and TCS gets the domain config. */
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
assert(prev_stage_nir->info.stage == MESA_SHADER_TESS_CTRL);
nir->info.tess.tcs_vertices_out = prev_stage_nir->info.tess.tcs_vertices_out;
prev_stage_nir->info.tess = nir->info.tess;
for (uint32_t i = 0; i < 2; ++i) {
unsigned loc = i == 0 ? VARYING_SLOT_TESS_LEVEL_OUTER : VARYING_SLOT_TESS_LEVEL_INNER;
nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_in, loc);
if (!var) {
var = nir_variable_create(nir, nir_var_shader_in, glsl_array_type(glsl_float_type(), i == 0 ? 4 : 2, 0), i == 0 ? "outer" : "inner");
var->data.location = loc;
var->data.patch = true;
var->data.compact = true;
}
}
/* Pad TES input signature so HS<->DS match for D3D12 pipeline validation. */
dxil_nir_pad_tes_input_signature(nir, prev_stage_nir);
}
dxil_reassign_driver_locations(nir, nir_var_shader_in, prev_stage_nir->info.outputs_written, NULL);
dxil_reassign_driver_locations(prev_stage_nir, nir_var_shader_out, nir->info.inputs_read, NULL);
}
glsl_type_singleton_decref();