diff --git a/src/compiler/nir/nir_opt_varyings.c b/src/compiler/nir/nir_opt_varyings.c index 0a190355c4b..eac7ce27677 100644 --- a/src/compiler/nir/nir_opt_varyings.c +++ b/src/compiler/nir/nir_opt_varyings.c @@ -389,6 +389,10 @@ * * 32-bit transform feedback only * * 16-bit transform feedback only * + * When the driver/hw can't mix different interpolation qualifiers in the + * same vec4, the FP32 and FP16 interpolated groups are each split into 6 + * groups (persp/linear x pixel/centroid/sample) and the color group into 3. + * * Then, all scalar varyings are relocated into new slots, starting from * VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit * increments. Rules: @@ -509,27 +513,79 @@ enum fs_vec4_type { FS_VEC4_TYPE_NONE = 0, FS_VEC4_TYPE_FLAT, - FS_VEC4_TYPE_INTERP_FP32, - FS_VEC4_TYPE_INTERP_FP16, - FS_VEC4_TYPE_INTERP_COLOR, FS_VEC4_TYPE_INTERP_EXPLICIT, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT, FS_VEC4_TYPE_PER_PRIMITIVE, + /* When nir_io_has_flexible_input_interpolation_except_flat is set: */ + FS_VEC4_TYPE_INTERP_FP32, + FS_VEC4_TYPE_INTERP_FP16, + FS_VEC4_TYPE_INTERP_COLOR, /* only for glShadeModel, i.e. INTERP_MODE_NONE */ + /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */ + FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, + FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID, + FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE, + FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL, + FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID, + FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE, + FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL, + FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID, + FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE, + FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL, + FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID, + FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE, + FS_VEC4_TYPE_INTERP_COLOR_PIXEL, /* only for glShadeModel, i.e. 
INTERP_MODE_NONE */ + FS_VEC4_TYPE_INTERP_COLOR_CENTROID, /* same */ + FS_VEC4_TYPE_INTERP_COLOR_SAMPLE, /* same */ +}; + +enum { /* Row indices of INTERP_QUAL_BITSET, listed in the same order as the per-qualifier FS_VEC4_TYPE_INTERP_FP32_* and FS_VEC4_TYPE_INTERP_FP16_* values. */ + PERSP_PIXEL, + PERSP_CENTROID, + PERSP_SAMPLE, + LINEAR_PIXEL, + LINEAR_CENTROID, + LINEAR_SAMPLE, + NUM_INTERP_QUALIFIERS, +}; + +enum { /* Row indices of COLOR_QUAL_BITSET, listed in the same order as FS_VEC4_TYPE_INTERP_COLOR_{PIXEL,CENTROID,SAMPLE}. */ + COLOR_PIXEL, + COLOR_CENTROID, + COLOR_SAMPLE, + NUM_COLOR_QUALIFIERS, }; #if PRINT_RELOCATE_SLOT static const char *fs_vec4_type_strings[] = { "NONE", "FLAT", - "INTERP_FP32", - "INTERP_FP16", - "INTERP_COLOR", "INTERP_EXPLICIT", "INTERP_EXPLICIT_STRICT", "PER_PRIMITIVE", + "INTERP_FP32", + "INTERP_FP16", + "INTERP_COLOR", + "INTERP_FP32_PERSP_PIXEL", + "INTERP_FP32_PERSP_CENTROID", + "INTERP_FP32_PERSP_SAMPLE", + "INTERP_FP32_LINEAR_PIXEL", + "INTERP_FP32_LINEAR_CENTROID", + "INTERP_FP32_LINEAR_SAMPLE", + "INTERP_FP16_PERSP_PIXEL", + "INTERP_FP16_PERSP_CENTROID", + "INTERP_FP16_PERSP_SAMPLE", + "INTERP_FP16_LINEAR_PIXEL", + "INTERP_FP16_LINEAR_CENTROID", + "INTERP_FP16_LINEAR_SAMPLE", + "INTERP_COLOR_PIXEL", + "INTERP_COLOR_CENTROID", + "INTERP_COLOR_SAMPLE", }; #endif // PRINT_RELOCATE_SLOT +typedef BITSET_WORD INTERP_QUAL_BITSET[NUM_INTERP_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)]; +typedef BITSET_WORD COLOR_QUAL_BITSET[NUM_COLOR_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)]; + static unsigned get_scalar_16bit_slot(nir_io_semantics sem, unsigned component) { @@ -598,6 +654,7 @@ struct linkage_info { bool can_move_uniforms; bool can_move_ubos; bool can_mix_convergent_flat_with_interpolated; + bool has_flexible_interp; /* FS consumer sets nir_io_has_flexible_input_interpolation_except_flat */ bool always_interpolate_convergent_fs_inputs; gl_shader_stage producer_stage; @@ -667,6 +724,13 @@ struct linkage_info { /* Color interpolation unqualified (follows the flat-shade state). */ BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS); + /* A separate bitmask for each qualifier when + * nir_io_has_flexible_input_interpolation_except_flat is not set. 
+ */ + INTERP_QUAL_BITSET interp_fp32_qual_masks; + INTERP_QUAL_BITSET interp_fp16_qual_masks; + COLOR_QUAL_BITSET color32_qual_masks; + /* Mask of output components that have only one store instruction, or if * they have multiple store instructions, all those instructions store * the same value. If the output has multiple vertices, all vertices store @@ -701,6 +765,12 @@ struct linkage_info { #define BITSET_TEST32(m, b) \ (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1)) +#define BITSET3_TEST_ANY(bitsets, b) (BITSET_TEST((bitsets)[0], (b)) || \ + BITSET_TEST((bitsets)[1], (b)) || \ + BITSET_TEST((bitsets)[2], (b))) +#define BITSET6_TEST_ANY(bitsets, b) (BITSET3_TEST_ANY((bitsets), (b)) || \ + BITSET3_TEST_ANY(&(bitsets)[3], (b))) + static void print_linkage(struct linkage_info *linkage) { @@ -725,6 +795,10 @@ print_linkage(struct linkage_info *linkage) !BITSET_TEST(linkage->no_varying16_mask, i) && !BITSET_TEST(linkage->interp_fp32_mask, i) && !BITSET_TEST(linkage->interp_fp16_mask, i) && + !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) && + !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i) && + !BITSET_TEST(linkage->color32_mask, i) && + !BITSET3_TEST_ANY(linkage->color32_qual_masks, i) && !BITSET_TEST(linkage->flat32_mask, i) && !BITSET_TEST(linkage->flat16_mask, i) && !BITSET_TEST(linkage->interp_explicit32_mask, i) && @@ -738,7 +812,7 @@ print_linkage(struct linkage_info *linkage) !BITSET_TEST(linkage->output_equal_mask, i)) continue; - printf(" %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + printf(" %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", gl_varying_slot_name_for_stage(vec4_slot(i), linkage->producer_stage) + 13, "xyzw"[(i / 2) % 4], @@ -753,7 +827,23 @@ print_linkage(struct linkage_info *linkage) BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "", BITSET_TEST(linkage->no_varying16_mask, i) ? 
" no_varying16" : "", BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "", + BITSET_TEST(linkage->interp_fp32_qual_masks[0], i) ? " interp_fp32_persp_pixel" : "", + BITSET_TEST(linkage->interp_fp32_qual_masks[1], i) ? " interp_fp32_persp_centroid" : "", + BITSET_TEST(linkage->interp_fp32_qual_masks[2], i) ? " interp_fp32_persp_sample" : "", + BITSET_TEST(linkage->interp_fp32_qual_masks[3], i) ? " interp_fp32_linear_pixel" : "", + BITSET_TEST(linkage->interp_fp32_qual_masks[4], i) ? " interp_fp32_linear_centroid" : "", + BITSET_TEST(linkage->interp_fp32_qual_masks[5], i) ? " interp_fp32_linear_sample" : "", BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "", + BITSET_TEST(linkage->interp_fp16_qual_masks[0], i) ? " interp_fp16_persp_pixel" : "", + BITSET_TEST(linkage->interp_fp16_qual_masks[1], i) ? " interp_fp16_persp_centroid" : "", + BITSET_TEST(linkage->interp_fp16_qual_masks[2], i) ? " interp_fp16_persp_sample" : "", + BITSET_TEST(linkage->interp_fp16_qual_masks[3], i) ? " interp_fp16_linear_pixel" : "", + BITSET_TEST(linkage->interp_fp16_qual_masks[4], i) ? " interp_fp16_linear_centroid" : "", + BITSET_TEST(linkage->interp_fp16_qual_masks[5], i) ? " interp_fp16_linear_sample" : "", + BITSET_TEST(linkage->color32_mask, i) ? " color32" : "", + BITSET_TEST(linkage->color32_qual_masks[0], i) ? " color32_pixel" : "", + BITSET_TEST(linkage->color32_qual_masks[1], i) ? " color32_centroid" : "", + BITSET_TEST(linkage->color32_qual_masks[2], i) ? " color32_sample" : "", BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "", BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "", BITSET_TEST(linkage->interp_explicit32_mask, i) ? 
" interp_explicit32" : "", @@ -780,6 +870,10 @@ slot_disable_optimizations_and_compaction(struct linkage_info *linkage, BITSET_CLEAR(linkage->convergent16_mask, i); BITSET_CLEAR(linkage->interp_fp32_mask, i); BITSET_CLEAR(linkage->interp_fp16_mask, i); + for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) { + BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i); + BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i); + } BITSET_CLEAR(linkage->flat32_mask, i); BITSET_CLEAR(linkage->flat16_mask, i); BITSET_CLEAR(linkage->interp_explicit32_mask, i); @@ -793,6 +887,8 @@ slot_disable_optimizations_and_compaction(struct linkage_info *linkage, BITSET_CLEAR(linkage->no_varying32_mask, i); BITSET_CLEAR(linkage->no_varying16_mask, i); BITSET_CLEAR(linkage->color32_mask, i); + for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++) + BITSET_CLEAR(linkage->color32_qual_masks[b], i); } static void @@ -873,6 +969,45 @@ color_uses_shade_model(struct linkage_info *linkage, unsigned i) return false; } +static enum fs_vec4_type +get_interp_vec4_type(struct linkage_info *linkage, unsigned slot, + nir_intrinsic_instr *load) +{ /* Returns the per-qualifier fs_vec4_type of a load_interpolated_input. The base is COLOR_PIXEL when color_uses_shade_model() is true for the slot, otherwise FP32_PERSP_PIXEL or FP16_PERSP_PIXEL by bit size; 3 is added for INTERP_MODE_NOPERSPECTIVE and 0/1/2 for a pixel/centroid/sample barycentric. Asserts that has_flexible_interp is false. */ + assert(!linkage->has_flexible_interp); + assert(load->intrinsic == nir_intrinsic_load_interpolated_input); + + nir_intrinsic_instr *baryc = + nir_instr_as_intrinsic(load->src[0].ssa->parent_instr); + enum fs_vec4_type base; + + if (color_uses_shade_model(linkage, slot)) + base = FS_VEC4_TYPE_INTERP_COLOR_PIXEL; + else if (load->def.bit_size == 32) + base = FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL; + else if (load->def.bit_size == 16) + base = FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL; + else + unreachable("invalid load_interpolated_input type"); + + bool linear = nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NOPERSPECTIVE; + + if (linear) + base += 3; /* Skip the three PERSP entries to reach LINEAR_PIXEL. NOTE(review): the COLOR types have no LINEAR entries, so this presumably relies on shade-model colors never being noperspective - TODO confirm. */ + + switch (baryc->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + case nir_intrinsic_load_barycentric_at_sample: + return base; /* at_offset and at_sample are grouped with pixel */ + case 
nir_intrinsic_load_barycentric_centroid: + return base + 1; /* the CENTROID entry */ + case nir_intrinsic_load_barycentric_sample: + return base + 2; /* the SAMPLE entry */ + default: + unreachable("unexpected barycentric intrinsic"); + } +} + static bool preserve_infs_nans(nir_shader *nir, unsigned bit_size) { @@ -1180,14 +1315,18 @@ gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_d fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT; break; case nir_intrinsic_load_interpolated_input: - if (color_uses_shade_model(linkage, slot)) - fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR; - else if (intr->def.bit_size == 32) - fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32; - else if (intr->def.bit_size == 16) - fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16; - else - unreachable("invalid load_interpolated_input type"); + if (linkage->has_flexible_interp) { + if (color_uses_shade_model(linkage, slot)) + fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR; + else if (intr->def.bit_size == 32) + fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32; + else if (intr->def.bit_size == 16) + fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16; + else + unreachable("invalid load_interpolated_input type"); + } else { + fs_vec4_type = get_interp_vec4_type(linkage, slot, intr); + } break; default: unreachable("unexpected input load intrinsic"); @@ -1215,52 +1354,107 @@ gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_d /* Record inputs that can be compacted. 
*/ if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) { - switch (intr->intrinsic) { - case nir_intrinsic_load_input: + unsigned i; + assert(intr->def.bit_size == 32 || intr->def.bit_size == 16); + + switch (fs_vec4_type) { + case FS_VEC4_TYPE_FLAT: if (intr->def.bit_size == 32) BITSET_SET(linkage->flat32_mask, slot); - else if (intr->def.bit_size == 16) - BITSET_SET(linkage->flat16_mask, slot); else - unreachable("invalid load_input type"); + BITSET_SET(linkage->flat16_mask, slot); break; - case nir_intrinsic_load_per_primitive_input: + case FS_VEC4_TYPE_INTERP_EXPLICIT: + if (intr->def.bit_size == 32) + BITSET_SET(linkage->interp_explicit32_mask, slot); + else + BITSET_SET(linkage->interp_explicit16_mask, slot); + break; + case FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT: + if (intr->def.bit_size == 32) + BITSET_SET(linkage->interp_explicit_strict32_mask, slot); + else + BITSET_SET(linkage->interp_explicit_strict16_mask, slot); + break; + case FS_VEC4_TYPE_PER_PRIMITIVE: if (intr->def.bit_size == 32) BITSET_SET(linkage->per_primitive32_mask, slot); - else if (intr->def.bit_size == 16) + else BITSET_SET(linkage->per_primitive16_mask, slot); - else - unreachable("invalid load_input type"); break; - case nir_intrinsic_load_input_vertex: - if (sem.interp_explicit_strict) { - if (intr->def.bit_size == 32) - BITSET_SET(linkage->interp_explicit_strict32_mask, slot); - else if (intr->def.bit_size == 16) - BITSET_SET(linkage->interp_explicit_strict16_mask, slot); - else - unreachable("invalid load_input_vertex type"); + + case FS_VEC4_TYPE_INTERP_FP32: + BITSET_SET(linkage->interp_fp32_mask, slot); + break; + case FS_VEC4_TYPE_INTERP_FP16: + BITSET_SET(linkage->interp_fp16_mask, slot); + break; + case FS_VEC4_TYPE_INTERP_COLOR: + BITSET_SET(linkage->color32_mask, slot); + break; + + case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL: + case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID: + case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE: + case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL: + case 
FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID: + case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE: + i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL; + BITSET_SET(linkage->interp_fp32_qual_masks[i], slot); + break; + + case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL: + case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID: + case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE: + case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL: + case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID: + case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE: + i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL; + BITSET_SET(linkage->interp_fp16_qual_masks[i], slot); + break; + + case FS_VEC4_TYPE_INTERP_COLOR_PIXEL: + case FS_VEC4_TYPE_INTERP_COLOR_CENTROID: + case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE: + i = fs_vec4_type - FS_VEC4_TYPE_INTERP_COLOR_PIXEL; + BITSET_SET(linkage->color32_qual_masks[i], slot); + break; + + case FS_VEC4_TYPE_NONE: + unreachable("unexpected fs_vec4_type"); + } + + if (!linkage->has_flexible_interp && + intr->intrinsic == nir_intrinsic_load_interpolated_input) { + /* interpolateAtCentroid can occur simultaneously with any other + * qualifier. If centroid is flagged with any other qualifier, + * unflag centroid. Even though we track such outputs as the other + * qualifier, the load_barycentric_centroid intrinsic must be + * preserved by all optimizations. The only case when it's not + * preserved is when the input is convergent, in which case + * all qualifiers have the same behavior and we opportunistically + * change it during compaction. 
+ */ + if (color_uses_shade_model(linkage, slot)) { + if (BITSET_TEST(linkage->color32_qual_masks[COLOR_CENTROID], slot) && + (BITSET_TEST(linkage->color32_qual_masks[COLOR_PIXEL], slot) || + BITSET_TEST(linkage->color32_qual_masks[COLOR_SAMPLE], slot))) + BITSET_CLEAR(linkage->color32_qual_masks[COLOR_CENTROID], slot); } else { - if (intr->def.bit_size == 32) - BITSET_SET(linkage->interp_explicit32_mask, slot); - else if (intr->def.bit_size == 16) - BITSET_SET(linkage->interp_explicit16_mask, slot); - else - unreachable("invalid load_input_vertex type"); + INTERP_QUAL_BITSET *bitsets = + intr->def.bit_size == 32 ? &linkage->interp_fp32_qual_masks : + &linkage->interp_fp16_qual_masks; + + if (BITSET_TEST((*bitsets)[PERSP_CENTROID], slot) && + (BITSET_TEST((*bitsets)[PERSP_PIXEL], slot) || + BITSET_TEST((*bitsets)[PERSP_SAMPLE], slot))) + BITSET_CLEAR((*bitsets)[PERSP_CENTROID], slot); + + if (BITSET_TEST((*bitsets)[LINEAR_CENTROID], slot) && + (BITSET_TEST((*bitsets)[LINEAR_PIXEL], slot) || + BITSET_TEST((*bitsets)[LINEAR_SAMPLE], slot))) + BITSET_CLEAR((*bitsets)[LINEAR_CENTROID], slot); } - break; - case nir_intrinsic_load_interpolated_input: - if (color_uses_shade_model(linkage, slot)) - BITSET_SET(linkage->color32_mask, slot); - else if (intr->def.bit_size == 32) - BITSET_SET(linkage->interp_fp32_mask, slot); - else if (intr->def.bit_size == 16) - BITSET_SET(linkage->interp_fp16_mask, slot); - else - unreachable("invalid load_interpolated_input type"); - break; - default: - unreachable("unexpected input load intrinsic"); } } else { if (intr->def.bit_size == 32) @@ -1558,8 +1752,10 @@ tidy_up_convergent_varyings(struct linkage_info *linkage) */ BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) { if (!BITSET_TEST(linkage->interp_fp32_mask, i) && + !BITSET_TEST(linkage->color32_mask, i) && !BITSET_TEST(linkage->flat32_mask, i) && - !BITSET_TEST(linkage->color32_mask, i)) { + !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) && + 
!BITSET3_TEST_ANY(linkage->color32_qual_masks, i)) { /* Clear the flag - not used by FS. */ BITSET_CLEAR(linkage->convergent32_mask, i); } else if ((!linkage->can_mix_convergent_flat_with_interpolated && @@ -1571,13 +1767,19 @@ tidy_up_convergent_varyings(struct linkage_info *linkage) } else { /* Keep it convergent. */ BITSET_CLEAR(linkage->interp_fp32_mask, i); + for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) + BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i); BITSET_CLEAR(linkage->color32_mask, i); + for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++) + BITSET_CLEAR(linkage->color32_qual_masks[b], i); BITSET_CLEAR(linkage->flat32_mask, i); } } + BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) { if (!BITSET_TEST(linkage->interp_fp16_mask, i) && - !BITSET_TEST(linkage->flat16_mask, i)) { + !BITSET_TEST(linkage->flat16_mask, i) && + !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i)) { /* Clear the flag - not used by FS. */ BITSET_CLEAR(linkage->convergent16_mask, i); } else if ((!linkage->can_mix_convergent_flat_with_interpolated && @@ -1589,6 +1791,8 @@ tidy_up_convergent_varyings(struct linkage_info *linkage) } else { /* Keep it convergent. 
*/ BITSET_CLEAR(linkage->interp_fp16_mask, i); + for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) + BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i); BITSET_CLEAR(linkage->flat16_mask, i); } } @@ -2293,7 +2497,7 @@ enum var_qualifier { QUAL_VAR_INTERP_ANY, QUAL_COLOR_INTERP_ANY, QUAL_COLOR_SHADEMODEL_ANY, - /* When nir_io_has_flexible_input_interpolation_except_flat is unset: */ + /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */ QUAL_VAR_PERSP_PIXEL, QUAL_VAR_PERSP_CENTROID, QUAL_VAR_PERSP_SAMPLE, @@ -2342,8 +2546,7 @@ get_input_qualifier(struct linkage_info *linkage, unsigned i) nir_intrinsic_instr *baryc = nir_instr_as_intrinsic(load->src[0].ssa->parent_instr); - if (linkage->consumer_builder.shader->options->io_options & - nir_io_has_flexible_input_interpolation_except_flat) { + if (linkage->has_flexible_interp) { if (is_color) { return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ? QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY; @@ -2352,6 +2555,25 @@ get_input_qualifier(struct linkage_info *linkage, unsigned i) } } + /* If interpolateAt{Centroid,Offset,Sample} is used, see if there is + * another load that doesn't use those, so that we get the real qualifier. + */ + if (baryc->intrinsic == nir_intrinsic_load_barycentric_centroid || + baryc->intrinsic == nir_intrinsic_load_barycentric_at_offset || + baryc->intrinsic == nir_intrinsic_load_barycentric_at_sample) { + list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) { + nir_intrinsic_instr *bar = + nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr); + + if (bar->intrinsic != nir_intrinsic_load_barycentric_centroid && + bar->intrinsic != nir_intrinsic_load_barycentric_at_offset && + bar->intrinsic != nir_intrinsic_load_barycentric_at_sample) { + baryc = bar; + break; + } + } + } + /* Get the exact interpolation qualifier. 
*/ unsigned pixel_location; enum var_qualifier qual; @@ -3194,9 +3416,14 @@ try_move_postdominator(struct linkage_info *linkage, if (alu_interp == FLAG_INTERP_CONVERGENT) { mask = new_bit_size == 16 ? linkage->convergent16_mask : linkage->convergent32_mask; - } else { + } else if (linkage->has_flexible_interp) { mask = new_bit_size == 16 ? linkage->interp_fp16_mask : linkage->interp_fp32_mask; + } else { + /* The index of the qualifier is encoded in alu_interp, so extract it. */ + unsigned i = (alu_interp - FLAG_INTERP_PERSP_PIXEL) >> 5; + mask = new_bit_size == 16 ? linkage->interp_fp16_qual_masks[i] + : linkage->interp_fp32_qual_masks[i]; } } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL && alu_interp > FLAG_INTERP_FLAT) { @@ -3275,7 +3502,7 @@ try_move_postdominator(struct linkage_info *linkage, assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index)); /* Re-set the category of the new scalar input. This will cause - * the compaction to treat it as a different type, so that it will move it + * the compaction to treat it as a different type, so that it will be moved * into the vec4 that has compatible interpolation qualifiers. 
* * This shouldn't be done if any of the interp masks are not set, which @@ -3283,12 +3510,18 @@ try_move_postdominator(struct linkage_info *linkage, */ if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) || BITSET_TEST(linkage->interp_fp16_mask, slot_index) || + BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, slot_index) || + BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, slot_index) || BITSET_TEST(linkage->flat32_mask, slot_index) || BITSET_TEST(linkage->flat16_mask, slot_index) || BITSET_TEST(linkage->convergent32_mask, slot_index) || BITSET_TEST(linkage->convergent16_mask, slot_index)) { BITSET_CLEAR(linkage->interp_fp32_mask, slot_index); + for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) + BITSET_CLEAR(linkage->interp_fp32_qual_masks[i], slot_index); BITSET_CLEAR(linkage->interp_fp16_mask, slot_index); + for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) + BITSET_CLEAR(linkage->interp_fp16_qual_masks[i], slot_index); BITSET_CLEAR(linkage->flat16_mask, slot_index); BITSET_CLEAR(linkage->flat32_mask, slot_index); BITSET_CLEAR(linkage->convergent16_mask, slot_index); @@ -3779,14 +4012,16 @@ relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot, intr->intrinsic != nir_intrinsic_load_per_primitive_input); } + if (intr->intrinsic != nir_intrinsic_load_interpolated_input) + continue; + /* This path is used when promoting convergent interpolated * inputs to flat. Replace load_interpolated_input with load_input. */ - if (intr->intrinsic == nir_intrinsic_load_interpolated_input && - (fs_vec4_type == FS_VEC4_TYPE_FLAT || - /* Promote all convergent loads to flat if the driver supports it. */ - (convergent && - linkage->can_mix_convergent_flat_with_interpolated))) { + if (fs_vec4_type == FS_VEC4_TYPE_FLAT || + /* Promote all convergent loads to flat if the driver supports it. 
*/ + (convergent && + linkage->can_mix_convergent_flat_with_interpolated)) { assert(instruction_lists[i] == &slot->consumer.loads); nir_builder *b = &linkage->consumer_builder; @@ -3820,6 +4055,76 @@ relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot, nir_src_rewrite(&store->src[0], repl); } } + continue; + } + + /* We are packing convergent inputs with any other interpolated + * inputs in the same vec4, but the interpolation qualifier might not + * be the same between the two. Set the qualifier of the convergent + * input to match the input it's being packed with. + */ + if (!linkage->has_flexible_interp && convergent) { + enum fs_vec4_type current_vec4_type = + get_interp_vec4_type(linkage, i, intr); + + /* Make the interpolation qualifier match the slot where we are + * moving this input. + */ + if (current_vec4_type != fs_vec4_type) { + nir_builder *b = &linkage->consumer_builder; + nir_def *baryc; + + b->cursor = nir_before_instr(&intr->instr); + + switch (fs_vec4_type) { + case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL: + case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL: + baryc = nir_load_barycentric_pixel(b, 32, + .interp_mode = INTERP_MODE_SMOOTH); + break; + case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID: + case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID: + baryc = nir_load_barycentric_centroid(b, 32, + .interp_mode = INTERP_MODE_SMOOTH); + break; + case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE: + case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE: + baryc = nir_load_barycentric_sample(b, 32, + .interp_mode = INTERP_MODE_SMOOTH); + break; + case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL: + case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL: + baryc = nir_load_barycentric_pixel(b, 32, + .interp_mode = INTERP_MODE_NOPERSPECTIVE); + break; + case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID: + case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID: + baryc = nir_load_barycentric_centroid(b, 32, + .interp_mode = INTERP_MODE_NOPERSPECTIVE); + break; + case 
FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE: + case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE: + baryc = nir_load_barycentric_sample(b, 32, + .interp_mode = INTERP_MODE_NOPERSPECTIVE); + break; + case FS_VEC4_TYPE_INTERP_COLOR_PIXEL: + baryc = nir_load_barycentric_pixel(b, 32, + .interp_mode = INTERP_MODE_NONE); + break; + case FS_VEC4_TYPE_INTERP_COLOR_CENTROID: + baryc = nir_load_barycentric_centroid(b, 32, + .interp_mode = INTERP_MODE_NONE); + break; + case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE: + baryc = nir_load_barycentric_sample(b, 32, + .interp_mode = INTERP_MODE_NONE); + break; + default: + unreachable("invalid qualifier"); + } + + nir_src_rewrite(&intr->src[0], baryc); + } } } } @@ -3965,7 +4270,7 @@ fs_assign_slots(struct linkage_info *linkage, * \param flat_mask The list of flat slots to assign locations for. * \param convergent_mask The list of slots that have convergent output * stores. - * \param sized_interp_type One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}. + * \param sized_interp_type One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}*. * \param slot_size 1 for 16 bits, 2 for 32 bits * \param color_channel_rotate Assign color channels starting with this index, * e.g. 2 assigns channels in the zwxy order. @@ -4051,6 +4356,106 @@ fs_assign_slot_groups(struct linkage_info *linkage, color_channel_rotate, progress); } +/** + * Same as fs_assign_slot_groups, but don't mix different interpolation + * qualifiers in the same vec4: each qualifier mask in \p interp_masks (and in \p color_interp_masks, if non-NULL) gets its own vec4 type, counted up from \p sized_interp_type_base (resp. FS_VEC4_TYPE_INTERP_COLOR_PIXEL). 
+ */ +static void +fs_assign_slot_groups_separate_qual(struct linkage_info *linkage, + BITSET_WORD *assigned_mask, + uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS], + INTERP_QUAL_BITSET *interp_masks, + BITSET_WORD *flat_mask, + BITSET_WORD *convergent_mask, + COLOR_QUAL_BITSET *color_interp_masks, + enum fs_vec4_type sized_interp_type_base, + unsigned slot_size, + bool assign_colors, + unsigned color_channel_rotate, + nir_opt_varyings_progress *progress) +{ + unsigned unused_interp_slots[NUM_INTERP_QUALIFIERS] = {0}; + unsigned unused_color_slots[NUM_COLOR_QUALIFIERS] = {0}; + + /* Put interpolated slots first, one group of vec4s per qualifier, followed by the per-qualifier color groups if color masks were passed. */ + for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) { + unused_interp_slots[i] = + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + (*interp_masks)[i], sized_interp_type_base + i, + slot_size, NUM_SCALAR_SLOTS, false, assign_colors, + color_channel_rotate, progress); + } + + if (color_interp_masks) { + for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) { + unused_color_slots[i] = + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + (*color_interp_masks)[i], + FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i, + slot_size, NUM_SCALAR_SLOTS, false, assign_colors, + color_channel_rotate, progress); + } + } + + /* Put flat slots next. + * Note that only flat vec4 slots can have both 32-bit and 16-bit types + * packed in the same vec4. 32-bit flat inputs are packed first, followed + * by 16-bit flat inputs. + */ + unsigned unused_flat_slots = + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + flat_mask, FS_VEC4_TYPE_FLAT, + slot_size, NUM_SCALAR_SLOTS, false, assign_colors, + color_channel_rotate, progress); + + /* Take the inputs with convergent values and assign them as follows. + * Since they can be assigned as both interpolated and flat, we can + * choose. We prefer them to be flat, but if interpolated vec4s have + * unused components, try to fill those before starting a new flat vec4. 
+ * + * First, fill the unused components of flat (if any) with convergent + * inputs, unless convergent inputs must always be interpolated. + */ + if (!linkage->always_interpolate_convergent_fs_inputs && + unused_flat_slots) { + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + convergent_mask, FS_VEC4_TYPE_FLAT, + slot_size, unused_flat_slots, true, assign_colors, + color_channel_rotate, progress); + } + + /* Then fill the unused components of interpolated slots (if any) with + * convergent inputs, and after that the unused components of color slots. + */ + for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) { + if (unused_interp_slots[i]) { + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + convergent_mask, sized_interp_type_base + i, + slot_size, unused_interp_slots[i], true, + assign_colors, color_channel_rotate, progress); + } + } + + for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) { + if (unused_color_slots[i]) { + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + convergent_mask, FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i, + slot_size, unused_color_slots[i], true, assign_colors, + color_channel_rotate, progress); + } + } + + /* Then make the remaining convergent inputs flat, or LINEAR_PIXEL of the matching bit size if always_interpolate_convergent_fs_inputs is set. */ + fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type, + convergent_mask, + linkage->always_interpolate_convergent_fs_inputs ? + (slot_size == 2 ? 
FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL : + FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL) : + FS_VEC4_TYPE_FLAT, + slot_size, NUM_SCALAR_SLOTS, true, assign_colors, + color_channel_rotate, progress); +} + static void vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage, BITSET_WORD *input_mask, @@ -4120,16 +4525,34 @@ compact_varyings(struct linkage_info *linkage, BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS); BITSET_ZERO(assigned_mask); - fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, - linkage->interp_fp32_mask, linkage->flat32_mask, - linkage->convergent32_mask, NULL, - FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress); + if (linkage->has_flexible_interp) { + /* This codepath packs convergent varyings with both interpolated and + * flat, whichever has free space. + */ + fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, + linkage->interp_fp32_mask, linkage->flat32_mask, + linkage->convergent32_mask, NULL, + FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress); - /* Now do the same thing, but for 16-bit inputs. */ - fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, - linkage->interp_fp16_mask, linkage->flat16_mask, - linkage->convergent16_mask, NULL, - FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress); + /* Now do the same thing, but for 16-bit inputs. */ + fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, + linkage->interp_fp16_mask, linkage->flat16_mask, + linkage->convergent16_mask, NULL, + FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress); + } else { + /* Basically the same as above. 
*/ + fs_assign_slot_groups_separate_qual( + linkage, assigned_mask, assigned_fs_vec4_type, + &linkage->interp_fp32_qual_masks, linkage->flat32_mask, + linkage->convergent32_mask, NULL, + FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, false, 0, progress); + + fs_assign_slot_groups_separate_qual( + linkage, assigned_mask, assigned_fs_vec4_type, + &linkage->interp_fp16_qual_masks, linkage->flat16_mask, + linkage->convergent16_mask, NULL, + FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL, 1, false, 0, progress); + } /* Assign INTERP_MODE_EXPLICIT. Both FP32 and FP16 can occupy the same * slot because the vertex data is passed to FS as-is. @@ -4184,6 +4607,17 @@ compact_varyings(struct linkage_info *linkage, !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0, col0 + 15, 0); + for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) { + has_colors |= + !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_qual_masks[i], + col0, col0 + 15, 0); + } + for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) { + has_colors |= + !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_qual_masks[i], + col0, col0 + 15, 0); + } + if (has_colors) { unsigned color_channel_rotate = 0; @@ -4193,11 +4627,20 @@ compact_varyings(struct linkage_info *linkage, DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4; } - fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, - linkage->interp_fp32_mask, linkage->flat32_mask, - linkage->convergent32_mask, linkage->color32_mask, - FS_VEC4_TYPE_INTERP_FP32, 2, true, - color_channel_rotate, progress); + if (linkage->has_flexible_interp) { + fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type, + linkage->interp_fp32_mask, linkage->flat32_mask, + linkage->convergent32_mask, linkage->color32_mask, + FS_VEC4_TYPE_INTERP_FP32, 2, true, + color_channel_rotate, progress); + } else { + fs_assign_slot_groups_separate_qual( + linkage, assigned_mask, assigned_fs_vec4_type, + &linkage->interp_fp32_qual_masks, linkage->flat32_mask, + 
linkage->convergent32_mask, &linkage->color32_qual_masks, + FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, true, + color_channel_rotate, progress); + } /* Put transform-feedback-only outputs last. */ fs_assign_slots(linkage, assigned_mask, NULL, @@ -4274,6 +4717,10 @@ init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv, consumer->info.stage == MESA_SHADER_FRAGMENT && consumer->options->io_options & nir_io_mix_convergent_flat_with_interpolated, + .has_flexible_interp = + consumer->info.stage == MESA_SHADER_FRAGMENT && + consumer->options->io_options & + nir_io_has_flexible_input_interpolation_except_flat, .always_interpolate_convergent_fs_inputs = consumer->info.stage == MESA_SHADER_FRAGMENT && consumer->options->io_options &