From 3418525a82fe5cbd40ecb8533d216572f6eed8c5 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Tue, 17 Mar 2026 12:10:50 +0100
Subject: [PATCH] pan/bi: Lower VS outputs in NIR

Instead of translating store_output directly in the backend, lower
vertex shader output stores to explicit address computation in NIR.
Two new intrinsics, lea_attr_pan and lea_buf_pan, compute attribute
and buffer addresses, and load_idvs_output_buf_index_pan exposes the
IDVS output buffer index preloaded by the hardware on Valhall. The
stores themselves then become plain global stores.

Co-authored-by: Lorenzo Rossi
Reviewed-by: Lorenzo Rossi
Part-of:
---
 src/compiler/nir/nir_divergence_analysis.c    |   3 +
 src/compiler/nir/nir_intrinsics.py            |  11 +
 .../compiler/bifrost/bifrost_compile.c        | 275 +++++++-----
 src/panfrost/compiler/bifrost/compiler.h      |   2 -
 src/panfrost/compiler/pan_nir.h               |   4 +
 .../compiler/pan_nir_lower_varyings_io.c      | 158 ++++++++++
 6 files changed, 274 insertions(+), 179 deletions(-)

diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 8b591c73a0b..718c8160a0d 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -755,6 +755,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_input_attachment_target_pan:
    case nir_intrinsic_load_input_attachment_conv_pan:
    case nir_intrinsic_load_global_cvt_pan:
+   case nir_intrinsic_lea_attr_pan:
+   case nir_intrinsic_lea_buf_pan:
    case nir_intrinsic_atomic_counter_read:
    case nir_intrinsic_atomic_counter_read_deref:
    case nir_intrinsic_is_sparse_texels_resident:
@@ -1039,6 +1041,7 @@
    case nir_intrinsic_load_tile_res_pan:
    case nir_intrinsic_load_cumulative_coverage_pan:
    case nir_intrinsic_load_blend_input_pan:
+   case nir_intrinsic_load_idvs_output_buf_index_pan:
    case nir_intrinsic_atest_pan:
    case nir_intrinsic_zs_emit_pan:
    case nir_intrinsic_load_return_param_amd:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 76ed3fb2654..ae6410f990f 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1741,6 +1741,17 @@ store("global_cvt_pan", [1, 1], indices=[SRC_TYPE, ACCESS])
 # src[] = { value, address }
 store("global_psiz_pan", [1], indices=[WRITE_MASK, ACCESS])
 
+# Base index of the output buffer passed to IDVS shaders on Valhall.
+system_value("idvs_output_buf_index_pan", 1, bit_sizes=[32])
+
+# src[] = { handle, vertex_id, instance_id }; dest = { addr.lo, addr.hi, conversion descriptor }
+intrinsic("lea_attr_pan", [1, 1, 1], dest_comp=3, bit_sizes=[32],
+          indices=[SRC_TYPE], flags=[CAN_ELIMINATE, CAN_REORDER])
+
+# src[] = { handle, index }; dest = { addr.lo, addr.hi }
+intrinsic("lea_buf_pan", [1, 1], dest_comp=2, bit_sizes=[32],
+          flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Load the address and potentially the conversion descriptor for a texel buffer index.
 # The 64 bit address is always in the first two channels, while the 32 bit
 # conversion descriptor is in the last channel only for Bifrost.
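
[Editor's note, not part of the patch: a minimal sketch of how the two new
address intrinsics are meant to compose, mirroring pan_nir_lower_vs_outputs
below. `data`, `attr_index`, and `res` are placeholder values; all builder
calls are taken from this series.]

   /* Attribute-descriptor path: lea_attr_pan yields { addr.lo, addr.hi, cvt }. */
   nir_def *addr_cvt =
      nir_lea_attr_pan(b, nir_imm_int(b, attr_index),
                       nir_load_raw_vertex_id_pan(b), nir_load_instance_id(b),
                       .src_type = nir_type_float32);
   nir_def *addr = nir_pack_64_2x32(b, nir_trim_vector(b, addr_cvt, 2));
   nir_store_global_cvt_pan(b, data, addr, nir_channel(b, addr_cvt, 2),
                            .src_type = nir_type_float32);

   /* Valhall IDVS buffer path: lea_buf_pan yields { addr.lo, addr.hi }. */
   nir_def *index = nir_load_idvs_output_buf_index_pan(b);
   nir_def *buf_addr =
      nir_pack_64_2x32(b, nir_lea_buf_pan(b, nir_imm_int(b, res), index));
   nir_store_global(b, data, buf_addr, .access = ACCESS_ESTREAM_PAN);
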
diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 2a67a5b4dc2..033ce69e704 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -580,6 +580,80 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
    bi_copy_component(b, instr, dest);
 }
 
+static void
+bi_emit_lea_attr(bi_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_lea_attr_pan);
+   const nir_alu_type src_fmt = nir_intrinsic_src_type(intr);
+
+   if (b->shader->arch < 9 && b->shader->idvs == BI_IDVS_POSITION) {
+      /* Bifrost position shaders have a fast path */
+      assert(nir_src_as_uint(intr->src[0]) == 0);
+      assert(src_fmt == nir_type_float32);
+      unsigned regfmt = BI_REGISTER_FORMAT_F32;
+      unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
+      unsigned snap4 = 0x5E;
+      uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
+      bi_collect_v3i32_to(b, bi_def_index(&intr->def),
+                          bi_preload(b, 58), bi_preload(b, 59),
+                          bi_imm_u32(format));
+      return;
+   }
+
+   bi_index vertex_id = bi_src_index(&intr->src[1]);
+   bi_index instance_id = bi_src_index(&intr->src[2]);
+   enum bi_register_format regfmt = bi_reg_fmt_for_nir(src_fmt);
+
+   /* Check if the index can fit in LEA_ATTR_IMM */
+   uint32_t imm_res = 0;
+   bool use_imm_form = false;
+   if (nir_src_is_const(intr->src[0])) {
+      imm_res = nir_src_as_uint(intr->src[0]);
+      use_imm_form = pan_res_handle_get_index(imm_res) < 0x10;
+   }
+
+   bi_index address = bi_def_index(&intr->def);
+   if (use_imm_form) {
+      bi_instr *I = bi_lea_attr_imm_to(b, address, vertex_id, instance_id,
+                                       regfmt,
+                                       pan_res_handle_get_index(imm_res));
+      if (b->shader->arch >= 9)
+         I->table = va_res_fold_table_idx(pan_res_handle_get_table(imm_res));
+   } else {
+      bi_index res = bi_src_index(&intr->src[0]);
+      bi_lea_attr_to(b, address, vertex_id, instance_id, res, regfmt);
+   }
+   bi_split_def(b, &intr->def);
+}
+
+static void
+bi_emit_lea_buf(bi_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_lea_buf_pan);
+   assert(b->shader->arch >= 9);
+   bi_index index = bi_src_index(&intr->src[1]);
+
+   uint32_t imm_res = 0;
+   bool use_imm_form = false;
+   if (nir_src_is_const(intr->src[0])) {
+      imm_res = nir_src_as_uint(intr->src[0]);
+      uint32_t table_index = pan_res_handle_get_table(imm_res);
+      uint32_t res_index = pan_res_handle_get_index(imm_res);
+      use_imm_form = va_is_valid_const_table(table_index) && res_index < 256;
+   }
+
+   bi_index address = bi_def_index(&intr->def);
+   if (use_imm_form) {
+      bi_instr *I = bi_lea_buf_imm_to(b, address, index);
+      I->table = va_res_fold_table_idx(pan_res_handle_get_table(imm_res));
+      I->index = pan_res_handle_get_index(imm_res);
+   } else {
+      bi_index res = bi_src_index(&intr->src[0]);
+      bi_lea_buf_to(b, address, index, res);
+   }
+   bi_split_def(b, &intr->def);
+}
+
 static void
 bi_emit_load_var(bi_builder *b, nir_intrinsic_instr *intr)
 {
@@ -1046,169 +1120,6 @@ bifrost_nir_lower_vs_atomics(nir_shader *shader)
                                         nir_metadata_none, NULL);
 }
 
-static void
-bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
-{
-   /* In principle we can do better for 16-bit. At the moment we require
-    * mediump varyings to be 32-bit to permit the use of .auto, in order to
-    * force .u32 for flat varyings, to handle internal TGSI shaders that set
-    * flat in the VS but smooth in the FS.
-    *
-    * Explicit 16-bit types are unaffected, and written as 16-bit.
-    */
-
-   ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
-   ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
-   nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
-
-   const struct pan_varying_slot *slot =
-      pan_varying_layout_find_slot(b->shader->varying_layout, sem.location);
-   ASSERTED unsigned base = nir_intrinsic_base(instr);
-   assert(slot == &b->shader->varying_layout->slots[base]);
-
-   unsigned imm_index = 0;
-   bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
-
-   /* Only look at the total components needed. In effect, we fill in all
-    * the intermediate "holes" in the write mask, since we can't mask off
-    * stores. Since nir_lower_io_vars_to_temporaries ensures each varying is
-    * written at most once, anything that's masked out is undefined, so it
-    * doesn't matter what we write there. So we may as well do the
-    * simplest thing possible. */
-   unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
-   assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
-
-   bi_index data = bi_src_index(&instr->src[0]);
-
-   /* To keep the vector dimensions consistent, we need to drop some
-    * components. This should be coalesced.
-    *
-    * TODO: This is ugly and maybe inefficient. Would we rather
-    * introduce a TRIM.i32 pseudoinstruction?
-    */
-   if (nr < nir_intrinsic_src_components(instr, 0)) {
-      bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
-      unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
-      unsigned src_comps =
-         DIV_ROUND_UP(nir_intrinsic_src_components(instr, 0), comps_per_reg);
-      unsigned dst_comps = DIV_ROUND_UP(nr, comps_per_reg);
-
-      bi_emit_split_i32(b, chans, data, src_comps);
-
-      bi_index tmp = bi_temp(b->shader);
-      bi_instr *collect = bi_collect_i32_to(b, tmp, dst_comps);
-
-      bi_foreach_src(collect, w)
-         collect->src[w] = chans[w];
-
-      data = tmp;
-   }
-
-   bi_index a[4] = {bi_null()};
-
-   if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
-      /* Bifrost position shaders have a fast path */
-      assert(T == nir_type_float32);
-      unsigned regfmt = BI_REGISTER_FORMAT_F32;
-      unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
-      unsigned snap4 = 0x5E;
-      uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
-
-      bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
-                bi_imm_u32(format), regfmt, nr - 1);
-   } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
-      bi_index index = bi_preload(b, 59);
-      unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
-
-      unsigned index_offset = 0;
-      if (slot->section == PAN_VARYING_SECTION_ATTRIBS)
-         index_offset += 4;
-
-      if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
-         unsigned view_index = nir_src_as_uint(instr->src[1]);
-
-         if (slot->section == PAN_VARYING_SECTION_GENERIC) {
-            index_offset += view_index * 4;
-         } else {
-            /* We don't patch these offsets in the no_psiz variant, so if
-             * multiview is enabled we can't switch to the basic format by
-             * using no_psiz */
-            const uint64_t outputs = b->shader->nir->info.outputs_written;
-            bool extended_position_fifo =
-               valhal_writes_extended_fifo(outputs, false, true);
-            /* Must be the same with and without no_psiz */
-            assert(valhal_writes_extended_fifo(outputs, true, true) ==
-                   extended_position_fifo);
-            unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
-            index_offset += view_index * position_fifo_stride;
-         }
-      }
-
-      if (index_offset != 0)
-         index = bi_iadd_imm_i32(b, index, index_offset);
-
-      const enum va_memory_access mem_access =
-         slot->section == PAN_VARYING_SECTION_GENERIC ? VA_MEMORY_ACCESS_ESTREAM
-                                                      : VA_MEMORY_ACCESS_ISTREAM;
-
-      nir_src *offset_src = nir_get_io_offset_src(instr);
-      assert(nir_src_is_const(*offset_src) && "assumes immediate offset");
-      unsigned offset = slot->offset + (nir_src_as_uint(*offset_src) * 16);
-
-      /* On Valhall, with IDVS varying are stored in a hardware-controlled
-       * buffer through table 61 at index 0 */
-      bi_index address = bi_temp(b->shader);
-      bi_instr *I = bi_lea_buf_imm_to(b, address, index);
-      I->table = va_res_fold_table_idx(61);
-      I->index = 0;
-
-      /* On 5th Gen, the hardware-controlled buffer is at index 1 for varyings */
-      if (pan_arch(b->shader->inputs->gpu_id) >= 12 &&
-          slot->section == PAN_VARYING_SECTION_GENERIC) {
-         I->index = 1;
-      }
-
-      bi_emit_split_i32(b, a, address, 2);
-
-      bi_instr *S = bi_store(b, nr * src_bit_sz, data, a[0], a[1], BI_SEG_NONE,
-                             offset);
-      S->mem_access = mem_access;
-      S->is_psiz_write = slot->location == VARYING_SLOT_PSIZ;
-   } else {
-      assert(T_size == 32 || T_size == 16);
-
-      enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
-
-      /* Since v9 we cannot have separate attribute descriptors for VS-FS,
-       * There might be a mismatch on Gallium where the VS thinks it is storing
-       * an int, but the data is actually a float, and that's what FS expects.
-       * So, just for v9 onwards, just until we haven't fixed gallium, use auto32.
-       * We are still getting around the midgard quirk since we do this only
-       * from v9.
-       * TODO: fix all bugs with gallium and remove this patch
-       */
-      if (b->shader->arch >= 9 && T_size == 32)
-         regfmt = BI_REGISTER_FORMAT_AUTO;
-
-      if (immediate) {
-         bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b),
-                                            bi_instance_id(b),
-                                            regfmt, imm_index);
-         bi_emit_split_i32(b, a, address, 3);
-
-         bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
-      } else {
-         bi_index idx = bi_iadd_u32(b,
-                                    bi_src_index(nir_get_io_offset_src(instr)),
-                                    bi_imm_u32(nir_intrinsic_base(instr)), false);
-         bi_index address =
-            bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
-         bi_emit_split_i32(b, a, address, 3);
-
-         bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
-      }
-   }
-}
-
 static void
 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -2047,16 +1958,6 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       bi_emit_load_var_buf(b, instr);
       break;
 
-   case nir_intrinsic_store_output:
-   case nir_intrinsic_store_per_view_output:
-      if (stage == MESA_SHADER_FRAGMENT)
-         UNREACHABLE("Should have been lowered by pan_nir_lower_fs_outputs");
-      else if (stage == MESA_SHADER_VERTEX)
-         bi_emit_store_vary(b, instr);
-      else
-         UNREACHABLE("Unsupported shader stage");
-      break;
-
    case nir_intrinsic_load_cumulative_coverage_pan:
       bi_mov_i32_to(b, dst, bi_preload(b, 60));
       break;
@@ -2335,6 +2236,18 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       bi_emit_store_cvt(b, instr, va_memory_access_from_nir(instr));
       break;
 
+   case nir_intrinsic_load_idvs_output_buf_index_pan:
+      bi_mov_i32_to(b, dst, bi_preload(b, 59));
+      break;
+
+   case nir_intrinsic_lea_attr_pan:
+      bi_emit_lea_attr(b, instr);
+      break;
+
+   case nir_intrinsic_lea_buf_pan:
+      bi_emit_lea_buf(b, instr);
+      break;
+
    case nir_intrinsic_load_tile_pan:
    case nir_intrinsic_load_tile_res_pan:
       bi_emit_ld_tile(b, instr);
@@ -6631,10 +6544,6 @@ bi_compile_variant_nir(nir_shader *nir,
    ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
    ctx->fau_consts_count = info.init_fau_consts_count;
 
-   if (!mesa_shader_stage_is_compute(nir->info.stage)) {
-      ctx->varying_layout = inputs->varying_layout;
-   }
-
    unsigned execution_mode = nir->info.float_controls_execution_mode;
    ctx->rtz_fp16 = nir_is_rounding_mode_rtz(execution_mode, 16);
    ctx->rtz_fp32 = nir_is_rounding_mode_rtz(execution_mode, 32);
@@ -7098,6 +7007,18 @@ bifrost_compile_shader_nir(nir_shader *nir,
          NIR_PASS(_, nir, nir_opt_if, 0);
       }
    }
+
+      bool has_extended_fifo = false;
+      if (pan_arch(inputs->gpu_id) >= 9) {
+         const uint64_t outputs = nir->info.outputs_written;
+         has_extended_fifo = valhal_writes_extended_fifo(outputs, false, true);
+         /* Must be the same with and without no_psiz */
+         assert(valhal_writes_extended_fifo(outputs, true, true) ==
+                has_extended_fifo);
+      }
+
+      NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
+               inputs->varying_layout, info->vs.idvs, has_extended_fifo);
    }
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
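
[Editor's note, not part of the patch: the immediate-form checks in
bi_emit_lea_attr and bi_emit_lea_buf above (index < 0x10 for LEA_ATTR_IMM, a
valid constant table plus index < 256 for LEA_BUF_IMM) operate on packed
resource handles. A minimal sketch of the packing these helpers are assumed
to implement; illustrative only, the real definitions live in the panfrost
headers.]

   /* Assumed packing: table in bits [31:24], index in bits [23:0]. */
   static inline uint32_t pan_res_handle(uint8_t table, uint32_t index)
   {
      return ((uint32_t)table << 24) | index;
   }

   static inline uint32_t pan_res_handle_get_table(uint32_t handle)
   {
      return handle >> 24;
   }

   static inline uint32_t pan_res_handle_get_index(uint32_t handle)
   {
      return handle & BITFIELD_MASK(24);
   }
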
diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h
index a9fe1ddd837..c8f999e6e0b 100644
--- a/src/panfrost/compiler/bifrost/compiler.h
+++ b/src/panfrost/compiler/bifrost/compiler.h
@@ -1057,8 +1057,6 @@ typedef struct {
    enum bi_idvs_mode idvs;
    unsigned num_blocks;
 
-   const struct pan_varying_layout *varying_layout;
-
    /* Floating point rounding mode controls */
    bool rtz_fp16;
    bool rtz_fp32;
diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h
index 97ca12711e4..9e22a5d10a5 100644
--- a/src/panfrost/compiler/pan_nir.h
+++ b/src/panfrost/compiler/pan_nir.h
@@ -57,6 +57,10 @@ bool pan_nir_lower_frag_coord_zw(nir_shader *shader);
 bool pan_nir_lower_noperspective_vs(nir_shader *shader);
 bool pan_nir_lower_noperspective_fs(nir_shader *shader);
 
+bool pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
+                              const struct pan_varying_layout *varying_layout,
+                              bool has_idvs, bool has_extended_fifo);
+
 bool pan_nir_lower_fs_inputs(nir_shader *shader, unsigned gpu_id,
                              const struct pan_varying_layout *varying_layout,
                              bool valhall_use_ld_var_buf);
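
[Editor's note, not part of the patch: call-site shape for the new entry
point declared above, condensed from the bifrost_compile_shader_nir hunk
earlier in this patch; `nir`, `inputs`, and `info` follow the backend's
existing types.]

   /* Lower VS output stores before backend instruction selection. */
   bool has_extended_fifo = false;
   if (pan_arch(inputs->gpu_id) >= 9)
      has_extended_fifo =
         valhal_writes_extended_fifo(nir->info.outputs_written, false, true);

   NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
            inputs->varying_layout, info->vs.idvs, has_extended_fifo);
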
diff --git a/src/panfrost/compiler/pan_nir_lower_varyings_io.c b/src/panfrost/compiler/pan_nir_lower_varyings_io.c
index b90cf8d7935..42362c2763f 100644
--- a/src/panfrost/compiler/pan_nir_lower_varyings_io.c
+++ b/src/panfrost/compiler/pan_nir_lower_varyings_io.c
@@ -8,6 +8,164 @@
 
 #include "panfrost/model/pan_model.h"
 
+struct lower_vs_outputs_ctx {
+   unsigned arch;
+   const struct pan_varying_layout *varying_layout;
+   bool has_idvs;
+   bool has_extended_fifo;
+};
+
+static void
+build_attr_buf_write(struct nir_builder *b, nir_def *data,
+                     const struct pan_varying_slot *slot, uint32_t view_index,
+                     const struct lower_vs_outputs_ctx *ctx)
+{
+   /* We need the precise memory layout */
+   pan_varying_layout_require_layout(ctx->varying_layout);
+
+   nir_def *index = nir_load_idvs_output_buf_index_pan(b);
+
+   uint32_t res, view_stride;
+   if (slot->section == PAN_VARYING_SECTION_GENERIC) {
+      /* The varying buffer is bound at index 1 on v12+ */
+      uint32_t res_index = ctx->arch >= 12 ? 1 : 0;
+      res = pan_res_handle(61, res_index);
+      view_stride = 4;
+   } else {
+      res = pan_res_handle(61, 0);
+      view_stride = ctx->has_extended_fifo ? 8 : 4;
+   }
+
+   uint32_t index_offset = view_index * view_stride;
+   if (slot->section == PAN_VARYING_SECTION_ATTRIBS)
+      index_offset += 4;
+
+   /* v9+ cache hints: generic varyings don't need caching, while
+    * position/attribute varyings are reused by other units inside the GPU.
+    * TODO: Do we really want ESTREAM on generic varyings?
+    */
+   enum gl_access_qualifier access =
+      slot->section == PAN_VARYING_SECTION_GENERIC ? ACCESS_ESTREAM_PAN :
+                                                     ACCESS_ISTREAM_PAN;
+
+   index = nir_iadd_imm(b, index, index_offset);
+   nir_def *addr = nir_lea_buf_pan(b, nir_imm_int(b, res), index);
+   addr = nir_pack_64_2x32(b, addr);
+   addr = nir_iadd(b, addr, nir_imm_int64(b, slot->offset));
+
+   /* Tag writes to gl_PointSize with a special intrinsic */
+   if (slot->location == VARYING_SLOT_PSIZ) {
+      nir_store_global_psiz_pan(b, data, addr, .access = access);
+   } else {
+      nir_store_global(b, data, addr, .access = access);
+   }
+}
+
+static void
+build_attr_desc_write(struct nir_builder *b, nir_def *data, uint32_t base,
+                      nir_alu_type src_type,
+                      const struct lower_vs_outputs_ctx *ctx)
+{
+   nir_def *index = nir_imm_int(b, base);
+   nir_def *vertex_id = nir_load_raw_vertex_id_pan(b);
+   nir_def *instance_id = nir_load_instance_id(b);
+
+   nir_def *addr_cvt = nir_lea_attr_pan(b, index, vertex_id, instance_id,
+                                        .src_type = src_type);
+   nir_def *addr = nir_pack_64_2x32(b, nir_trim_vector(b, addr_cvt, 2));
+   nir_def *cvt = nir_channel(b, addr_cvt, 2);
+
+   nir_store_global_cvt_pan(b, data, addr, cvt, .src_type = src_type);
+}
+
+static bool
+lower_vs_output_store(struct nir_builder *b,
+                      nir_intrinsic_instr *store, void *cb_data)
+{
+   const struct lower_vs_outputs_ctx *ctx = cb_data;
+
+   if (store->intrinsic != nir_intrinsic_store_output &&
+       store->intrinsic != nir_intrinsic_store_per_view_output)
+      return false;
+
+   b->cursor = nir_instr_remove(&store->instr);
+
+   nir_io_semantics sem = nir_intrinsic_io_semantics(store);
+   nir_alu_type src_type = nir_intrinsic_src_type(store);
+   unsigned src_bit_size = nir_alu_type_get_type_size(src_type);
+
+   /* Indirect array varyings are not yet supported (num_slots > 1) */
+   assert(sem.num_slots == 1);
+   assert(nir_src_as_uint(*nir_get_io_offset_src(store)) == 0);
+
+   /* We need the slot section for cache hints */
+   pan_varying_layout_require_format(ctx->varying_layout);
+   const struct pan_varying_slot *slot =
+      pan_varying_layout_find_slot(ctx->varying_layout, sem.location);
+   /* Special slots are read only */
+   assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL);
+   /* From v9, IO is resized to the real size of the slot */
+   assert(ctx->arch < 9 ||
+          src_bit_size == nir_alu_type_get_type_size(slot->alu_type));
+
+   /* Since v9 we cannot have separate attribute descriptors for the VS and
+    * FS, so there might be a mismatch on Gallium where the VS thinks it is
+    * storing an int while the data is actually a float, which is what the
+    * FS expects. So, for v9 onwards, use auto32 until this is fixed in
+    * Gallium. We still get around the Midgard quirk because we only do
+    * this from v9.
+    * TODO: fix all bugs with gallium and remove this workaround
+    */
+   if (ctx->arch >= 9 && src_bit_size == 32)
+      src_type = nir_type_invalid | 32; /* auto32 */
+
+   nir_def *data = store->src[0].ssa;
+   assert(src_bit_size == data->bit_size);
+
+   /* Trim the input so we don't write extra channels at the end. In effect,
+    * we fill in all the intermediate "holes" in the write mask, since we
+    * can't mask off stores. Since nir_lower_io_vars_to_temporaries ensures
+    * each varying is written at most once, anything that's masked out is
+    * undefined, so it doesn't matter what we write there. So we may as well
+    * do the simplest thing possible.
+    */
+   const nir_component_mask_t write_mask = nir_intrinsic_write_mask(store);
+   data = nir_trim_vector(b, data, util_last_bit(write_mask));
+
+   if (ctx->arch >= 9 && ctx->has_idvs) {
+      uint32_t view_index = 0;
+      if (store->intrinsic == nir_intrinsic_store_per_view_output)
+         view_index = nir_src_as_uint(store->src[1]);
+
+      build_attr_buf_write(b, data, slot, view_index, ctx);
+   } else {
+      uint32_t base = nir_intrinsic_base(store);
+      assert(store->intrinsic != nir_intrinsic_store_per_view_output);
+      build_attr_desc_write(b, data, base, src_type, ctx);
+   }
+
+   return true;
+}
+
+bool
+pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
+                         const struct pan_varying_layout *varying_layout,
+                         bool has_idvs,
+                         bool has_extended_fifo)
+{
+   assert(shader->info.stage == MESA_SHADER_VERTEX);
+
+   const struct lower_vs_outputs_ctx ctx = {
+      .arch = pan_arch(gpu_id),
+      .varying_layout = varying_layout,
+      .has_idvs = has_idvs,
+      .has_extended_fifo = has_extended_fifo,
+   };
+   return nir_shader_intrinsics_pass(shader, lower_vs_output_store,
+                                     nir_metadata_control_flow,
+                                     (void *)&ctx);
+}
+
 struct lower_fs_inputs_ctx {
    unsigned arch;
    const struct pan_varying_layout *varying_layout;