diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 926c7518c13..c41a9f693f1 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -1015,34 +1015,6 @@ bi_is_zs(unsigned location)
    return location == FRAG_RESULT_DEPTH || location == FRAG_RESULT_STENCIL;
 }
 
-static bool
-bifrost_nir_lower_shader_output_impl(struct nir_builder *b,
-                                     nir_intrinsic_instr *intr, void *data)
-{
-   if (intr->intrinsic != nir_intrinsic_store_output &&
-       intr->intrinsic != nir_intrinsic_store_per_view_output)
-      return false;
-
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-   unsigned mask = va_shader_output_from_loc(sem.location);
-
-   b->cursor = nir_instr_remove(&intr->instr);
-   nir_def *shader_output = nir_load_shader_output_pan(b);
-
-   nir_push_if(b, nir_i2b(b, nir_iand_imm(b, shader_output, mask)));
-   nir_builder_instr_insert(b, &intr->instr);
-   nir_pop_if(b, NULL);
-   return true;
-}
-
-static bool
-bifrost_nir_lower_shader_output(nir_shader *shader)
-{
-   return nir_shader_intrinsics_pass(shader,
-                                     bifrost_nir_lower_shader_output_impl,
-                                     nir_metadata_none, NULL);
-}
-
 /* Atomics and memory write on the vertex stage have implementation-defined
  * behaviors on how many invocations will happen. However for some reasons,
  * atomic counters on GL/GLES specs are quite ambigous here and even have tests
@@ -6133,13 +6105,6 @@ bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id)
       NIR_PASS(_, nir, nir_lower_viewport_transform);
       NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
       NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
-
-      /* nir_lower[_explicit]_io is lazy and emits mul+add chains even
-       * for offsets it could figure out are constant. Do some
-       * constant folding before pan_nir_lower_store_component below.
-       */
-      NIR_PASS(_, nir, nir_opt_constant_folding);
-      NIR_PASS(_, nir, pan_nir_lower_store_component);
    }
 
    nir_lower_mem_access_bit_sizes_options mem_size_options = {
@@ -6853,26 +6818,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
    if (info->vs.idvs && nir->info.writes_memory)
       NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics);
 
-   if (info->vs.idvs) {
-      bool shader_output_pass = false;
-      NIR_PASS(shader_output_pass, nir, bifrost_nir_lower_shader_output);
-
-      /* If shader output lower made progress, ensure to merge adjacent if that were added */
-      if (shader_output_pass) {
-         /* First we clean up and deduplicate added condition logic */
-         NIR_PASS(_, nir, nir_opt_copy_prop);
-         NIR_PASS(_, nir, nir_opt_dce);
-         NIR_PASS(_, nir, nir_opt_cse);
-
-         /* We then move vec closer to their I/O stores to ensure nothing is between ifs */
-         NIR_PASS(_, nir, nir_opt_sink, nir_move_copies);
-         NIR_PASS(_, nir, nir_opt_move, nir_move_copies);
-
-         /* Finally run if optimization to ensure trivial if blocks are merged together */
-         NIR_PASS(_, nir, nir_opt_if, 0);
-      }
-   }
-
    bool has_extended_fifo = false;
    if (pan_arch(inputs->gpu_id) >= 9) {
       const uint64_t outputs = nir->info.outputs_written;
diff --git a/src/panfrost/compiler/meson.build b/src/panfrost/compiler/meson.build
index 6c9bb5eeef1..83a94dbde04 100644
--- a/src/panfrost/compiler/meson.build
+++ b/src/panfrost/compiler/meson.build
@@ -15,7 +15,6 @@ libpanfrost_compiler_files = files(
   'pan_nir_lower_image_ms.c',
   'pan_nir_lower_noperspective.c',
   'pan_nir_lower_sample_position.c',
-  'pan_nir_lower_store_component.c',
   'pan_nir_lower_texel_buffer_index.c',
   'pan_nir_lower_vertex_id.c',
   'pan_nir_lower_vs_outputs.c',
@@ -29,8 +28,8 @@ subdir('midgard')
 libpanfrost_compiler = static_library(
   'panfrost_compiler',
   [libpanfrost_compiler_files],
-  include_directories : [inc_include, inc_src],
-  dependencies: [idep_nir, idep_mesautil],
+  include_directories : [inc_include, inc_src, inc_valhall],
+  dependencies: [idep_nir, idep_mesautil, idep_valhall_enums_h, idep_bi_opcodes_h],
   link_with: [libpanfrost_midgard, libpanfrost_bifrost],
   c_args : [no_override_init_args],
   gnu_symbol_visibility : 'hidden',
diff --git a/src/panfrost/compiler/midgard/midgard_compile.c b/src/panfrost/compiler/midgard/midgard_compile.c
index ae4cd952c3c..b8a38a7bf35 100644
--- a/src/panfrost/compiler/midgard/midgard_compile.c
+++ b/src/panfrost/compiler/midgard/midgard_compile.c
@@ -388,13 +388,6 @@ midgard_postprocess_nir(nir_shader *nir, UNUSED unsigned gpu_id)
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       NIR_PASS(_, nir, nir_lower_viewport_transform);
       NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
-
-      /* nir_lower[_explicit]_io is lazy and emits mul+add chains even
-       * for offsets it could figure out are constant. Do some
-       * constant folding before pan_nir_lower_store_component below.
-       */
-      NIR_PASS(_, nir, nir_opt_constant_folding);
-      NIR_PASS(_, nir, pan_nir_lower_store_component);
    }
 
    /* Could be eventually useful for Vulkan, but we don't expect it to have
@@ -2989,6 +2982,12 @@ midgard_compile_shader_nir(nir_shader *nir,
       pan_nir_collect_noperspective_varyings_fs(nir);
    }
 
+   if (nir->info.stage == MESA_SHADER_VERTEX) {
+      NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
+               inputs->varying_layout, false /* has_idvs */,
+               false /* has_extended_fifo */);
+   }
+
    /* Optimisation passes */
    optimise_nir(nir, ctx->quirks, inputs->is_blend);
 
diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h
index 1dec76e6d51..1c34afc6edc 100644
--- a/src/panfrost/compiler/pan_nir.h
+++ b/src/panfrost/compiler/pan_nir.h
@@ -47,8 +47,6 @@ pan_nir_tile_default_coverage(nir_builder *b)
 
 bool pan_nir_lower_bool_to_bitsize(nir_shader *shader);
 
-bool pan_nir_lower_store_component(nir_shader *shader);
-
 bool pan_nir_lower_vertex_id(nir_shader *shader);
 
 bool pan_nir_lower_image_ms(nir_shader *shader);
diff --git a/src/panfrost/compiler/pan_nir_lower_store_component.c b/src/panfrost/compiler/pan_nir_lower_store_component.c
deleted file mode 100644
index e0bc7ef06e7..00000000000
--- a/src/panfrost/compiler/pan_nir_lower_store_component.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2020-2022 Collabora Ltd.
- * SPDX-License-Identifier: MIT
- */
-
-#include "compiler/nir/nir_builder.h"
-#include "pan_nir.h"
-
-/*
- * If the shader packs multiple varyings into the same location with different
- * location_frac, we'll need to lower to a single varying store that collects
- * all of the channels together. This is because the varying instruction on
- * Midgard and Bifrost is slot-based, writing out an entire vec4 slot at a time.
- *
- * NOTE: this expects all stores to be outside of control flow, and with
- * constant offsets. It should be run after nir_lower_io_vars_to_temporaries.
- */
-static bool
-lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
-{
-   if (intr->intrinsic != nir_intrinsic_store_output &&
-       intr->intrinsic != nir_intrinsic_store_per_view_output)
-      return false;
-
-   struct hash_table_u64 *slots = data;
-   unsigned component = nir_intrinsic_component(intr);
-   nir_src *slot_src = nir_get_io_offset_src(intr);
-   uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
-
-   if (intr->intrinsic == nir_intrinsic_store_per_view_output) {
-      uint64_t view_index = nir_src_as_uint(intr->src[1]);
-      slot |= view_index << 32;
-   }
-
-   nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
-   unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
-
-   nir_def *value = intr->src[0].ssa;
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_def *undef = nir_undef(b, 1, value->bit_size);
-   nir_def *channels[4] = {undef, undef, undef, undef};
-
-   /* Copy old */
-   u_foreach_bit(i, mask) {
-      assert(prev != NULL);
-      nir_def *prev_ssa = prev->src[0].ssa;
-      channels[i] = nir_channel(b, prev_ssa, i);
-   }
-
-   /* Copy new */
-   unsigned new_mask = nir_intrinsic_write_mask(intr);
-   mask |= (new_mask << component);
-
-   u_foreach_bit(i, new_mask) {
-      assert(component + i < 4);
-      channels[component + i] = nir_channel(b, value, i);
-   }
-
-   intr->num_components = util_last_bit(mask);
-   nir_src_rewrite(&intr->src[0], nir_vec(b, channels, intr->num_components));
-
-   nir_intrinsic_set_component(intr, 0);
-   nir_intrinsic_set_write_mask(intr, mask);
-
-   if (prev) {
-      _mesa_hash_table_u64_remove(slots, slot);
-      nir_instr_remove(&prev->instr);
-   }
-
-   _mesa_hash_table_u64_insert(slots, slot, intr);
-   return true;
-}
-
-bool
-pan_nir_lower_store_component(nir_shader *s)
-{
-   assert(s->info.stage == MESA_SHADER_VERTEX);
-
-   struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL);
-   bool progress = nir_shader_intrinsics_pass(
-      s, lower_store_component,
-      nir_metadata_control_flow, stores);
-   _mesa_hash_table_u64_destroy(stores);
-   return progress;
-}
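
For reference, the merge that the deleted pan_nir_lower_store_component performed can be sketched in isolation. The standalone program below is a toy model, not Mesa code: struct slot_store, merge_store, and the float channel array are hypothetical names standing in for the NIR vec being rebuilt. It mirrors the pass's mask arithmetic (mask |= (new_mask << component), new channels overwriting old ones), which survives in the re-swizzle logic of the new gather_vs_outputs further down.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the write-mask merge: two partial stores to the same vec4
 * slot, each with its own component offset, become one store whose mask is
 * the union of the per-store masks and whose channels come from the latest
 * writer. Names and values here are invented for illustration. */
struct slot_store {
   uint8_t mask;    /* bit i set => channel i written */
   float chan[4];   /* channel values; unwritten ones stay undefined */
};

static void
merge_store(struct slot_store *slot, unsigned component, uint8_t write_mask,
            const float *values)
{
   /* Make the incoming mask slot-relative, as the pass did with
    * `mask |= (new_mask << component)`. */
   uint8_t slot_mask = (uint8_t)(write_mask << component);

   for (unsigned i = 0; i < 4; i++) {
      if (write_mask & (1u << i)) {
         assert(component + i < 4);
         slot->chan[component + i] = values[i];
      }
   }
   slot->mask |= slot_mask;
}

int
main(void)
{
   struct slot_store slot = {0};

   /* store_output(..., component=0, wrmask=0x3) writing a vec2 */
   merge_store(&slot, 0, 0x3, (const float[]){1.0f, 2.0f});
   /* store_output(..., component=2, wrmask=0x3) writing another vec2 */
   merge_store(&slot, 2, 0x3, (const float[]){3.0f, 4.0f});

   /* One slot-wide store remains: mask 0xf, channels 1 2 3 4 */
   printf("mask=0x%x chans=%g %g %g %g\n", slot.mask,
          slot.chan[0], slot.chan[1], slot.chan[2], slot.chan[3]);
   return 0;
}

Two vec2 stores at components 0 and 2 of one location end up as a single store with mask 0xf, matching the single slot-wide write the pass emitted for a packed vec4 slot.
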
diff --git a/src/panfrost/compiler/pan_nir_lower_vs_outputs.c b/src/panfrost/compiler/pan_nir_lower_vs_outputs.c
index ec07faf3f82..01ae096bbe3 100644
--- a/src/panfrost/compiler/pan_nir_lower_vs_outputs.c
+++ b/src/panfrost/compiler/pan_nir_lower_vs_outputs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2025 Collabora, Ltd.
+ * Copyright (C) 2025-2026 Collabora, Ltd.
  * SPDX-License-Identifier: MIT
  */
 
@@ -7,21 +7,30 @@
 #include "nir_builder.h"
 #include "panfrost/model/pan_model.h"
+#include "bifrost/valhall/valhall.h"
+#include "util/bitscan.h"
 
 struct lower_vs_outputs_ctx {
    unsigned arch;
    const struct pan_varying_layout *varying_layout;
    bool has_idvs;
    bool has_extended_fifo;
+
+   nir_variable *variables[PAN_MAX_VARYINGS];
+   uint8_t per_view_written[PAN_MAX_VARYINGS];
+   unsigned used_buckets;
 };
 
 static void
-build_attr_buf_write(struct nir_builder *b, nir_def *data,
-                     const struct pan_varying_slot *slot, uint32_t view_index,
+build_attr_buf_write(struct nir_builder *b, nir_def *data, uint32_t idx,
+                     uint32_t view_index,
                      const struct lower_vs_outputs_ctx *ctx)
 {
    /* We need the precise memory layout */
    pan_varying_layout_require_layout(ctx->varying_layout);
 
+   const struct pan_varying_slot *slot =
+      pan_varying_layout_slot_at(ctx->varying_layout, idx);
+   assert(slot);
+
    nir_def *index = nir_load_idvs_output_buf_index_pan(b);
 
@@ -78,61 +87,146 @@ build_attr_desc_write(struct nir_builder *b, nir_def *data, uint32_t base,
    nir_store_global_cvt_pan(b, data, addr, cvt, .src_type = src_type);
 }
 
-static bool
-lower_vs_output_store(struct nir_builder *b,
-                      nir_intrinsic_instr *store, void *cb_data)
+static void
+build_store_output(struct nir_builder *b, nir_def *data, uint32_t idx,
+                   nir_alu_type src_type,
+                   const struct lower_vs_outputs_ctx *ctx)
 {
-   const struct lower_vs_outputs_ctx *ctx = cb_data;
+   pan_varying_layout_require_format(ctx->varying_layout);
 
-   if (store->intrinsic != nir_intrinsic_store_output &&
-       store->intrinsic != nir_intrinsic_store_per_view_output)
-      return false;
+   const struct pan_varying_slot *slot =
+      pan_varying_layout_slot_at(ctx->varying_layout, idx);
+   assert(slot);
 
-   b->cursor = nir_instr_remove(&store->instr);
+   nir_io_semantics sem = {
+      .location = slot->location,
+      .num_slots = 1,
+   };
 
-   nir_io_semantics sem = nir_intrinsic_io_semantics(store);
-   nir_alu_type src_type = nir_intrinsic_src_type(store);
-   unsigned src_bit_size = nir_alu_type_get_type_size(src_type);
+   nir_store_output(b, data, nir_imm_int(b, 0), .base = idx, .range = 1,
+                    .write_mask = BITFIELD_MASK(slot->ncomps), .component = 0,
+                    .src_type = src_type, .io_semantics = sem);
+}
+
+static void
+lower_vs_output_store(struct nir_builder *b,
+                      unsigned view_index,
+                      unsigned idx, const struct lower_vs_outputs_ctx *ctx)
+{
+   nir_variable *var = ctx->variables[idx];
+   assert(var != NULL);
+
+   nir_alu_type src_type =
+      nir_get_nir_type_for_glsl_type(glsl_without_array(var->type));
+
+   bool is_per_view = glsl_type_is_array(var->type);
+   assert(is_per_view || view_index == 0);
+   nir_def *data = is_per_view ? nir_load_array_var_imm(b, var, view_index)
+                               : nir_load_var(b, var);
+
+   if (ctx->arch >= 9 && ctx->has_idvs) {
+      build_attr_buf_write(b, data, idx, view_index, ctx);
+   } else if (ctx->arch >= 6) {
+      assert(!is_per_view);
+      build_attr_desc_write(b, data, idx, src_type, ctx);
+   } else {
+      assert(!is_per_view);
+      build_store_output(b, data, idx, src_type, ctx);
+   }
+}
+
+static nir_variable *
+get_or_create_var(nir_builder *b, struct lower_vs_outputs_ctx *ctx,
+                  nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_store_output ||
+          intr->intrinsic == nir_intrinsic_store_per_view_output);
+   bool is_per_view = intr->intrinsic == nir_intrinsic_store_per_view_output;
+   ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   unsigned slot_idx = nir_intrinsic_base(intr);
+   nir_alu_type src_type = nir_intrinsic_src_type(intr);
+   enum glsl_base_type base_type = nir_get_glsl_base_type_for_nir_type(src_type);
 
    /* Indirect array varyings are not yet supported (num_slots > 1) */
    assert(sem.num_slots == 1);
-   assert(nir_src_as_uint(*nir_get_io_offset_src(store)) == 0);
+   assert(nir_src_as_uint(*nir_get_io_offset_src(intr)) == 0);
 
-   /* We need the slot section for cache hints */
+   nir_variable *var = ctx->variables[slot_idx];
+   if (var != NULL) {
+      /* All stores should agree per-location */
+      assert(glsl_type_is_array(var->type) == is_per_view);
+      assert(glsl_get_base_type(glsl_without_array(var->type)) == base_type);
+      return var;
+   }
+
+   /* We need the slot section for the number of components */
    pan_varying_layout_require_format(ctx->varying_layout);
 
    const struct pan_varying_slot *slot =
-      pan_varying_layout_find_slot(ctx->varying_layout, sem.location);
+      pan_varying_layout_slot_at(ctx->varying_layout, slot_idx);
 
    /* Special slots are read only */
-   assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL);
-   /* From v9, IO is resized to the real size of the slot */
-   assert(ctx->arch < 9 ||
-          src_bit_size == nir_alu_type_get_type_size(slot->alu_type));
+   assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL &&
+          slot->location == sem.location);
 
-   nir_def *data = store->src[0].ssa;
-   assert(src_bit_size == data->bit_size);
+   const glsl_type *var_type = glsl_vector_type(base_type, slot->ncomps);
+   if (is_per_view)
+      var_type = glsl_array_type(var_type, PAN_MAX_MULTIVIEW_VIEW_COUNT, false);
 
-   /* Trim the input so we don't write extra channels at the end. In effect,
-    * we fill in all the intermediate "holes" in the write mask, since we
-    * can't mask off stores. Since nir_lower_io_vars_to_temporaries ensures
-    * each varying is written at most once, anything that's masked out is
-    * undefined, so it doesn't matter what we write there. So we may as well
-    * do the simplest thing possible.
-    */
+   var = nir_local_variable_create(b->impl, var_type, "vs_out_tmp");
+   ctx->variables[slot_idx] = var;
+   return var;
+}
+
+static bool
+gather_vs_outputs(struct nir_builder *b,
+                  nir_intrinsic_instr *intr, void *cb_data)
+{
+   struct lower_vs_outputs_ctx *ctx = cb_data;
+
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_store_per_view_output)
+      return false;
+
+   unsigned mask = nir_intrinsic_write_mask(intr);
+   unsigned slot_idx = nir_intrinsic_base(intr);
+   unsigned component = nir_intrinsic_component(intr);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+   bool is_per_view = intr->intrinsic == nir_intrinsic_store_per_view_output;
+   unsigned view_index = is_per_view ? nir_src_as_uint(intr->src[1]) : 0;
+
+   ctx->per_view_written[slot_idx] |= BITFIELD_BIT(view_index);
+   ctx->used_buckets |= BITFIELD_BIT(va_shader_output_from_loc(sem.location));
+
+   b->cursor = nir_instr_remove(&intr->instr);
+   nir_variable *var = get_or_create_var(b, ctx, intr);
+
+   unsigned ncomps = glsl_get_vector_elements(glsl_without_array(var->type));
+
+   /* store_output semantics differ from store_var when we write a subset of
+    * the components; we need to re-swizzle the channels manually.
+    */
-   const nir_component_mask_t write_mask = nir_intrinsic_write_mask(store);
-   data = nir_trim_vector(b, data, util_last_bit(write_mask));
+   nir_def *stored = intr->src[0].ssa;
+   if (stored->num_components != ncomps) {
+      nir_def *undef = nir_undef(b, 1, stored->bit_size);
+      assert(ncomps <= 4);
+      nir_def *channels[4] = {undef, undef, undef, undef};
 
-   if (ctx->arch >= 9 && ctx->has_idvs) {
-      uint32_t view_index = 0;
-      if (store->intrinsic == nir_intrinsic_store_per_view_output)
-         view_index = nir_src_as_uint(store->src[1]);
+      for (unsigned i = 0; i < stored->num_components; i++) {
+         assert(component + i < 4);
+         channels[component + i] = nir_channel(b, stored, i);
+      }
 
-      build_attr_buf_write(b, data, slot, view_index, ctx);
+      stored = nir_vec(b, channels, ncomps);
+      mask = mask << component; /* mask was relative to intr->component */
    } else {
-      uint32_t base = nir_intrinsic_base(store);
-      assert(store->intrinsic != nir_intrinsic_store_per_view_output);
-      build_attr_desc_write(b, data, base, src_type, ctx);
+      assert(component == 0);
    }
 
+   if (is_per_view)
+      nir_store_array_var_imm(b, var, view_index, stored, mask);
+   else
+      nir_store_var(b, var, stored, mask);
+
    return true;
 }
 
@@ -143,14 +237,62 @@ pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
                          bool has_extended_fifo)
 {
    assert(shader->info.stage == MESA_SHADER_VERTEX);
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
 
-   const struct lower_vs_outputs_ctx ctx = {
+   struct lower_vs_outputs_ctx ctx = {
       .arch = pan_arch(gpu_id),
      .varying_layout = varying_layout,
      .has_idvs = has_idvs,
      .has_extended_fifo = has_extended_fifo,
+     .variables = {NULL, },
+     .per_view_written = {0, },
+     .used_buckets = 0,
    };
 
-   return nir_shader_intrinsics_pass(shader, lower_vs_output_store,
-                                     nir_metadata_control_flow,
-                                     (void *)&ctx);
+   /* We use uint8_t as a per-view-written bitmask */
+   assert(PAN_MAX_MULTIVIEW_VIEW_COUNT <= 8 * sizeof(ctx.per_view_written[0]));
+   bool progress = nir_shader_intrinsics_pass(shader, gather_vs_outputs,
+                                              nir_metadata_control_flow,
+                                              (void *)&ctx);
+   if (!progress)
+      return false;
+
+   nir_builder builder = nir_builder_at(nir_after_impl(impl));
+   nir_builder *b = &builder;
+   nir_def *shader_output = has_idvs ? nir_load_shader_output_pan(b) : NULL;
+
+   for (int out = 0; out < VA_SHADER_OUTPUT_COUNT; out++) {
+      /* Skip buckets nothing writes to: no needless iteration and, for IDVS,
+       * no needless ifs */
+      if (!(ctx.used_buckets & BITFIELD_BIT(out)))
+         continue;
+
+      nir_if *nif = NULL;
+      if (has_idvs) {
+         nir_def *should_write =
+            nir_i2b(b, nir_iand_imm(b, shader_output, BITFIELD_BIT(out)));
+         nif = nir_push_if(b, should_write);
+      }
+
+      for (unsigned idx = 0; idx < varying_layout->count; idx++) {
+         nir_variable *var = ctx.variables[idx];
+         if (var == NULL)
+            continue;
+
+         const struct pan_varying_slot *slot =
+            pan_varying_layout_slot_at(varying_layout, idx);
+         if (slot == NULL || va_shader_output_from_loc(slot->location) != out)
+            continue;
+
+         /* Non-multiview stores are treated as writing just view index 0 */
+         unsigned view_mask = ctx.per_view_written[idx];
+         assert(view_mask != 0);
+
+         u_foreach_bit(view_index, view_mask)
+            lower_vs_output_store(b, view_index, idx, &ctx);
+      }
+
+      if (nif)
+         nir_pop_if(b, nif);
+   }
+
+   return nir_progress(true, impl, nir_metadata_none);
 }
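
The emission loop above replaces the removed bifrost_nir_lower_shader_output, which wrapped every individual store in its own if (shader_output & mask) block and then relied on a chain of cleanup passes to merge the resulting ifs. A minimal standalone sketch of the new guard shape follows; it is plain C for illustration only. The bucket enum, output table, and mask values are invented and do not correspond to the real va_shader_output_* encoding, and the shader_output variable stands in for the value nir_load_shader_output_pan() loads at runtime.

#include <stdio.h>

/* Toy model of the per-bucket guard: outputs are grouped into buckets (what
 * va_shader_output_from_loc computes), and one guard covers all stores of a
 * bucket instead of one guard per store. Names here are hypothetical. */
enum bucket { BUCKET_POSITION, BUCKET_PSIZ, BUCKET_VARYING, BUCKET_COUNT };

struct output {
   const char *name;
   enum bucket bucket;
};

int
main(void)
{
   const struct output outputs[] = {
      {"gl_Position", BUCKET_POSITION},
      {"vtx_color", BUCKET_VARYING},
      {"vtx_uv", BUCKET_VARYING},
   };
   const unsigned num_outputs = sizeof(outputs) / sizeof(outputs[0]);

   /* Pass time: record which buckets the shader writes at all */
   unsigned used_buckets = 0;
   for (unsigned i = 0; i < num_outputs; i++)
      used_buckets |= 1u << outputs[i].bucket;

   /* "Runtime" mask, standing in for nir_load_shader_output_pan(): here a
    * position-only IDVS phase requests just BUCKET_POSITION */
   unsigned shader_output = 1u << BUCKET_POSITION;

   for (unsigned out = 0; out < BUCKET_COUNT; out++) {
      /* Unused buckets produce no guard and no stores at all */
      if (!(used_buckets & (1u << out)))
         continue;

      /* One `if` guards every store of the bucket */
      if (!(shader_output & (1u << out)))
         continue;

      for (unsigned i = 0; i < num_outputs; i++) {
         if (outputs[i].bucket == out)
            printf("store %s\n", outputs[i].name);
      }
   }
   return 0;
}

Because the guards are now generated in one place, already merged per bucket, this is presumably why the nir_opt_copy_prop/nir_opt_sink/nir_opt_if cleanup sequence could be dropped from bifrost_compile_shader_nir in this patch.
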