pan/compiler: Group outputs in lower_vs_outputs

Previously, bifrost_nir_lower_shader_output wrapped each output store
in its own if block, and a series of follow-up optimization passes made
a best-effort attempt to merge the adjacent ifs back together.  This
also assumed that pan_nir_lower_store_component had merged the stores
so that each output was written only once, and that
nir_lower_io_vars_to_temporaries had pulled them out of any control
flow.

Now all of this is handled by the new pan_nir_lower_vs_outputs pass,
which takes care of write masks, control flow, per-view outputs and
grouping for IDVS.  This makes the overall pass dependencies much
simpler, ensures that the stores end up grouped inside the same if
blocks, and should be more robust.

Signed-off-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40537>
Lorenzo Rossi 2026-03-25 10:08:31 +01:00 committed by Marge Bot
parent 66bee415ad
commit 445a22acbd
6 changed files with 193 additions and 196 deletions
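For orientation, the reworked pass is split into two phases, as the hunks
below show: a gather phase that replaces every store_output /
store_per_view_output with a write-masked store into a per-slot local
variable, and an emit phase that re-materializes the stores at the end of
the entrypoint, grouped per IDVS output bucket under a single if.  The
following is a heavily abridged sketch of that shape, pieced together from
the diff (it omits the write-mask re-swizzling, the per-view array
temporaries and the pre-v9/Midgard store paths, and is not meant to
compile on its own):

    /* Phase 1: intercept each output store and park its value in a local
     * variable keyed by the slot index; partial writes simply become
     * write-masked nir_store_var()s on the same temporary. */
    static bool
    gather_vs_outputs(nir_builder *b, nir_intrinsic_instr *intr, void *cb_data)
    {
       struct lower_vs_outputs_ctx *ctx = cb_data;

       if (intr->intrinsic != nir_intrinsic_store_output &&
           intr->intrinsic != nir_intrinsic_store_per_view_output)
          return false;

       b->cursor = nir_instr_remove(&intr->instr);
       nir_variable *var = get_or_create_var(b, ctx, intr); /* helper below */
       nir_store_var(b, var, intr->src[0].ssa, nir_intrinsic_write_mask(intr));
       return true;
    }

    /* Phase 2: at the end of the entrypoint, emit one if per IDVS output
     * bucket and the grouped output stores inside it. */
    nir_builder b = nir_builder_at(nir_after_impl(impl));
    nir_def *shader_output = nir_load_shader_output_pan(&b);

    for (int out = 0; out < VA_SHADER_OUTPUT_COUNT; out++) {
       nir_push_if(&b, nir_i2b(&b, nir_iand_imm(&b, shader_output,
                                                BITFIELD_BIT(out))));
       /* nir_load_var() each temporary belonging to this bucket and emit
        * the attribute-buffer / attribute-descriptor / store_output write */
       nir_pop_if(&b, NULL);
    }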


@@ -1015,34 +1015,6 @@ bi_is_zs(unsigned location)
return location == FRAG_RESULT_DEPTH || location == FRAG_RESULT_STENCIL;
}
static bool
bifrost_nir_lower_shader_output_impl(struct nir_builder *b,
nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_store_output &&
intr->intrinsic != nir_intrinsic_store_per_view_output)
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
unsigned mask = va_shader_output_from_loc(sem.location);
b->cursor = nir_instr_remove(&intr->instr);
nir_def *shader_output = nir_load_shader_output_pan(b);
nir_push_if(b, nir_i2b(b, nir_iand_imm(b, shader_output, mask)));
nir_builder_instr_insert(b, &intr->instr);
nir_pop_if(b, NULL);
return true;
}
static bool
bifrost_nir_lower_shader_output(nir_shader *shader)
{
return nir_shader_intrinsics_pass(shader,
bifrost_nir_lower_shader_output_impl,
nir_metadata_none, NULL);
}
/* Atomics and memory write on the vertex stage have implementation-defined
* behaviors on how many invocations will happen. However for some reasons,
* atomic counters on GL/GLES specs are quite ambigous here and even have tests
@@ -6133,13 +6105,6 @@ bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id)
NIR_PASS(_, nir, nir_lower_viewport_transform);
NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
/* nir_lower[_explicit]_io is lazy and emits mul+add chains even
* for offsets it could figure out are constant. Do some
* constant folding before pan_nir_lower_store_component below.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, pan_nir_lower_store_component);
}
nir_lower_mem_access_bit_sizes_options mem_size_options = {
@@ -6853,26 +6818,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
if (info->vs.idvs && nir->info.writes_memory)
NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics);
if (info->vs.idvs) {
bool shader_output_pass = false;
NIR_PASS(shader_output_pass, nir, bifrost_nir_lower_shader_output);
/* If shader output lower made progress, ensure to merge adjacent if that were added */
if (shader_output_pass) {
/* First we clean up and deduplicate added condition logic */
NIR_PASS(_, nir, nir_opt_copy_prop);
NIR_PASS(_, nir, nir_opt_dce);
NIR_PASS(_, nir, nir_opt_cse);
/* We then move vec closer to their I/O stores to ensure nothing is between ifs */
NIR_PASS(_, nir, nir_opt_sink, nir_move_copies);
NIR_PASS(_, nir, nir_opt_move, nir_move_copies);
/* Finally run if optimization to ensure trivial if blocks are merged together */
NIR_PASS(_, nir, nir_opt_if, 0);
}
}
bool has_extended_fifo = false;
if (pan_arch(inputs->gpu_id) >= 9) {
const uint64_t outputs = nir->info.outputs_written;


@@ -15,7 +15,6 @@ libpanfrost_compiler_files = files(
'pan_nir_lower_image_ms.c',
'pan_nir_lower_noperspective.c',
'pan_nir_lower_sample_position.c',
'pan_nir_lower_store_component.c',
'pan_nir_lower_texel_buffer_index.c',
'pan_nir_lower_vertex_id.c',
'pan_nir_lower_vs_outputs.c',
@@ -29,8 +28,8 @@ subdir('midgard')
libpanfrost_compiler = static_library(
'panfrost_compiler',
[libpanfrost_compiler_files],
include_directories : [inc_include, inc_src],
dependencies: [idep_nir, idep_mesautil],
include_directories : [inc_include, inc_src, inc_valhall],
dependencies: [idep_nir, idep_mesautil, idep_valhall_enums_h, idep_bi_opcodes_h],
link_with: [libpanfrost_midgard, libpanfrost_bifrost],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',


@@ -388,13 +388,6 @@ midgard_postprocess_nir(nir_shader *nir, UNUSED unsigned gpu_id)
if (nir->info.stage == MESA_SHADER_VERTEX) {
NIR_PASS(_, nir, nir_lower_viewport_transform);
NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
/* nir_lower[_explicit]_io is lazy and emits mul+add chains even
* for offsets it could figure out are constant. Do some
* constant folding before pan_nir_lower_store_component below.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, pan_nir_lower_store_component);
}
/* Could be eventually useful for Vulkan, but we don't expect it to have
@@ -2989,6 +2982,12 @@ midgard_compile_shader_nir(nir_shader *nir,
pan_nir_collect_noperspective_varyings_fs(nir);
}
if (nir->info.stage == MESA_SHADER_VERTEX) {
NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
inputs->varying_layout, false /* has_idvs */,
false /* has_extended_fifo */);
}
/* Optimisation passes */
optimise_nir(nir, ctx->quirks, inputs->is_blend);


@@ -47,8 +47,6 @@ pan_nir_tile_default_coverage(nir_builder *b)
bool pan_nir_lower_bool_to_bitsize(nir_shader *shader);
bool pan_nir_lower_store_component(nir_shader *shader);
bool pan_nir_lower_vertex_id(nir_shader *shader);
bool pan_nir_lower_image_ms(nir_shader *shader);


@@ -1,86 +0,0 @@
/*
* Copyright (C) 2020-2022 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#include "compiler/nir/nir_builder.h"
#include "pan_nir.h"
/*
* If the shader packs multiple varyings into the same location with different
* location_frac, we'll need to lower to a single varying store that collects
* all of the channels together. This is because the varying instruction on
* Midgard and Bifrost is slot-based, writing out an entire vec4 slot at a time.
*
* NOTE: this expects all stores to be outside of control flow, and with
* constant offsets. It should be run after nir_lower_io_vars_to_temporaries.
*/
static bool
lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_store_output &&
intr->intrinsic != nir_intrinsic_store_per_view_output)
return false;
struct hash_table_u64 *slots = data;
unsigned component = nir_intrinsic_component(intr);
nir_src *slot_src = nir_get_io_offset_src(intr);
uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
if (intr->intrinsic == nir_intrinsic_store_per_view_output) {
uint64_t view_index = nir_src_as_uint(intr->src[1]);
slot |= view_index << 32;
}
nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
nir_def *value = intr->src[0].ssa;
b->cursor = nir_before_instr(&intr->instr);
nir_def *undef = nir_undef(b, 1, value->bit_size);
nir_def *channels[4] = {undef, undef, undef, undef};
/* Copy old */
u_foreach_bit(i, mask) {
assert(prev != NULL);
nir_def *prev_ssa = prev->src[0].ssa;
channels[i] = nir_channel(b, prev_ssa, i);
}
/* Copy new */
unsigned new_mask = nir_intrinsic_write_mask(intr);
mask |= (new_mask << component);
u_foreach_bit(i, new_mask) {
assert(component + i < 4);
channels[component + i] = nir_channel(b, value, i);
}
intr->num_components = util_last_bit(mask);
nir_src_rewrite(&intr->src[0], nir_vec(b, channels, intr->num_components));
nir_intrinsic_set_component(intr, 0);
nir_intrinsic_set_write_mask(intr, mask);
if (prev) {
_mesa_hash_table_u64_remove(slots, slot);
nir_instr_remove(&prev->instr);
}
_mesa_hash_table_u64_insert(slots, slot, intr);
return true;
}
bool
pan_nir_lower_store_component(nir_shader *s)
{
assert(s->info.stage == MESA_SHADER_VERTEX);
struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL);
bool progress = nir_shader_intrinsics_pass(
s, lower_store_component,
nir_metadata_control_flow, stores);
_mesa_hash_table_u64_destroy(stores);
return progress;
}
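For reference, the pass removed above merged multiple partial stores to
the same vec4 slot into a single slot-wide store.  Schematically (an
illustrative example, not taken from a real shader trace):

    /* before: two stores packed into the same location */
    store_output(ssa_x,  offset=0, base=1, component=0, wrmask=0x1)
    store_output(ssa_yz, offset=0, base=1, component=1, wrmask=0x3)

    /* after: one slot-wide store, component 0, holes filled with undef */
    store_output(vec3(ssa_x, ssa_yz.x, ssa_yz.y),
                 offset=0, base=1, component=0, wrmask=0x7)

With pan_nir_lower_vs_outputs now accumulating channels into local
variables through write masks, this separate merging step is no longer
needed.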


@@ -1,5 +1,5 @@
/*
* Copyright (C) 2025 Collabora, Ltd.
* Copyright (C) 2025-2026 Collabora, Ltd.
* SPDX-License-Identifier: MIT
*/
@@ -7,21 +7,30 @@
#include "nir_builder.h"
#include "panfrost/model/pan_model.h"
#include "bifrost/valhall/valhall.h"
#include "util/bitscan.h"
struct lower_vs_outputs_ctx {
unsigned arch;
const struct pan_varying_layout *varying_layout;
bool has_idvs;
bool has_extended_fifo;
nir_variable *variables[PAN_MAX_VARYINGS];
uint8_t per_view_written[PAN_MAX_VARYINGS];
unsigned used_buckets;
};
static void
build_attr_buf_write(struct nir_builder *b, nir_def *data,
const struct pan_varying_slot *slot, uint32_t view_index,
build_attr_buf_write(struct nir_builder *b, nir_def *data, uint32_t idx,
uint32_t view_index,
const struct lower_vs_outputs_ctx *ctx)
{
/* We need the precise memory layout */
pan_varying_layout_require_layout(ctx->varying_layout);
const struct pan_varying_slot *slot =
pan_varying_layout_slot_at(ctx->varying_layout, idx);
assert(slot);
nir_def *index = nir_load_idvs_output_buf_index_pan(b);
@@ -78,61 +87,146 @@ build_attr_desc_write(struct nir_builder *b, nir_def *data, uint32_t base,
nir_store_global_cvt_pan(b, data, addr, cvt, .src_type = src_type);
}
static bool
lower_vs_output_store(struct nir_builder *b,
nir_intrinsic_instr *store, void *cb_data)
static void
build_store_output(struct nir_builder *b, nir_def *data, uint32_t idx,
nir_alu_type src_type,
const struct lower_vs_outputs_ctx *ctx)
{
const struct lower_vs_outputs_ctx *ctx = cb_data;
pan_varying_layout_require_format(ctx->varying_layout);
if (store->intrinsic != nir_intrinsic_store_output &&
store->intrinsic != nir_intrinsic_store_per_view_output)
return false;
const struct pan_varying_slot *slot =
pan_varying_layout_slot_at(ctx->varying_layout, idx);
assert(slot);
b->cursor = nir_instr_remove(&store->instr);
nir_io_semantics sem = {
.location = slot->location,
.num_slots = 1,
};
nir_io_semantics sem = nir_intrinsic_io_semantics(store);
nir_alu_type src_type = nir_intrinsic_src_type(store);
unsigned src_bit_size = nir_alu_type_get_type_size(src_type);
nir_store_output(b, data, nir_imm_int(b, 0), .base = idx, .range = 1,
.write_mask = BITFIELD_MASK(slot->ncomps), .component = 0,
.src_type = src_type, .io_semantics = sem);
}
static void
lower_vs_output_store(struct nir_builder *b,
unsigned view_index,
unsigned idx, const struct lower_vs_outputs_ctx *ctx)
{
nir_variable *var = ctx->variables[idx];
assert(var != NULL);
nir_alu_type src_type =
nir_get_nir_type_for_glsl_type(glsl_without_array(var->type));
bool is_per_view = glsl_type_is_array(var->type);
assert(is_per_view || view_index == 0);
nir_def *data = is_per_view ? nir_load_array_var_imm(b, var, view_index)
: nir_load_var(b, var);
if (ctx->arch >= 9 && ctx->has_idvs) {
build_attr_buf_write(b, data, idx, view_index, ctx);
} else if (ctx->arch >= 6) {
assert(!is_per_view);
build_attr_desc_write(b, data, idx, src_type, ctx);
} else {
assert(!is_per_view);
build_store_output(b, data, idx, src_type, ctx);
}
}
static nir_variable *
get_or_create_var(nir_builder *b, struct lower_vs_outputs_ctx *ctx,
nir_intrinsic_instr *intr)
{
assert(intr->intrinsic == nir_intrinsic_store_output ||
intr->intrinsic == nir_intrinsic_store_per_view_output);
bool is_per_view = intr->intrinsic == nir_intrinsic_store_per_view_output;
ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
unsigned slot_idx = nir_intrinsic_base(intr);
nir_alu_type src_type = nir_intrinsic_src_type(intr);
enum glsl_base_type base_type = nir_get_glsl_base_type_for_nir_type(src_type);
/* Indirect array varyings are not yet supported (num_slots > 1) */
assert(sem.num_slots == 1);
assert(nir_src_as_uint(*nir_get_io_offset_src(store)) == 0);
assert(nir_src_as_uint(*nir_get_io_offset_src(intr)) == 0);
/* We need the slot section for cache hints */
nir_variable *var = ctx->variables[slot_idx];
if (var != NULL) {
/* All stores should agree per-location */
assert(glsl_type_is_array(var->type) == is_per_view);
assert(glsl_get_base_type(glsl_without_array(var->type)) == base_type);
return var;
}
/* We need the slot section for the number of components */
pan_varying_layout_require_format(ctx->varying_layout);
const struct pan_varying_slot *slot =
pan_varying_layout_find_slot(ctx->varying_layout, sem.location);
pan_varying_layout_slot_at(ctx->varying_layout, slot_idx);
/* Special slots are read only */
assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL);
/* From v9, IO is resized to the real size of the slot */
assert(ctx->arch < 9 ||
src_bit_size == nir_alu_type_get_type_size(slot->alu_type));
assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL &&
slot->location == sem.location);
nir_def *data = store->src[0].ssa;
assert(src_bit_size == data->bit_size);
const glsl_type *var_type = glsl_vector_type(base_type, slot->ncomps);
if (is_per_view)
var_type = glsl_array_type(var_type, PAN_MAX_MULTIVIEW_VIEW_COUNT, false);
/* Trim the input so we don't write extra channels at the end. In effect,
* we fill in all the intermediate "holes" in the write mask, since we
* can't mask off stores. Since nir_lower_io_vars_to_temporaries ensures
* each varying is written at most once, anything that's masked out is
* undefined, so it doesn't matter what we write there. So we may as well
* do the simplest thing possible.
var = nir_local_variable_create(b->impl, var_type, "vs_out_tmp");
ctx->variables[slot_idx] = var;
return var;
}
static bool
gather_vs_outputs(struct nir_builder *b,
nir_intrinsic_instr *intr, void *cb_data)
{
struct lower_vs_outputs_ctx *ctx = cb_data;
if (intr->intrinsic != nir_intrinsic_store_output &&
intr->intrinsic != nir_intrinsic_store_per_view_output)
return false;
unsigned mask = nir_intrinsic_write_mask(intr);
unsigned slot_idx = nir_intrinsic_base(intr);
unsigned component = nir_intrinsic_component(intr);
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
bool is_per_view = intr->intrinsic == nir_intrinsic_store_per_view_output;
unsigned view_index = is_per_view ? nir_src_as_uint(intr->src[1]) : 0;
ctx->per_view_written[slot_idx] |= BITFIELD_BIT(view_index);
ctx->used_buckets |= BITFIELD_BIT(va_shader_output_from_loc(sem.location));
b->cursor = nir_instr_remove(&intr->instr);
nir_variable *var = get_or_create_var(b, ctx, intr);
unsigned ncomps = glsl_get_vector_elements(glsl_without_array(var->type));
/* store_output semantics differ from store_var when we write a subset of
* the components, we need to re-swizzle the channels manually.
*/
const nir_component_mask_t write_mask = nir_intrinsic_write_mask(store);
data = nir_trim_vector(b, data, util_last_bit(write_mask));
nir_def *stored = intr->src[0].ssa;
if (stored->num_components != ncomps) {
nir_def *undef = nir_undef(b, 1, stored->bit_size);
assert(ncomps <= 4);
nir_def *channels[4] = {undef, undef, undef, undef};
if (ctx->arch >= 9 && ctx->has_idvs) {
uint32_t view_index = 0;
if (store->intrinsic == nir_intrinsic_store_per_view_output)
view_index = nir_src_as_uint(store->src[1]);
for (unsigned i = 0; i < stored->num_components; i++) {
assert(component + i < 4);
channels[component + i] = nir_channel(b, stored, i);
}
build_attr_buf_write(b, data, slot, view_index, ctx);
stored = nir_vec(b, channels, ncomps);
mask = mask << component; /* mask was relative to intr->component */
} else {
uint32_t base = nir_intrinsic_base(store);
assert(store->intrinsic != nir_intrinsic_store_per_view_output);
build_attr_desc_write(b, data, base, src_type, ctx);
assert(component == 0);
}
if (is_per_view)
nir_store_array_var_imm(b, var, view_index, stored, mask);
else
nir_store_var(b, var, stored, mask);
return true;
}
@@ -143,14 +237,62 @@ pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
bool has_extended_fifo)
{
assert(shader->info.stage == MESA_SHADER_VERTEX);
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
const struct lower_vs_outputs_ctx ctx = {
struct lower_vs_outputs_ctx ctx = {
.arch = pan_arch(gpu_id),
.varying_layout = varying_layout,
.has_idvs = has_idvs,
.has_extended_fifo = has_extended_fifo,
.variables = {NULL, },
.per_view_written = {0, },
.used_buckets = 0,
};
return nir_shader_intrinsics_pass(shader, lower_vs_output_store,
nir_metadata_control_flow,
(void *)&ctx);
/* We use uint8 as a viewcount bitmask */
assert(PAN_MAX_MULTIVIEW_VIEW_COUNT <= 8 * sizeof(ctx.per_view_written[0]));
bool progress = nir_shader_intrinsics_pass(shader, gather_vs_outputs,
nir_metadata_control_flow,
(void *)&ctx);
if (!progress)
return false;
nir_builder builder = nir_builder_at(nir_after_impl(impl));
nir_builder *b = &builder;
nir_def *shader_output = has_idvs ? nir_load_shader_output_pan(b) : NULL;
for (int out = 0; out < VA_SHADER_OUTPUT_COUNT; out++) {
/* Avoid looping a lot and adding ifs (in case of IDVS) */
if (!(ctx.used_buckets & BITFIELD_BIT(out)))
continue;
nir_if *nif = NULL;
if (has_idvs) {
nir_def *should_write =
nir_i2b(b, nir_iand_imm(b, shader_output, BITFIELD_BIT(out)));
nif = nir_push_if(b, should_write);
}
for (unsigned idx = 0; idx < varying_layout->count; idx++) {
nir_variable *var = ctx.variables[idx];
if (var == NULL)
continue;
const struct pan_varying_slot *slot =
pan_varying_layout_slot_at(varying_layout, idx);
if (slot == NULL || va_shader_output_from_loc(slot->location) != out)
continue;
/* Non-multiview stores are treated as writing just view index 0 */
unsigned view_mask = ctx.per_view_written[idx];
assert(view_mask != 0);
u_foreach_bit(view_index, view_mask)
lower_vs_output_store(b, view_index, idx, &ctx);
}
if (nif)
nir_pop_if(b, nif);
}
return nir_progress(true, impl, nir_metadata_none);
}