From f0e1ccc8d490ded39b7f248fb9d564e167422b92 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 3 Mar 2024 10:09:08 -0400 Subject: [PATCH] asahi: rewrite varying linking Lower store_output to store_uvs_agx + math. Link UVS indices at draw-time instead of compile-time to get efficient separate shaders. Also picks up varying compaction along the way. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/compiler/agx_compile.c | 153 +---------- src/asahi/compiler/agx_compile.h | 60 ----- .../compiler/agx_nir_lower_clip_distance.c | 42 --- src/asahi/compiler/agx_nir_lower_layer.c | 70 ----- src/asahi/compiler/meson.build | 2 - src/asahi/lib/agx_nir_lower_uvs.c | 251 ++++++++++++++++++ src/asahi/lib/agx_nir_passes.h | 3 + src/asahi/lib/agx_uvs.h | 79 ++++++ src/asahi/lib/meson.build | 1 + src/gallium/drivers/asahi/agx_disk_cache.c | 2 + .../drivers/asahi/agx_nir_lower_sysvals.c | 3 + src/gallium/drivers/asahi/agx_state.c | 227 ++++++---------- src/gallium/drivers/asahi/agx_state.h | 17 +- 13 files changed, 436 insertions(+), 474 deletions(-) delete mode 100644 src/asahi/compiler/agx_nir_lower_clip_distance.c delete mode 100644 src/asahi/compiler/agx_nir_lower_layer.c create mode 100644 src/asahi/lib/agx_nir_lower_uvs.c create mode 100644 src/asahi/lib/agx_uvs.h diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c index de45d05815a..730cb1f4b01 100644 --- a/src/asahi/compiler/agx_compile.c +++ b/src/asahi/compiler/agx_compile.c @@ -578,39 +578,6 @@ agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr) agx_emit_cached_split(b, dest, components); } -static agx_instr * -agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr) -{ - nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - nir_src *offset = nir_get_io_offset_src(instr); - assert(nir_src_is_const(*offset) && "todo: indirects"); - - unsigned imm_index = b->shader->out->varyings.vs.slots[sem.location]; - - if (sem.location == VARYING_SLOT_LAYER || - sem.location == VARYING_SLOT_CLIP_DIST0) { - /* Separate slots used for the sysval vs the varying. The default slot - * above is for the varying. Change for the sysval. - */ - assert(sem.no_sysval_output || sem.no_varying); - - if (sem.no_varying) { - imm_index = sem.location == VARYING_SLOT_LAYER - ? b->shader->out->varyings.vs.layer_viewport_slot - : b->shader->out->varyings.vs.clip_dist_slot; - } - } - - assert(imm_index < ~0); - imm_index += (nir_src_as_uint(*offset) * 4) + nir_intrinsic_component(instr); - - /* nir_lower_io_to_scalar */ - assert(nir_intrinsic_write_mask(instr) == 0x1); - - return agx_st_vary(b, agx_immediate(imm_index), - agx_src_index(&instr->src[0])); -} - static agx_instr * agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr) { @@ -1210,9 +1177,10 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr) agx_emit_load(b, dst, instr); return NULL; - case nir_intrinsic_store_output: + case nir_intrinsic_store_uvs_agx: assert(stage == MESA_SHADER_VERTEX); - return agx_emit_store_vary(b, instr); + return agx_st_vary(b, agx_src_index(&instr->src[1]), + agx_src_index(&instr->src[0])); case nir_intrinsic_store_agx: agx_emit_store(b, instr); @@ -2667,96 +2635,6 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size) NIR_PASS(_, nir, nir_lower_phis_to_scalar, true); } -/* ABI: position first, then user, then psiz */ -static void -agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings, - struct agx_shader_key *key) -{ - unsigned base = 0; - - /* Initialize to "nothing is written" */ - for (unsigned i = 0; i < ARRAY_SIZE(varyings->slots); ++i) - varyings->slots[i] = ~0; - - /* gl_Position is implicitly written, although it may validly be absent in - * vertex programs run only for transform feedback. Those ignore their - * varyings so it doesn't matter what we do here as long as we don't fail. - */ - varyings->slots[VARYING_SLOT_POS] = base; - base += 4; - - /* These are always flat-shaded from the FS perspective */ - key->vs.outputs_flat_shaded |= VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; - - /* The internal cull distance slots are always linearly-interpolated */ - key->vs.outputs_linear_shaded |= - BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE, 2); - - assert(!(key->vs.outputs_flat_shaded & key->vs.outputs_linear_shaded)); - - /* Smooth 32-bit user bindings go next */ - u_foreach_bit64(loc, nir->info.outputs_written & - ~key->vs.outputs_flat_shaded & - ~key->vs.outputs_linear_shaded) { - if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) - continue; - - assert(loc < ARRAY_SIZE(varyings->slots)); - varyings->slots[loc] = base; - base += 4; - varyings->num_32_smooth += 4; - } - - /* Flat 32-bit user bindings go next */ - u_foreach_bit64(loc, - nir->info.outputs_written & key->vs.outputs_flat_shaded) { - if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) - continue; - - assert(loc < ARRAY_SIZE(varyings->slots)); - varyings->slots[loc] = base; - base += 4; - varyings->num_32_flat += 4; - } - - /* Linear 32-bit user bindings go next */ - u_foreach_bit64(loc, - nir->info.outputs_written & key->vs.outputs_linear_shaded) { - if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) - continue; - - assert(loc < ARRAY_SIZE(varyings->slots)); - varyings->slots[loc] = base; - base += 4; - varyings->num_32_linear += 4; - } - - /* TODO: Link FP16 varyings */ - varyings->base_index_fp16 = base; - varyings->num_16_smooth = 0; - varyings->num_16_flat = 0; - varyings->num_16_linear = 0; - - if (nir->info.outputs_written & VARYING_BIT_PSIZ) { - varyings->slots[VARYING_SLOT_PSIZ] = base; - base += 1; - } - - if (nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) { - varyings->layer_viewport_slot = base; - base += 1; - } - - if (nir->info.outputs_written & VARYING_BIT_CLIP_DIST0) { - varyings->clip_dist_slot = base; - varyings->nr_clip_dists = nir->info.clip_distance_array_size; - base += varyings->nr_clip_dists; - } - - /* All varyings linked now */ - varyings->nr_index = base; -} - /* * Varyings that are used as texture coordinates should be kept at fp32, because * fp16 does not have enough precision for large textures. It's technically @@ -3188,10 +3066,6 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, if (nir->info.stage == MESA_SHADER_FRAGMENT) out->tag_write_disable = !nir->info.writes_memory; - if (nir->info.stage == MESA_SHADER_VERTEX && - (nir->info.outputs_written & VARYING_BIT_CLIP_DIST0)) - NIR_PASS(_, nir, agx_nir_lower_clip_distance); - bool needs_libagx = true /* TODO: Optimize */; if (nir->info.stage == MESA_SHADER_FRAGMENT) @@ -3238,19 +3112,6 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, /* Late VBO lowering creates constant udiv instructions */ NIR_PASS(_, nir, nir_opt_idiv_const, 16); - /* Varying output is scalar, other I/O is vector. Lowered late because - * transform feedback programs will use vector output. - */ - if (nir->info.stage == MESA_SHADER_VERTEX) { - NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); - - if (nir->info.outputs_written & - (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) { - - NIR_PASS(_, nir, agx_nir_lower_layer); - } - } - NIR_PASS(_, nir, nir_opt_constant_folding); NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle, nir_metadata_block_index | nir_metadata_dominance, NULL); @@ -3258,10 +3119,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, out->push_count = key->reserved_preamble; agx_optimize_nir(nir, &out->push_count); - /* Must be last since NIR passes can remap driver_location freely */ - if (nir->info.stage == MESA_SHADER_VERTEX) - agx_remap_varyings_vs(nir, &out->varyings.vs, key); - else if (nir->info.stage == MESA_SHADER_FRAGMENT) + if (nir->info.stage == MESA_SHADER_FRAGMENT) assign_coefficient_regs(nir, &out->varyings.fs); if (agx_should_dump(nir, AGX_DBG_SHADERS)) @@ -3284,9 +3142,6 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, } if (nir->info.stage == MESA_SHADER_VERTEX) { - out->writes_psiz = - nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ); - out->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT; out->writes_layer_viewport = diff --git a/src/asahi/compiler/agx_compile.h b/src/asahi/compiler/agx_compile.h index 39064cedcb3..811c1cb4407 100644 --- a/src/asahi/compiler/agx_compile.h +++ b/src/asahi/compiler/agx_compile.h @@ -9,50 +9,6 @@ #include "util/u_dynarray.h" #include "shader_enums.h" -struct agx_varyings_vs { - /* The number of user varyings of each type. The varyings must be allocated - * in this order ({smooth, flat, linear} × {32, 16}), which may require - * remapping. - */ - unsigned num_32_smooth; - unsigned num_32_flat; - unsigned num_32_linear; - unsigned num_16_smooth; - unsigned num_16_flat; - unsigned num_16_linear; - - /* The first index used for FP16 varyings. Indices less than this are treated - * as FP32. This may require remapping slots to guarantee. - */ - unsigned base_index_fp16; - - /* The total number of vertex shader indices output. Must be at least - * base_index_fp16. - */ - unsigned nr_index; - - /* If the slot is written, this is the base index that the first component - * of the slot is written to. The next components are found in the next - * indices. If less than base_index_fp16, this is a 32-bit slot (with 4 - * indices for the 4 components), else this is a 16-bit slot (with 2 - * indices for the 4 components). This must be less than nr_index. - * - * If the slot is not written, this must be ~0. - */ - unsigned slots[VARYING_SLOT_MAX]; - - /* Slot for the combined layer/viewport 32-bit sysval output, or ~0 if none - * is written. What's at slots[VARYING_SLOT_LAYER] is the varying output. - */ - unsigned layer_viewport_slot; - - /* Base slot for the clip distance sysval outputs, or ~0 if none is written. - * What's at slots[VARYING_SLOT_CLIP_DIST0] is the varying output. - */ - unsigned clip_dist_slot; - unsigned nr_clip_dists; -}; - struct agx_cf_binding { /* Base coefficient register */ unsigned cf_base; @@ -96,7 +52,6 @@ struct agx_varyings_fs { }; union agx_varyings { - struct agx_varyings_vs vs; struct agx_varyings_fs fs; }; @@ -127,9 +82,6 @@ struct agx_shader_info { /* Does the shader read the tilebuffer? */ bool reads_tib; - /* Does the shader write point size? */ - bool writes_psiz; - /* Does the shader potentially draw to a nonzero viewport? */ bool nonzero_viewport; @@ -195,17 +147,6 @@ enum agx_format { AGX_NUM_FORMATS, }; -struct agx_vs_shader_key { - /* The GPU ABI requires all smooth shaded varyings to come first, then all - * flat shaded varyings, then all linear shaded varyings, as written by the - * VS. In order to correctly remap the varyings into the right order in the - * VS, we need to propagate the mask of flat/linear shaded varyings into the - * compiler. - */ - uint64_t outputs_flat_shaded; - uint64_t outputs_linear_shaded; -}; - struct agx_fs_shader_key { /* Normally, access to the tilebuffer must be guarded by appropriate fencing * instructions to ensure correct results in the presence of out-of-order @@ -246,7 +187,6 @@ struct agx_shader_key { bool promote_constants; union { - struct agx_vs_shader_key vs; struct agx_fs_shader_key fs; }; }; diff --git a/src/asahi/compiler/agx_nir_lower_clip_distance.c b/src/asahi/compiler/agx_nir_lower_clip_distance.c deleted file mode 100644 index 589a1b48ed9..00000000000 --- a/src/asahi/compiler/agx_nir_lower_clip_distance.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2023 Valve Corporation - * SPDX-License-Identifier: MIT - */ - -#include "compiler/nir/nir.h" -#include "compiler/nir/nir_builder.h" -#include "agx_nir.h" - -static bool -lower(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) -{ - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location != VARYING_SLOT_CLIP_DIST0) - return false; - - nir_instr *clone = nir_instr_clone(b->shader, &intr->instr); - nir_intrinsic_instr *lowered = nir_instr_as_intrinsic(clone); - - b->cursor = nir_after_instr(&intr->instr); - nir_builder_instr_insert(b, clone); - - nir_io_semantics new_sem = sem; - new_sem.no_varying = true; - nir_intrinsic_set_io_semantics(lowered, new_sem); - - sem.no_sysval_output = true; - nir_intrinsic_set_io_semantics(intr, sem); - return true; -} - -bool -agx_nir_lower_clip_distance(nir_shader *s) -{ - assert(s->info.outputs_written & VARYING_BIT_CLIP_DIST0); - - return nir_shader_intrinsics_pass( - s, lower, nir_metadata_block_index | nir_metadata_dominance, NULL); -} diff --git a/src/asahi/compiler/agx_nir_lower_layer.c b/src/asahi/compiler/agx_nir_lower_layer.c deleted file mode 100644 index 646eb73ab1f..00000000000 --- a/src/asahi/compiler/agx_nir_lower_layer.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2023 Valve Corporation - * SPDX-License-Identifier: MIT - */ - -#include "compiler/nir/nir.h" -#include "compiler/nir/nir_builder.h" -#include "agx_nir.h" - -bool -agx_nir_lower_layer(nir_shader *s) -{ - assert(s->info.stage == MESA_SHADER_VERTEX); - assert(s->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)); - - /* Writes are in the last block, search */ - nir_function_impl *impl = nir_shader_get_entrypoint(s); - nir_block *last = nir_impl_last_block(impl); - - nir_def *layer = NULL, *viewport = NULL; - nir_cursor last_cursor; - - nir_foreach_instr(instr, last) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr); - if (store->intrinsic != nir_intrinsic_store_output) - continue; - - nir_io_semantics sem = nir_intrinsic_io_semantics(store); - nir_def *value = store->src[0].ssa; - - if (sem.location == VARYING_SLOT_LAYER) { - assert(layer == NULL && "only written once"); - layer = value; - } else if (sem.location == VARYING_SLOT_VIEWPORT) { - assert(viewport == NULL && "only written once"); - viewport = value; - } else { - continue; - } - - last_cursor = nir_after_instr(&store->instr); - - /* Leave the store as a varying-only, no sysval output */ - sem.no_sysval_output = true; - nir_intrinsic_set_io_semantics(store, sem); - } - - assert((layer || viewport) && "metadata inconsistent with program"); - - /* Pack together and write out */ - nir_builder b = nir_builder_at(last_cursor); - - nir_def *zero = nir_imm_intN_t(&b, 0, 16); - nir_def *packed = - nir_pack_32_2x16_split(&b, layer ? nir_u2u16(&b, layer) : zero, - viewport ? nir_u2u16(&b, viewport) : zero); - - /* Written with a sysval-only store, no varying output */ - nir_store_output(&b, packed, nir_imm_int(&b, 0), - .io_semantics.location = VARYING_SLOT_LAYER, - .io_semantics.num_slots = 1, - .io_semantics.no_varying = true); - - nir_metadata_preserve(impl, - nir_metadata_dominance | nir_metadata_block_index); - return true; -} diff --git a/src/asahi/compiler/meson.build b/src/asahi/compiler/meson.build index 172455db223..18c99041fe8 100644 --- a/src/asahi/compiler/meson.build +++ b/src/asahi/compiler/meson.build @@ -8,13 +8,11 @@ libasahi_agx_files = files( 'agx_liveness.c', 'agx_insert_waits.c', 'agx_nir_lower_address.c', - 'agx_nir_lower_clip_distance.c', 'agx_nir_lower_cull_distance.c', 'agx_nir_lower_frag_sidefx.c', 'agx_nir_lower_sample_mask.c', 'agx_nir_lower_discard_zs_emit.c', 'agx_nir_lower_interpolation.c', - 'agx_nir_lower_layer.c', 'agx_nir_lower_shared_bitsize.c', 'agx_nir_lower_subgroups.c', 'agx_nir_opt_preamble.c', diff --git a/src/asahi/lib/agx_nir_lower_uvs.c b/src/asahi/lib/agx_nir_lower_uvs.c new file mode 100644 index 00000000000..a3edb051188 --- /dev/null +++ b/src/asahi/lib/agx_nir_lower_uvs.c @@ -0,0 +1,251 @@ +/* + * Copyright 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "util/bitscan.h" +#include "util/macros.h" +#include "agx_compile.h" +#include "agx_pack.h" +#include "agx_uvs.h" +#include "nir_builder_opcodes.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "shader_enums.h" + +struct ctx { + nir_def *layer, *viewport; + nir_cursor after_layer_viewport; + struct agx_unlinked_uvs_layout *layout; +}; + +static enum uvs_group +group_for_varying(gl_varying_slot loc) +{ + switch (loc) { + case VARYING_SLOT_POS: + return UVS_POSITION; + case VARYING_SLOT_PSIZ: + return UVS_PSIZ; + default: + return UVS_VARYINGS; + } +} + +static bool +lower(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct ctx *ctx = data; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + b->cursor = nir_instr_remove(&intr->instr); + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + unsigned component = nir_intrinsic_component(intr); + + nir_def *value = intr->src[0].ssa; + nir_def *offset = intr->src[1].ssa; + + /* If there is only 1 user varying, it is at the base of the varying section. + * This saves us an indirection on simple separate shaders. + */ + bool single_vary = util_is_power_of_two_nonzero64(ctx->layout->written); + enum uvs_group group = group_for_varying(sem.location); + + nir_def *base; + if ((group == UVS_VARYINGS) && !single_vary) + base = nir_load_uvs_index_agx(b, .io_semantics = sem); + else + base = nir_imm_intN_t(b, ctx->layout->group_offs[group], 16); + + nir_def *index = nir_iadd(b, nir_iadd_imm(b, base, component), + nir_imul_imm(b, nir_u2u16(b, offset), 4)); + + nir_intrinsic_instr *new_store = nir_store_uvs_agx(b, value, index); + + /* Insert clip distance sysval writes, and gather layer/viewport writes so we + * can accumulate their system value. These are still lowered like normal to + * write them for the varying FS input. + */ + if (sem.location == VARYING_SLOT_LAYER) { + assert(ctx->layer == NULL && "only written once"); + ctx->layer = value; + ctx->after_layer_viewport = nir_after_instr(&new_store->instr); + } else if (sem.location == VARYING_SLOT_VIEWPORT) { + assert(ctx->viewport == NULL && "only written once"); + ctx->viewport = value; + ctx->after_layer_viewport = nir_after_instr(&new_store->instr); + } else if (sem.location == VARYING_SLOT_CLIP_DIST0) { + unsigned clip_base = ctx->layout->group_offs[UVS_CLIP_DIST]; + nir_def *index = nir_iadd_imm(b, nir_imul_imm(b, nir_u2u16(b, offset), 4), + clip_base + component); + + nir_store_uvs_agx(b, value, index); + } + + return true; +} + +static void +write_layer_viewport_sysval(struct ctx *ctx) +{ + nir_builder b = nir_builder_at(ctx->after_layer_viewport); + + nir_def *zero = nir_imm_intN_t(&b, 0, 16); + nir_def *layer = ctx->layer ? nir_u2u16(&b, ctx->layer) : zero; + nir_def *viewport = ctx->viewport ? nir_u2u16(&b, ctx->viewport) : zero; + + nir_store_uvs_agx( + &b, nir_pack_32_2x16_split(&b, layer, viewport), + nir_imm_int(&b, ctx->layout->group_offs[UVS_LAYER_VIEWPORT])); +} + +static bool +gather_components(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct agx_unlinked_uvs_layout *layout = data; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + unsigned component = nir_intrinsic_component(intr); + + if (nir_src_is_const(intr->src[1])) { + unsigned loc = sem.location + nir_src_as_uint(intr->src[1]); + layout->components[loc] = MAX2(layout->components[loc], component + 1); + } else { + for (unsigned i = 0; i < sem.num_slots; ++i) { + layout->components[sem.location + i] = 4; + } + } + + return false; +} + +bool +agx_nir_lower_uvs(nir_shader *s, struct agx_unlinked_uvs_layout *layout) +{ + bool progress = false; + + /* Scalarize up front so we can ignore vectors later */ + NIR_PASS(progress, s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, + NULL); + + /* Determine the unlinked UVS layout */ + NIR_PASS(progress, s, nir_shader_intrinsics_pass, gather_components, + nir_metadata_block_index | nir_metadata_dominance, layout); + + unsigned sizes[UVS_NUM_GROUP] = { + [UVS_POSITION] = 4, + [UVS_PSIZ] = !!(s->info.outputs_written & VARYING_BIT_PSIZ), + [UVS_LAYER_VIEWPORT] = !!(s->info.outputs_written & + (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)), + [UVS_CLIP_DIST] = s->info.clip_distance_array_size, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(layout->components); ++i) { + if (i != VARYING_SLOT_POS && i != VARYING_SLOT_PSIZ && + layout->components[i]) { + + layout->written |= BITFIELD64_BIT(i); + sizes[UVS_VARYINGS] += layout->components[i]; + } + } + + unsigned offs = 0; + for (enum uvs_group g = 0; g < UVS_NUM_GROUP; ++g) { + layout->group_offs[g] = offs; + offs += sizes[g]; + } + + layout->size = offs; + layout->user_size = sizes[UVS_VARYINGS]; + + /* Now lower in terms of the unlinked layout */ + struct ctx ctx = {.layout = layout}; + NIR_PASS(progress, s, nir_shader_intrinsics_pass, lower, + nir_metadata_block_index | nir_metadata_dominance, &ctx); + + if (ctx.layer || ctx.viewport) { + write_layer_viewport_sysval(&ctx); + } + + /* Finally, pack what we can. It's much cheaper to do this at compile-time + * than draw-time. + */ + agx_pack(&layout->osel, OUTPUT_SELECT, cfg) { + cfg.point_size = sizes[UVS_PSIZ]; + cfg.viewport_target = sizes[UVS_LAYER_VIEWPORT]; + cfg.render_target = cfg.viewport_target; + + cfg.clip_distance_plane_0 = sizes[UVS_CLIP_DIST] > 0; + cfg.clip_distance_plane_1 = sizes[UVS_CLIP_DIST] > 1; + cfg.clip_distance_plane_2 = sizes[UVS_CLIP_DIST] > 2; + cfg.clip_distance_plane_3 = sizes[UVS_CLIP_DIST] > 3; + cfg.clip_distance_plane_4 = sizes[UVS_CLIP_DIST] > 4; + cfg.clip_distance_plane_5 = sizes[UVS_CLIP_DIST] > 5; + cfg.clip_distance_plane_6 = sizes[UVS_CLIP_DIST] > 6; + cfg.clip_distance_plane_7 = sizes[UVS_CLIP_DIST] > 7; + } + + agx_pack(&layout->vdm, VDM_STATE_VERTEX_OUTPUTS, cfg) { + cfg.output_count_1 = offs; + cfg.output_count_2 = offs; + } + + return progress; +} + +void +agx_assign_uvs(struct agx_varyings_vs *varyings, + struct agx_unlinked_uvs_layout *layout, uint64_t flat_mask, + uint64_t linear_mask) +{ + *varyings = (struct agx_varyings_vs){0}; + + /* These are always flat-shaded from the FS perspective */ + flat_mask |= VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; + + /* The internal cull distance slots are always linearly-interpolated */ + linear_mask |= BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE, 2); + + assert(!(flat_mask & linear_mask)); + + /* TODO: Link FP16 varyings */ + unsigned num_32_smooth = 0, num_32_flat = 0, num_32_linear = 0; + struct { + uint32_t *num; + uint64_t mask; + } parts[] = { + {&num_32_smooth, ~flat_mask & ~linear_mask}, + {&num_32_flat, flat_mask}, + {&num_32_linear, linear_mask}, + }; + + unsigned base = layout->group_offs[UVS_VARYINGS]; + + for (unsigned p = 0; p < ARRAY_SIZE(parts); ++p) { + u_foreach_bit64(loc, parts[p].mask & layout->written) { + assert(loc < ARRAY_SIZE(varyings->slots)); + varyings->slots[loc] = base; + + base += layout->components[loc]; + (*parts[p].num) += layout->components[loc]; + } + } + + agx_pack(&varyings->counts_32, VARYING_COUNTS, cfg) { + cfg.smooth = num_32_smooth; + cfg.flat = num_32_flat; + cfg.linear = num_32_linear; + } + + agx_pack(&varyings->counts_16, VARYING_COUNTS, cfg) { + cfg.smooth = 0; + cfg.flat = 0; + cfg.linear = 0; + } +} diff --git a/src/asahi/lib/agx_nir_passes.h b/src/asahi/lib/agx_nir_passes.h index daa2a4929e8..1d2d4b9edaf 100644 --- a/src/asahi/lib/agx_nir_passes.h +++ b/src/asahi/lib/agx_nir_passes.h @@ -6,6 +6,9 @@ #pragma once #include +#include +#include "agx_pack.h" +#include "shader_enums.h" struct nir_shader; struct nir_instr; diff --git a/src/asahi/lib/agx_uvs.h b/src/asahi/lib/agx_uvs.h new file mode 100644 index 00000000000..424db3661bf --- /dev/null +++ b/src/asahi/lib/agx_uvs.h @@ -0,0 +1,79 @@ +/* + * Copyright 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include +#include "agx_pack.h" +#include "shader_enums.h" + +struct nir_shader; + +/* Matches the hardware order */ +enum uvs_group { + UVS_POSITION, + UVS_VARYINGS, + UVS_PSIZ, + UVS_LAYER_VIEWPORT, + UVS_CLIP_DIST, + UVS_NUM_GROUP, +}; + +/** + * Represents an "unlinked" UVS layout. This is computable from an unlinked + * vertex shader without knowing the associated fragment shader. The various UVS + * groups have fixed offsets, but the varyings within the varying group have + * indeterminate order since we don't yet know the fragment shader interpolation + * qualifiers. + */ +struct agx_unlinked_uvs_layout { + /* Offset of each group in the UVS in words. */ + uint8_t group_offs[UVS_NUM_GROUP]; + + /* Size of the UVS allocation in words. >= last group_offs element */ + uint8_t size; + + /* Size of the UVS_VARYINGS */ + uint8_t user_size; + + /* Number of 32-bit components written for each slot. TODO: Model 16-bit. + * + * Invariant: sum_{slot} (components[slot]) = + * group_offs[PSIZ] - group_offs[VARYINGS] + */ + uint8_t components[VARYING_SLOT_MAX]; + + /* Bit i set <===> components[i] != 0 && i != POS && i != PSIZ. For fast + * iteration of user varyings. + */ + uint64_t written; + + /* Fully packed data structure */ + struct agx_vdm_state_vertex_outputs_packed vdm; + + /* Partial data structure, must be merged with FS selects */ + struct agx_output_select_packed osel; +}; + +bool agx_nir_lower_uvs(struct nir_shader *s, + struct agx_unlinked_uvs_layout *layout); + +/** + * Represents a linked UVS layout. + */ +struct agx_varyings_vs { + /* Associated linked hardware data structures */ + struct agx_varying_counts_packed counts_32, counts_16; + + /* If the user varying slot is written, this is the base index that the first + * component of the slot is written to. The next components are found in the + * next indices. Otherwise 0, aliasing position. + */ + unsigned slots[VARYING_SLOT_MAX]; +}; + +void agx_assign_uvs(struct agx_varyings_vs *varyings, + struct agx_unlinked_uvs_layout *layout, uint64_t flat_mask, + uint64_t linear_mask); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index 3bf9590e415..047608b1c48 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -20,6 +20,7 @@ libasahi_lib_files = files( 'agx_nir_lower_tess.c', 'agx_nir_lower_texture.c', 'agx_nir_lower_tilebuffer.c', + 'agx_nir_lower_uvs.c', 'agx_nir_lower_vbo.c', 'agx_nir_predicate_layer_id.c', 'agx_ppp.h', diff --git a/src/gallium/drivers/asahi/agx_disk_cache.c b/src/gallium/drivers/asahi/agx_disk_cache.c index b396b5d957e..235ba1cfd74 100644 --- a/src/gallium/drivers/asahi/agx_disk_cache.c +++ b/src/gallium/drivers/asahi/agx_disk_cache.c @@ -63,6 +63,7 @@ write_shader(struct blob *blob, const struct agx_compiled_shader *binary, blob_write_uint32(blob, shader_size); blob_write_bytes(blob, binary->bo->ptr.cpu, shader_size); blob_write_bytes(blob, &binary->info, sizeof(binary->info)); + blob_write_bytes(blob, &binary->uvs, sizeof(binary->uvs)); blob_write_uint32(blob, binary->push_range_count); blob_write_bytes(blob, binary->push, sizeof(binary->push[0]) * binary->push_range_count); @@ -96,6 +97,7 @@ read_shader(struct agx_screen *screen, struct blob_reader *blob, blob_copy_bytes(blob, binary->bo->ptr.cpu, binary_size); blob_copy_bytes(blob, &binary->info, sizeof(binary->info)); + blob_copy_bytes(blob, &binary->uvs, sizeof(binary->uvs)); binary->push_range_count = blob_read_uint32(blob); blob_copy_bytes(blob, binary->push, sizeof(binary->push[0]) * binary->push_range_count); diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index 7f3147a4fb3..9c2bd4ef955 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -189,6 +189,9 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr, return load_sysval_root(b, 1, 16, &u->sprite_mask); case nir_intrinsic_load_clip_z_coeff_agx: return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff)); + case nir_intrinsic_load_uvs_index_agx: + return load_sysval_root( + b, 1, 16, &u->uvs_index[nir_intrinsic_io_semantics(intr).location]); case nir_intrinsic_load_polygon_stipple_agx: { nir_def *base = load_sysval_root(b, 1, 64, &u->polygon_stipple); nir_def *row = intr->src[0].ssa; diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 693ddfcbca9..4dd4ce59a47 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -1521,59 +1521,10 @@ asahi_cs_shader_key_equal(const void *a, const void *b) return true; } -static unsigned -agx_find_linked_slot(struct agx_varyings_vs *vs, struct agx_varyings_fs *fs, - gl_varying_slot slot, unsigned offset) -{ - assert(offset < 4); - assert(slot != VARYING_SLOT_PNTC && "point coords aren't linked"); - - if (slot == VARYING_SLOT_POS) { - if (offset == 3) { - return 0; /* W */ - } else if (offset == 2) { - assert(fs->reads_z); - return 1; /* Z */ - } else { - unreachable("gl_Position.xy are not varyings"); - } - } - - unsigned vs_index = vs->slots[slot]; - - /* Varyings not written by vertex shader are undefined but we can't crash */ - if (!(vs_index < vs->nr_index)) - return 0; - - assert(vs_index >= 4 && "gl_Position should have been the first 4 slots"); - assert((vs_index < vs->base_index_fp16) == - ((vs_index + offset) < vs->base_index_fp16) && - "a given varying must have a consistent type"); - - unsigned vs_user_index = (vs_index + offset) - 4; - - if (fs->reads_z) - return vs_user_index + 2; - else - return vs_user_index + 1; -} - -static unsigned -agx_num_general_outputs(struct agx_varyings_vs *vs) -{ - unsigned nr_vs = vs->nr_index; - bool writes_psiz = vs->slots[VARYING_SLOT_PSIZ] < nr_vs; - - assert(nr_vs >= 4 && "gl_Position must be written"); - if (writes_psiz) - assert(nr_vs >= 5 && "gl_PointSize is written"); - - return nr_vs - (writes_psiz ? 5 : 4); -} - static uint32_t agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs, - struct agx_varyings_fs *fs, bool first_provoking_vertex, + unsigned nr_user_indices, struct agx_varyings_fs *fs, + bool first_provoking_vertex, uint8_t sprite_coord_enable, bool *generate_primitive_id) { @@ -1586,11 +1537,14 @@ agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs, size_t linkage_size = AGX_CF_BINDING_HEADER_LENGTH + (fs->nr_bindings * AGX_CF_BINDING_LENGTH); - void *tmp = alloca(linkage_size); - struct agx_cf_binding_header_packed *header = tmp; + struct agx_ptr t = agx_pool_alloc_aligned(pool, linkage_size, 256); + assert(t.gpu < (1ull << 32) && "varyings must be in low memory"); + + struct agx_cf_binding_header_packed *header = t.cpu; struct agx_cf_binding_packed *bindings = (void *)(header + 1); - unsigned nr_slots = agx_num_general_outputs(vs) + 1 + (fs->reads_z ? 1 : 0); + unsigned user_base = 1 + (fs->reads_z ? 1 : 0); + unsigned nr_slots = user_base + nr_user_indices; agx_pack(header, CF_BINDING_HEADER, cfg) { cfg.number_of_32_bit_slots = nr_slots; @@ -1598,35 +1552,45 @@ agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs, } for (unsigned i = 0; i < fs->nr_bindings; ++i) { + struct agx_cf_binding b = fs->bindings[i]; + agx_pack(bindings + i, CF_BINDING, cfg) { - cfg.base_coefficient_register = fs->bindings[i].cf_base; - cfg.components = fs->bindings[i].count; + cfg.base_coefficient_register = b.cf_base; + cfg.components = b.count; cfg.shade_model = agx_translate_shade_model(fs, i, first_provoking_vertex); - if (util_varying_is_point_coord(fs->bindings[i].slot, - sprite_coord_enable)) { - assert(fs->bindings[i].offset == 0); + if (util_varying_is_point_coord(b.slot, sprite_coord_enable)) { + assert(b.offset == 0); cfg.source = AGX_COEFFICIENT_SOURCE_POINT_COORD; - } else if (fs->bindings[i].slot == VARYING_SLOT_PRIMITIVE_ID && - vs->slots[VARYING_SLOT_PRIMITIVE_ID] == ~0) { + } else if (b.slot == VARYING_SLOT_PRIMITIVE_ID && + !vs->slots[VARYING_SLOT_PRIMITIVE_ID]) { cfg.source = AGX_COEFFICIENT_SOURCE_PRIMITIVE_ID; *generate_primitive_id = true; - } else { - cfg.base_slot = agx_find_linked_slot(vs, fs, fs->bindings[i].slot, - fs->bindings[i].offset); + } else if (b.slot == VARYING_SLOT_POS) { + assert(b.offset >= 2 && "gl_Position.xy are not varyings"); + assert(fs->reads_z || b.offset != 2); - assert(cfg.base_slot + cfg.components <= - MAX2(nr_slots, cfg.components) && - "overflow slots"); - } - - if (fs->bindings[i].slot == VARYING_SLOT_POS) { - if (fs->bindings[i].offset == 2) { + if (b.offset == 2) { cfg.source = AGX_COEFFICIENT_SOURCE_FRAGCOORD_Z; + cfg.base_slot = 1; } else { - assert(!fs->bindings[i].perspective && - "W must not be perspective divided"); + assert(!b.perspective && "W must not be perspective divided"); + } + } else { + unsigned vs_index = vs->slots[b.slot]; + assert(b.offset < 4); + + /* Varyings not written by vertex shader are undefined but we can't + * crash */ + if (vs_index) { + assert(vs_index >= 4 && + "gl_Position should have been the first 4 slots"); + + cfg.base_slot = user_base + (vs_index - 4) + b.offset; + + assert(cfg.base_slot + cfg.components <= nr_slots && + "overflow slots"); } } @@ -1635,16 +1599,7 @@ agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs, } } - struct agx_ptr ptr = agx_pool_alloc_aligned(pool, (3 * linkage_size), 256); - assert(ptr.gpu < (1ull << 32) && "varyings must be in low memory"); - - /* I don't understand why the data structures are repeated thrice */ - for (unsigned i = 0; i < 3; ++i) { - memcpy(((uint8_t *)ptr.cpu) + (i * linkage_size), (uint8_t *)tmp, - linkage_size); - } - - return ptr.gpu; + return t.gpu; } /* Dynamic lowered I/O version of nir_lower_clip_halfz */ @@ -1859,6 +1814,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, perf_debug(dev, "Compiling shader variant #%u", _mesa_hash_table_num_entries(so->variants)); + struct agx_unlinked_uvs_layout uvs = {0}; bool force_translucent = false; if (nir->info.stage == MESA_SHADER_VERTEX) { @@ -1871,6 +1827,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, key->next.hw.fixed_point_size); NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1, nir_metadata_block_index | nir_metadata_dominance, NULL); + NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs); } else { NIR_PASS(_, nir, agx_nir_lower_sysvals, PIPE_SHADER_VERTEX, false); NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx, @@ -1993,21 +1950,11 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, struct agx_shader_key base_key = {0}; - if (nir->info.stage == MESA_SHADER_VERTEX) { - struct asahi_vs_shader_key *key = &key_->vs; - - if (key->hw) { - base_key.vs.outputs_flat_shaded = key_->vs.next.hw.outputs_flat_shaded; - - base_key.vs.outputs_linear_shaded = - key_->vs.next.hw.outputs_linear_shaded; - } - } - struct agx_compiled_shader *compiled = agx_compile_nir(dev, nir, &base_key, debug, so->type); compiled->so = so; + compiled->uvs = uvs; /* reads_tib => Translucent pass type */ compiled->info.reads_tib |= force_translucent; @@ -2039,13 +1986,14 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, NIR_PASS(_, gs_copy, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1, nir_metadata_block_index | nir_metadata_dominance, NULL); - base_key.vs.outputs_flat_shaded = key->outputs_flat_shaded; - base_key.vs.outputs_linear_shaded = key->outputs_linear_shaded; + struct agx_unlinked_uvs_layout uvs = {0}; + NIR_PASS(_, gs_copy, agx_nir_lower_uvs, &uvs); compiled->gs_copy = agx_compile_nir(dev, gs_copy, &base_key, debug, PIPE_SHADER_GEOMETRY); compiled->gs_copy->so = so; compiled->gs_copy->stage = so->type; + compiled->gs_copy->uvs = uvs; } compiled->gs_output_mode = gs_out_prim; @@ -2427,10 +2375,9 @@ agx_update_vs(struct agx_context *ctx, unsigned index_size_B) * * vb_mask, attributes, vertex_buffers: VERTEX * point_size_per_vertex: RS - * outputs_{flat,linear}_shaded: FS_PROG */ if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB | - AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)) || + AGX_DIRTY_RS | AGX_DIRTY_PRIM)) || ctx->stage[PIPE_SHADER_TESS_EVAL].dirty || ctx->stage[PIPE_SHADER_GEOMETRY].dirty || ctx->stage[PIPE_SHADER_TESS_EVAL].shader || @@ -2451,11 +2398,6 @@ agx_update_vs(struct agx_context *ctx, unsigned index_size_B) */ key.next.hw.fixed_point_size = !ctx->rast->base.point_size_per_vertex && rasterized_prim == MESA_PRIM_POINTS; - - key.next.hw.outputs_flat_shaded = - ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded; - key.next.hw.outputs_linear_shaded = - ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded; } else { key.next.sw.index_size_B = index_size_B; } @@ -2511,10 +2453,6 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info, /* TODO: Deduplicate */ .fixed_point_size = !ctx->rast->base.point_size_per_vertex && rasterized_prim == MESA_PRIM_POINTS, - .outputs_flat_shaded = - ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded, - .outputs_linear_shaded = - ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded, }; return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY, @@ -3564,8 +3502,9 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out) if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS) || IS_DIRTY(PRIM)) { + batch->varyings = agx_link_varyings_vs_fs( - &batch->pipeline_pool, &vs->info.varyings.vs, + &batch->pipeline_pool, &batch->linked_varyings, vs->uvs.user_size, &ctx->fs->info.varyings.fs, ctx->rast->base.flatshade_first, (batch->reduced_prim == MESA_PRIM_POINTS) ? ctx->rast->base.sprite_coord_enable @@ -3596,10 +3535,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out) cfg.pipeline = agx_build_pipeline(batch, vs, PIPE_SHADER_VERTEX, 0, 0); } - agx_push(out, VDM_STATE_VERTEX_OUTPUTS, cfg) { - cfg.output_count_1 = vs->info.varyings.vs.nr_index; - cfg.output_count_2 = cfg.output_count_1; - } + agx_push_packed(out, vs->uvs.vdm, VDM_STATE_VERTEX_OUTPUTS); agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) { cfg.flat_shading_control = ctx->rast->base.flatshade_first @@ -3654,9 +3590,9 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out) .fragment_back_face = fragment_face_dirty, .fragment_back_face_2 = object_type_dirty || IS_DIRTY(FS_PROG), .fragment_back_stencil = IS_DIRTY(ZS), - .output_select = IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG), - .varying_counts_32 = IS_DIRTY(VS_PROG), - .varying_counts_16 = IS_DIRTY(VS_PROG), + .output_select = varyings_dirty, + .varying_counts_32 = varyings_dirty, + .varying_counts_16 = varyings_dirty, .cull = IS_DIRTY(RS), .cull_2 = varyings_dirty, .fragment_shader = @@ -3742,40 +3678,24 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out) if (dirty.fragment_back_stencil) agx_ppp_push_packed(&ppp, ctx->zs->back_stencil.opaque, FRAGMENT_STENCIL); - if (dirty.output_select) { - agx_ppp_push(&ppp, OUTPUT_SELECT, cfg) { - cfg.varyings = !!fs->info.varyings.fs.nr_bindings; - cfg.point_size = vs->info.writes_psiz; - cfg.viewport_target = vs->info.writes_layer_viewport; - cfg.render_target = vs->info.writes_layer_viewport; - cfg.frag_coord_z = fs->info.varyings.fs.reads_z; - cfg.clip_distance_plane_0 = vs->info.varyings.vs.nr_clip_dists > 0; - cfg.clip_distance_plane_1 = vs->info.varyings.vs.nr_clip_dists > 1; - cfg.clip_distance_plane_2 = vs->info.varyings.vs.nr_clip_dists > 2; - cfg.clip_distance_plane_3 = vs->info.varyings.vs.nr_clip_dists > 3; - cfg.clip_distance_plane_4 = vs->info.varyings.vs.nr_clip_dists > 4; - cfg.clip_distance_plane_5 = vs->info.varyings.vs.nr_clip_dists > 5; - cfg.clip_distance_plane_6 = vs->info.varyings.vs.nr_clip_dists > 6; - cfg.clip_distance_plane_7 = vs->info.varyings.vs.nr_clip_dists > 7; - - assert(cfg.point_size || !is_points); - } - } - assert(dirty.varying_counts_32 == dirty.varying_counts_16); + assert(dirty.varying_counts_32 == dirty.output_select); - if (dirty.varying_counts_32) { - agx_ppp_push(&ppp, VARYING_COUNTS, cfg) { - cfg.smooth = vs->info.varyings.vs.num_32_smooth; - cfg.flat = vs->info.varyings.vs.num_32_flat; - cfg.linear = vs->info.varyings.vs.num_32_linear; + if (dirty.output_select) { + struct agx_output_select_packed osel; + agx_pack(&osel, OUTPUT_SELECT, cfg) { + cfg.varyings = !!fs->info.varyings.fs.nr_bindings; + cfg.frag_coord_z = fs->info.varyings.fs.reads_z; } - agx_ppp_push(&ppp, VARYING_COUNTS, cfg) { - cfg.smooth = vs->info.varyings.vs.num_16_smooth; - cfg.flat = vs->info.varyings.vs.num_16_flat; - cfg.linear = vs->info.varyings.vs.num_16_linear; - } + agx_merge(osel, vs->uvs.osel, OUTPUT_SELECT); + agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT); + + agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_32, + VARYING_COUNTS); + + agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_16, + VARYING_COUNTS); } if (dirty.cull) @@ -3817,7 +3737,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out) if (dirty.output_size) { agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) - cfg.count = vs->info.varyings.vs.nr_index; + cfg.count = vs->uvs.size; } agx_ppp_fini(&out, &ppp); @@ -5061,6 +4981,21 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, agx_batch_add_bo(batch, ctx->gs->gs_copy->bo); } + if (ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG)) { + struct agx_compiled_shader *vs = ctx->vs; + if (ctx->gs) + vs = ctx->gs->gs_copy; + + agx_assign_uvs( + &batch->linked_varyings, &vs->uvs, + ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded, + ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded); + + for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) { + batch->uniforms.uvs_index[i] = batch->linked_varyings.slots[i]; + } + } + /* Set draw ID */ if (ctx->vs->info.uses_draw_id) { batch->uniforms.draw_id = drawid_offset; diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 8c8a56f931f..c5335c769cf 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -14,6 +14,7 @@ #include "asahi/lib/agx_nir_lower_vbo.h" #include "asahi/lib/agx_scratch.h" #include "asahi/lib/agx_tilebuffer.h" +#include "asahi/lib/agx_uvs.h" #include "asahi/lib/pool.h" #include "asahi/lib/shaders/geometry.h" #include "compiler/nir/nir_lower_blend.h" @@ -29,6 +30,7 @@ #include "util/u_range.h" #include "agx_helpers.h" #include "agx_meta.h" +#include "agx_nir_passes.h" #ifdef __GLIBC__ #include @@ -162,6 +164,11 @@ struct PACKED agx_draw_uniforms { /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */ uint16_t clip_z_coeff; + + /* Mapping from varying slots written by the last vertex stage to UVS + * indices. This mapping must be compatible with the fragment shader. + */ + uint16_t uvs_index[VARYING_SLOT_MAX]; }; struct PACKED agx_stage_uniforms { @@ -221,6 +228,9 @@ struct agx_compiled_shader { unsigned push_range_count; struct agx_push_range push[AGX_MAX_PUSH_RANGES]; + /* UVS layout for the last vertex stage */ + struct agx_unlinked_uvs_layout uvs; + /* Auxiliary programs, or NULL if not used */ struct agx_compiled_shader *gs_count, *pre_gs; struct agx_compiled_shader *gs_copy; @@ -366,6 +376,7 @@ struct agx_batch { /* Current varyings linkage structures */ uint32_t varyings; + struct agx_varyings_vs linked_varyings; struct agx_draw_uniforms uniforms; struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES]; @@ -478,8 +489,6 @@ struct asahi_vs_shader_key { struct { bool fixed_point_size; - uint64_t outputs_flat_shaded; - uint64_t outputs_linear_shaded; } hw; } next; }; @@ -512,15 +521,13 @@ struct asahi_fs_shader_key { struct asahi_gs_shader_key { /* Rasterizer shader key */ - uint64_t outputs_flat_shaded; - uint64_t outputs_linear_shaded; bool fixed_point_size; /* If true, this GS is run only for its side effects (including XFB) */ bool rasterizer_discard; bool padding[6]; }; -static_assert(sizeof(struct asahi_gs_shader_key) == 24, "no holes"); +static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes"); union asahi_shader_key { struct asahi_vs_shader_key vs;