diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index 64074c71a55..3848fa8f4af 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -209,6 +209,10 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, NIR_PASS(_, s, panfrost_nir_lower_res_indices, &inputs); pan_nir_lower_texture_late(s, inputs.gpu_id); + /* nir_opt_varyings is replacing all flat highp types with float32, so we need + * to figure out the varying types ourselves */ + inputs.trust_varying_flat_highp_types = false; + if (dev->arch >= 9) { inputs.valhall.use_ld_var_buf = panfrost_use_ld_var_buf(s); /* Always enable this for GL, it avoids crashes when using unbound diff --git a/src/panfrost/compiler/pan_compiler.h b/src/panfrost/compiler/pan_compiler.h index 310aeab0dc5..54088dec271 100644 --- a/src/panfrost/compiler/pan_compiler.h +++ b/src/panfrost/compiler/pan_compiler.h @@ -126,6 +126,11 @@ struct pan_compile_inputs { */ uint32_t fixed_varying_mask; + /* Optimizations such as nir_opt_varyings can erase all flat types to float; when + * this field is false, varying types are inferred from their usage. + */ + bool trust_varying_flat_highp_types; + /* Settings to move constants into the FAU. 
*/ struct { uint32_t *values; @@ -143,18 +148,170 @@ struct pan_compile_inputs { }; }; +enum pan_varying_section { + PAN_VARYING_SECTION_POSITION, + PAN_VARYING_SECTION_ATTRIBS, + /* Varyings computed on-the-fly */ + PAN_VARYING_SECTION_SPECIAL, + PAN_VARYING_SECTION_GENERIC, +}; + +/* Varyings which go in PAN_VARYING_SECTION_ATTRIBS */ +#define PAN_ATTRIB_VARYING_BITS \ + (VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | \ + VARYING_BIT_PRIMITIVE_ID) + +/* Varyings which go in PAN_VARYING_SECTION_SPECIAL (Midgard only) */ +#define PAN_SPECIAL_VARYING_BITS \ + (VARYING_BIT_PNTC | VARYING_BIT_POS | VARYING_BIT_FACE) + +/* Varyings which DO NOT go in PAN_VARYING_SECTION_GENERIC */ +#define PAN_HARDWARE_VARYING_BITS \ + (VARYING_BIT_POS | PAN_ATTRIB_VARYING_BITS | PAN_SPECIAL_VARYING_BITS) + +struct pan_varying_slot { + /* GLSL/SPIR-V location of the varying slot */ + gl_varying_slot location : 7; + + /* Format of the varying slot in memory + * (really nir_alu_type, but the compiler screams at you if you don't lie) */ + unsigned alu_type : 8; + unsigned ncomps : 3; + + enum pan_varying_section section : 2; + + /* Offset of the varying slot in the specified section of the varying + * buffer. For special VS outputs (see PAN_ATTRIB_VARYING_BITS), this is + * relative to the start of the position header. For all other varyings, + * this is relative to the start of the varying space. The offset will be + * -1 if unknown (before the memory layout is built). + */ + int offset : 12; +}; +static_assert(sizeof(struct pan_varying_slot) == 4, + "This struct has no holes"); + +static inline bool +pan_varying_slot_is_empty(const struct pan_varying_slot *slot) +{ + return slot->alu_type == nir_type_invalid; +} + +enum ENUM_PACKED pan_varying_knowledge { + PAN_VARYING_FORMAT_KNOWN = BITFIELD_BIT(0), + PAN_VARYING_LAYOUT_KNOWN = BITFIELD_BIT(1), +}; + +/* Contains information about varyings, both their format and the physical + * memory layout. 
The format is not necessarily what is actually stored in + * memory, but what format is in the register before the store_output, or what + * the shader expects after a load_input. The layout is optional and specifies + * the exact offset in memory of each varying, its section and the size of the + * generic buffer. The layout is only built for the Vertex Shader and passed + * on to the Fragment Shader if they are linked together. Since the struct is + * valid even without format or layout information, the "known" field tracks + * what information the structure has. Before accessing any format information + * you should check with `pan_varying_layout_require_format` that it is built + * and before accessing any layout information you should check with + * pan_varying_layout_require_layout if it is present. + * + * The format and layout are not split into two different structures to avoid + * duplicating indexing information. + * + * The slots are valid only up to `count`, but can also contain holes if they + * have been dead-code-eliminated after `nir_assign_io_var_locations`. Please + * use `pan_varying_slot_is_empty` to check if slots are empty. Empty slots are + * ignored by finding functions. 
+ */ +PRAGMA_DIAGNOSTIC_PUSH +PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) +struct pan_varying_layout { + uint8_t count; + enum pan_varying_knowledge known; + /* Size of the generic section, in bytes */ + uint16_t generic_size_B; + + struct pan_varying_slot slots[PAN_MAX_VARYINGS]; +}; +PRAGMA_DIAGNOSTIC_POP + +static inline const struct pan_varying_slot * +pan_varying_layout_find_slot(const struct pan_varying_layout *layout, + gl_varying_slot location) +{ + for (unsigned i = 0; i < layout->count; i++) { + if (layout->slots[i].location != location) + continue; + const struct pan_varying_slot *slot = &layout->slots[i]; + if (pan_varying_slot_is_empty(slot)) + break; + return slot; + } + + return NULL; +} + +static inline const struct pan_varying_slot * +pan_varying_layout_slot_at(const struct pan_varying_layout *layout, + unsigned index) +{ + if (index >= layout->count) + return NULL; + + const struct pan_varying_slot *slot = &layout->slots[index]; + if (pan_varying_slot_is_empty(slot)) + return NULL; + + return slot; +} + +static inline uint32_t +pan_get_fixed_varying_mask(unsigned varyings_used) +{ + return (varyings_used & BITFIELD_MASK(VARYING_SLOT_VAR0)) & + ~VARYING_BIT_POS & ~PAN_ATTRIB_VARYING_BITS; +} + +static inline void +pan_varying_layout_require_format(const struct pan_varying_layout *layout) +{ + assert(layout); + if (!(layout->known & PAN_VARYING_FORMAT_KNOWN)) + assert(!"Format is required"); +} + +static inline void +pan_varying_layout_require_layout(const struct pan_varying_layout *layout) +{ + assert(layout); + if (!(layout->known & PAN_VARYING_LAYOUT_KNOWN)) + assert(!"Layout is required"); +} + +enum pipe_format +pan_varying_format(nir_alu_type type, unsigned ncomps); + +/** Builds a varying layout according to the SSO ABI we developed for OpenGL. + * + * This can be called on either shader stage and the two varying layouts are + * guaranteed to match if the same fixed_varyings are passed into both. 
*/ +void +pan_build_varying_layout_sso_abi(struct pan_varying_layout *layout, + nir_shader *nir, unsigned gpu_id, + uint32_t fixed_varyings); + +void +pan_varying_collect_formats(struct pan_varying_layout *layout, + nir_shader *nir, unsigned gpu_id, + bool trust_varying_flat_highp_types, + bool lower_mediump); + struct pan_shader_varying { gl_varying_slot location; enum pipe_format format; }; -static inline unsigned -pan_get_fixed_varying_mask(unsigned varyings_used) -{ - return (varyings_used & BITFIELD_MASK(VARYING_SLOT_VAR0)) & - ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; -} - struct bifrost_shader_blend_info { nir_alu_type type; uint32_t return_offset; diff --git a/src/panfrost/compiler/pan_nir_collect_varyings.c b/src/panfrost/compiler/pan_nir_collect_varyings.c index 8862c1147ae..2913fc0fd5d 100644 --- a/src/panfrost/compiler/pan_nir_collect_varyings.c +++ b/src/panfrost/compiler/pan_nir_collect_varyings.c @@ -1,15 +1,17 @@ /* * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. - * Copyright (C) 2019-2022 Collabora, Ltd. + * Copyright (C) 2019-2022,2026 Collabora, Ltd. 
* SPDX-License-Identifier: MIT */ #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" #include "pan_nir.h" +#include "midgard/midgard_quirks.h" +#include "panfrost/model/pan_model.h" -static enum pipe_format -varying_format(nir_alu_type t, unsigned ncomps) +enum pipe_format +pan_varying_format(nir_alu_type t, unsigned ncomps) { assert(ncomps >= 1 && ncomps <= 4); @@ -29,8 +31,10 @@ varying_format(nir_alu_type t, unsigned ncomps) } conv[] = { VARYING_FORMAT(float, 32, FLOAT, 32), VARYING_FORMAT(uint, 32, UINT, 32), + VARYING_FORMAT(int, 32, SINT, 32), VARYING_FORMAT(float, 16, FLOAT, 16), VARYING_FORMAT(uint, 16, UINT, 16), + VARYING_FORMAT(int, 16, SINT, 16), }; #undef VARYING_FORMAT @@ -46,20 +50,21 @@ varying_format(nir_alu_type t, unsigned ncomps) struct slot_info { nir_alu_type type; + bool any_highp; unsigned count; unsigned index; }; struct walk_varyings_data { - struct pan_shader_info *info; + bool quirk_no_auto32; struct slot_info *slots; + bool trust_varying_flat_highp_types; }; static bool walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) { struct walk_varyings_data *wv_data = data; - struct pan_shader_info *info = wv_data->info; struct slot_info *slots = wv_data->slots; if (instr->type != nir_instr_type_intrinsic) @@ -67,7 +72,8 @@ walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); unsigned count; - unsigned size; + nir_alu_type type; + bool is_store; /* Only consider intrinsics that access varyings */ switch (intr->intrinsic) { @@ -77,7 +83,8 @@ walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) return false; count = nir_src_num_components(intr->src[0]); - size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr)); + type = nir_intrinsic_src_type(intr); + is_store = true; break; case nir_intrinsic_load_input: @@ -86,7 +93,8 @@ walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) return false; count = 
intr->def.num_components; - size = intr->def.bit_size; + type = nir_intrinsic_dest_type(intr); + is_store = false; break; default: return false; } @@ -98,21 +106,30 @@ walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) if (sem.no_varying) return false; - /* In a fragment shader, flat shading is lowered to load_input but - * interpolation is lowered to load_interpolated_input, so we can check - * the intrinsic to distinguish. - * - * In a vertex shader, we consider everything flat, as the information - * will not contribute to the final linked varyings -- flatness is used - * only to determine the type, and the GL linker uses the type from the - * fragment shader instead. - */ - bool flat = intr->intrinsic != nir_intrinsic_load_interpolated_input; - bool auto32 = !info->quirk_no_auto32 && size == 32; - nir_alu_type type = (flat && auto32) ? nir_type_uint : nir_type_float; + nir_alu_type base_type = nir_alu_type_get_base_type(type); + unsigned size = nir_alu_type_get_type_size(type); + assert(base_type & (nir_type_int | nir_type_uint | nir_type_float)); - assert(size == 32 || size == 16); - type |= size; + bool auto32 = !wv_data->quirk_no_auto32 && size == 32; + bool untrusted_type = !wv_data->trust_varying_flat_highp_types && + sem.location >= VARYING_SLOT_VAR0 && + !sem.medium_precision && + !b->shader->info.separate_shader; + if (untrusted_type) { + /* Don't trust the type, varying_opts might have smashed everything + * onto floats. Replace all flat varyings with ints and smooth varyings + * with floats, only exception is 16-bit flat varyings that should be + * stored/loaded as floats as the hardware cannot encode 16-bit flat ints. + * Read docs/drivers/panfrost/varyings.rst for details. + */ + bool is_flat = intr->intrinsic != nir_intrinsic_load_interpolated_input; + base_type = (is_flat && auto32) ? 
nir_type_uint : nir_type_float; + type = base_type | size; + if (is_store) + nir_intrinsic_set_src_type(intr, type); + else + nir_intrinsic_set_dest_type(intr, type); + } /* Count currently contains the number of components accessed by this * intrinsics. However, we may be accessing a fractional location, @@ -135,6 +152,9 @@ walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) slots[location].index = index; } + if (size == 32 && !sem.medium_precision) + slots[location].any_highp = true; + slots[location].count = MAX2(slots[location].count, count); } @@ -184,7 +204,10 @@ pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info) return; struct slot_info slots[64] = {0}; - struct walk_varyings_data wv_data = {info, slots}; + struct walk_varyings_data wv_data = { + .quirk_no_auto32 = info->quirk_no_auto32, + .slots = slots + }; nir_shader_instructions_pass(s, walk_varyings, nir_metadata_all, &wv_data); struct pan_shader_varying *varyings = (s->info.stage == MESA_SHADER_VERTEX) @@ -197,7 +220,7 @@ pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info) if (!slots[i].type) continue; - enum pipe_format format = varying_format(slots[i].type, slots[i].count); + enum pipe_format format = pan_varying_format(slots[i].type, slots[i].count); assert(format != PIPE_FORMAT_NONE); unsigned index = slots[i].index; @@ -216,3 +239,215 @@ pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info) info->varyings.noperspective = pan_nir_collect_noperspective_varyings_fs(s); } + +/* + * ABI: Special (desktop GL) slots come first, tightly packed. General varyings + * come later, sparsely packed. This handles both linked and separable shaders + * with a common code path, with minimal keying only for desktop GL. Each slot + * consumes 16 bytes (TODO: fp16, partial vectors). 
+ * + * This is a copy+paste of the identical function in bifrost_compile.c + */ +static unsigned +bi_varying_base_bytes(gl_varying_slot slot, uint32_t fixed_varyings) +{ + if (slot >= VARYING_SLOT_VAR0) { + unsigned nr_special = util_bitcount(fixed_varyings); + unsigned general_index = (slot - VARYING_SLOT_VAR0); + + return 16 * (nr_special + general_index); + } else { + return 16 * (util_bitcount(fixed_varyings & BITFIELD_MASK(slot))); + } +} + +static const struct pan_varying_slot hw_varying_slots[] = {{ + .location = VARYING_SLOT_POS, + .alu_type = nir_type_float32, + .ncomps = 4, + .section = PAN_VARYING_SECTION_POSITION, + .offset = 0, +}, { + .location = VARYING_SLOT_PSIZ, + .alu_type = nir_type_float16, + .ncomps = 1, + .section = PAN_VARYING_SECTION_ATTRIBS, + .offset = 0, +}, { + .location = VARYING_SLOT_LAYER, + .alu_type = nir_type_uint8, + .ncomps = 1, + .section = PAN_VARYING_SECTION_ATTRIBS, + .offset = 2, +}, { + .location = VARYING_SLOT_VIEWPORT, + .alu_type = nir_type_uint8, + .ncomps = 1, + .section = PAN_VARYING_SECTION_ATTRIBS, + .offset = 2, +}, { + .location = VARYING_SLOT_PRIMITIVE_ID, + .alu_type = nir_type_uint32, + .ncomps = 1, + .section = PAN_VARYING_SECTION_ATTRIBS, + .offset = 12, +}}; + +/* On Midgard some attributes are computed on-the-fly from the drawing state, + * those are called special and require a custom descriptor definition. + * From v6 onwards those use the LD_VAR_SPECIAL instruction. + * Also on Midgard, VARYING_SLOT_TEX* might be point coordinates depending on + * the rasterizer state, if they are they should be theoretically in the special + * section. Since we don't know this yet we "misplace" them in the generic + * section anyway, they won't end up in the memory layout and they'll be handled + * by the descriptor emitter code. + * It's not a mistake, just a "happy little accident". 
+ */ +static const struct pan_varying_slot special_varying_slots[] = {{ + .location = VARYING_SLOT_POS, + .alu_type = nir_type_float32, + .ncomps = 4, + .section = PAN_VARYING_SECTION_SPECIAL, + .offset = 0, +}, { + .location = VARYING_SLOT_PNTC, + .alu_type = nir_type_float32, + .ncomps = 1, + .section = PAN_VARYING_SECTION_SPECIAL, + .offset = 0, +}, { + .location = VARYING_SLOT_FACE, + .alu_type = nir_type_uint32, + .ncomps = 1, + .section = PAN_VARYING_SECTION_SPECIAL, + .offset = 0, +}}; + +static struct pan_varying_slot +hw_varying_slot(unsigned arch, mesa_shader_stage stage, gl_varying_slot slot) +{ + bool vs_pos = slot == VARYING_SLOT_POS && stage == MESA_SHADER_VERTEX; + /* pos is only special in fragment shader input, not vertex shader output */ + if (arch < 6 && !vs_pos) { + for (unsigned i = 0; i < ARRAY_SIZE(special_varying_slots); i++) { + if (special_varying_slots[i].location == slot) + return special_varying_slots[i]; + } + } + for (unsigned i = 0; i < ARRAY_SIZE(hw_varying_slots); i++) { + if (hw_varying_slots[i].location == slot) + return hw_varying_slots[i]; + } + UNREACHABLE("Invalid HW varying slot"); +} + +void +pan_varying_collect_formats(struct pan_varying_layout *layout, nir_shader *nir, + unsigned gpu_id, bool trust_varying_flat_highp_types, + bool lower_mediump) +{ + assert(nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_FRAGMENT); + memset(layout, 0, sizeof(*layout)); + + const unsigned gpu_arch = pan_arch(gpu_id); + bool quirk_no_auto32 = gpu_arch <= 5 && + (midgard_get_quirks(gpu_id) & MIDGARD_NO_AUTO32); + + struct slot_info slots[64] = {0}; + struct walk_varyings_data wv_data = { + .quirk_no_auto32 = quirk_no_auto32, + .slots = slots, + .trust_varying_flat_highp_types = trust_varying_flat_highp_types, + }; + + nir_shader_instructions_pass(nir, walk_varyings, nir_metadata_all, &wv_data); + + unsigned count = 0; + for (unsigned i = 0; i < ARRAY_SIZE(slots); i++) { + if (!slots[i].type) + continue; + + /* It's 
possible that something has been dead code eliminated between + * when the driver locations were set on variables and here. Don't + * trust our compaction to match the driver. Just copy over the index + * and accept that there's a hole in the mapping. + */ + unsigned idx = slots[i].index; + count = MAX2(count, idx + 1); + assert(count <= ARRAY_SIZE(layout->slots)); + assert(layout->slots[idx].alu_type == nir_type_invalid); + + if (BITFIELD64_BIT(i) & PAN_HARDWARE_VARYING_BITS) { + layout->slots[idx] = hw_varying_slot(gpu_arch, nir->info.stage, i); + } else { + nir_alu_type type = nir_alu_type_get_base_type(slots[i].type); + unsigned bit_size = nir_alu_type_get_type_size(slots[i].type); + + /* The Vulkan spec requires types to match across all uses of a + * location but doesn't actually require RelaxedPrecision to match + * for the whole location. So we can only apply mediump if every use + * of the location is mediump. + * Don't lower mediump integers, it has no measured impact and causes + * lots of bugs due to gallium shenanigans. 
+ * Also allow the client to remove mediump lowering and keep the + * original types + */ + bool can_lower_size = lower_mediump && + bit_size == 32 && + type == nir_type_float && + !slots[i].any_highp; + if (can_lower_size) + bit_size = 16; + + layout->slots[idx] = (struct pan_varying_slot){ + .location = i, + .alu_type = type | bit_size, + .ncomps = slots[i].count, + .section = PAN_VARYING_SECTION_GENERIC, + /* Don't know the offset yet */ + .offset = -1, + }; + } + } + layout->count = count; + layout->generic_size_B = 0; + layout->known |= PAN_VARYING_FORMAT_KNOWN; +} + +void +pan_build_varying_layout_sso_abi(struct pan_varying_layout *layout, + nir_shader *nir, unsigned gpu_id, + uint32_t fixed_varyings) +{ + pan_varying_layout_require_format(layout); + + const unsigned gpu_arch = pan_arch(gpu_id); + unsigned generic_size_B = 0; + for (unsigned i = 0; i < layout->count; i++) { + struct pan_varying_slot *slot = &layout->slots[i]; + if (pan_varying_slot_is_empty(slot)) + continue; + + if (slot->section != PAN_VARYING_SECTION_GENERIC) { + ASSERTED const struct pan_varying_slot hw_slot = + hw_varying_slot(gpu_arch, nir->info.stage, slot->location); + + assert(memcmp(slot, &hw_slot, sizeof(*slot)) == 0); + } else { + unsigned offset = + bi_varying_base_bytes(slot->location, fixed_varyings); + assert(offset < (1 << 11)); + + const unsigned bit_size = nir_alu_type_get_type_size(slot->alu_type); + const unsigned size = slot->ncomps * (bit_size / 8); + generic_size_B = MAX2(generic_size_B, offset + size); + + assert(slot->offset == -1); + assert(offset + size <= UINT16_MAX); + slot->offset = offset; + } + } + layout->generic_size_B = generic_size_B; + layout->known |= PAN_VARYING_LAYOUT_KNOWN; +} diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index c696a103925..c65592df305 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -1345,6 +1345,8 @@ panvk_compile_shader(struct panvk_device *dev, 
nir_assign_io_var_locations(nir, nir_var_shader_out); panvk_lower_nir_io(nir); + inputs.trust_varying_flat_highp_types = true; + variant->own_bin = true; result = panvk_compile_nir(dev, nir, info->flags, &inputs, state,