diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c
index 43b28465193..6d55125c4ac 100644
--- a/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/src/gallium/drivers/panfrost/pan_assemble.c
@@ -69,6 +69,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
         struct panfrost_compile_inputs inputs = {
                 .gpu_id = dev->gpu_id,
                 .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
+                .fixed_varying_mask = state->key.fixed_varying_mask
         };
 
         memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 358523c2189..1ef8d09bd2b 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -3360,9 +3360,15 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
 
         pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
                 if (secondary_shader) {
+                        unsigned v = vs->info.varyings.output_count;
+                        unsigned f = fs->info.varyings.input_count;
+                        unsigned slots = MAX2(v, f);
+                        slots += util_bitcount(fs->key.fixed_varying_mask);
+                        unsigned size = slots * 16;
+
                         /* Assumes 16 byte slots. We could do better. */
-                        cfg.vertex_packet_stride = vs->info.varyings.output_count * 16;
-                        cfg.vertex_attribute_stride = fs->info.varyings.input_count * 16;
+                        cfg.vertex_packet_stride = size + 16;
+                        cfg.vertex_attribute_stride = size;
                 } else {
                         /* Hardware requirement for "no varyings" */
                         cfg.vertex_packet_stride = 16;
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index b1044c0206d..e9567d9cbb3 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -307,6 +307,13 @@ panfrost_create_shader_state(
         else
                 so->nir = cso->ir.nir;
 
+        /* Fix linkage early */
+        if (so->nir->info.stage == MESA_SHADER_VERTEX) {
+                so->fixed_varying_mask =
+                        (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
+                        ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
+        }
+
         /* Precompile for shader-db if we need to */
         if (unlikely(dev->debug & PAN_DBG_PRECOMPILE)) {
                 struct panfrost_context *ctx = pan_context(pctx);
@@ -372,6 +379,7 @@ panfrost_build_key(struct panfrost_context *ctx,
         struct panfrost_device *dev = pan_device(ctx->base.screen);
         struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
         struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;
+        struct panfrost_shader_variants *vs = ctx->shader[MESA_SHADER_VERTEX];
 
         key->fs.nr_cbufs = fb->nr_cbufs;
 
@@ -398,6 +406,12 @@ panfrost_build_key(struct panfrost_context *ctx,
                         key->fs.rt_formats[i] = fmt;
                 }
         }
+
+        /* Funny desktop GL varying lowering on Valhall */
+        if (dev->arch >= 9) {
+                assert(vs != NULL && "too early");
+                key->fixed_varying_mask = vs->fixed_varying_mask;
+        }
 }
 
 /**
@@ -508,13 +522,20 @@ panfrost_update_shader_variant(struct panfrost_context *ctx,
         if (type == PIPE_SHADER_COMPUTE)
                 return;
 
+        /* We need linking information, defer this */
+        if (type == PIPE_SHADER_FRAGMENT && !ctx->shader[PIPE_SHADER_VERTEX])
+                return;
+
         /* Match the appropriate variant */
         signed variant = -1;
         struct panfrost_shader_variants *variants = ctx->shader[type];
 
         simple_mtx_lock(&variants->lock);
 
-        struct panfrost_shader_key key = { 0 };
+        struct panfrost_shader_key key = {
+                .fixed_varying_mask = variants->fixed_varying_mask
+        };
+
         panfrost_build_key(ctx, &key, variants->nir);
 
         for (unsigned i = 0; i < variants->variant_count; ++i) {
@@ -539,6 +560,10 @@ static void
 panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
 {
         panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
+
+        /* Fragment shaders are linked with vertex shaders */
+        struct panfrost_context *ctx = pan_context(pctx);
+        panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 static void
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index 737ed78c802..6d482911e10 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -272,6 +272,9 @@ struct panfrost_fs_key {
 };
 
 struct panfrost_shader_key {
+        /* Valhall needs special handling for desktop GL varyings */
+        uint32_t fixed_varying_mask;
+
         /* If we need vertex shader keys, union it in */
         struct panfrost_fs_key fs;
 };
@@ -315,6 +318,12 @@ struct panfrost_shader_variants {
 
         unsigned variant_count;
 
+        /* On vertex shaders, bit mask of special desktop-only varyings to link
+         * with the fragment shader. Used on Valhall to implement separable
+         * shaders for desktop GL.
+         */
+        uint32_t fixed_varying_mask;
+
         /* The current active variant */
         unsigned active_variant;
 };
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 12c09a68282..d0d9d1dfc6f 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -284,6 +284,41 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
         bi_copy_component(b, instr, dest);
 }
 
+/*
+ * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
+ * come later, sparsely packed. This handles both linked and separable shaders
+ * with a common code path, with minimal keying only for desktop GL. Each slot
+ * consumes 16 bytes (TODO: fp16, partial vectors).
+ */
+static unsigned
+bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+        uint32_t mask = ctx->inputs->fixed_varying_mask;
+
+        if (sem.location >= VARYING_SLOT_VAR0) {
+                unsigned nr_special = util_bitcount(mask);
+                unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
+
+                return 16 * (nr_special + general_index);
+        } else {
+                return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
+        }
+}
+
+/*
+ * Compute the offset in bytes of a varying with an immediate offset, adding the
+ * offset to the base computed above. Convenience method.
+ */
+static unsigned
+bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_src *src = nir_get_io_offset_src(intr);
+        assert(nir_src_is_const(*src) && "assumes immediate offset");
+
+        return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
+}
+
 static void
 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -328,7 +363,8 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
         if (b->shader->malloc_idvs && immediate) {
                 /* Immediate index given in bytes. */
                 bi_ld_var_buf_imm_f32_to(b, dest, src0, regfmt, sample, update,
-                                         vecsize, imm_index * 16);
+                                         vecsize,
+                                         bi_varying_offset(b->shader, instr));
         } else if (immediate && smooth) {
                 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                      vecsize, imm_index);
@@ -339,24 +375,31 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
                 bi_index idx = bi_src_index(offset);
                 unsigned base = nir_intrinsic_base(instr);
 
-                if (base != 0)
-                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
-
                 if (b->shader->malloc_idvs) {
                         /* Index needs to be in bytes, but NIR gives the index
-                         * in slots. For now assume 16 bytes per slots.
-                         *
-                         * TODO: more complex linking?
+                         * in slots. For now assume 16 bytes per element.
                          */
-                        idx = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
-                        bi_ld_var_buf_f32_to(b, dest, src0, idx, regfmt, sample,
-                                             update, vecsize);
+                        bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
+                        unsigned vbase = bi_varying_base_bytes(b->shader, instr);
+
+                        if (vbase != 0)
+                                idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
+
+                        bi_ld_var_buf_f32_to(b, dest, src0, idx_bytes, regfmt,
+                                             sample, update, vecsize);
                 } else if (smooth) {
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
                         I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                          update, vecsize);
                 } else {
-                        I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
-                                              regfmt, vecsize);
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
+                        I = bi_ld_var_flat_to(b, dest, idx,
+                                              BI_FUNCTION_NONE, regfmt,
+                                              vecsize);
                 }
         }
 
@@ -794,39 +837,6 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
         return false;
 }
 
-/**
- * Computes the offset in bytes of a varying. This assumes VARYING_SLOT_POS is
- * mapped to location=0 and always present. This also assumes each slot
- * consumes 16 bytes, which is a worst-case (highp vec4). In the future, this
- * should be optimized to support fp16 and partial vectors. There are
- * nontrivial interactions with separable shaders, however.
- */
-static unsigned
-bi_varying_offset(nir_shader *nir, nir_intrinsic_instr *intr)
-{
-        nir_src *offset = nir_get_io_offset_src(intr);
-        assert(nir_src_is_const(*offset) && "no indirect varyings on Valhall");
-
-        unsigned loc = 0;
-        unsigned slot = nir_intrinsic_base(intr) + nir_src_as_uint(*offset);
-
-        nir_foreach_shader_out_variable(var, nir) {
-                if ((var->data.location == VARYING_SLOT_POS) ||
-                    (var->data.location == VARYING_SLOT_PSIZ))
-                        continue;
-
-                if (var->data.driver_location > slot)
-                        continue;
-
-                if (var->data.driver_location == slot)
-                        return loc;
-
-                loc += 16; // todo size
-        }
-
-        unreachable("Unlinked variable");
-}
-
 static void
 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -880,7 +890,7 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
                            bi_src_index(&instr->src[0]), address,
                            bi_word(address, 1),
                            varying ? BI_SEG_VARY : BI_SEG_POS,
-                           varying ? bi_varying_offset(b->shader->nir, instr) : 0);
+                           varying ? bi_varying_offset(b->shader, instr) : 0);
         } else if (immediate) {
                 bi_index address = bi_lea_attr_imm(b,
                                                    bi_vertex_id(b), bi_instance_id(b),
diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c
index d16a9f67d73..59179f4888c 100644
--- a/src/panfrost/lib/pan_shader.c
+++ b/src/panfrost/lib/pan_shader.c
@@ -42,6 +42,7 @@ GENX(pan_shader_get_compiler_options)(void)
 #endif
 }
 
+#if PAN_ARCH <= 7
 static enum pipe_format
 varying_format(nir_alu_type t, unsigned ncomps)
 {
@@ -157,6 +158,7 @@ collect_varyings(nir_shader *s, nir_variable_mode varying_mode,
                 *varying_count = MAX2(*varying_count, loc + sz);
         }
 }
+#endif
 
 #if PAN_ARCH >= 6
 static enum mali_register_file_format
@@ -230,8 +232,14 @@ GENX(pan_shader_compile)(nir_shader *s,
                 info->vs.writes_point_size =
                         s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
+
+#if PAN_ARCH >= 9
+                info->varyings.output_count =
+                        util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_out, info->varyings.output,
                                  &info->varyings.output_count);
+#endif
                 break;
         case MESA_SHADER_FRAGMENT:
                 if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
@@ -286,8 +294,13 @@ GENX(pan_shader_compile)(nir_shader *s,
                 info->fs.reads_face =
                         (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
                         BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
+#if PAN_ARCH >= 9
+                info->varyings.input_count =
+                        util_last_bit(s->info.inputs_read >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_in, info->varyings.input,
                                  &info->varyings.input_count);
+#endif
                 break;
         case MESA_SHADER_COMPUTE:
                 info->wls_size = s->info.shared_size;
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h
index 7d397d3ca17..3819e814272 100644
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -190,6 +190,16 @@ struct panfrost_compile_inputs {
         uint8_t raw_fmt_mask;
         unsigned nr_cbufs;
 
+        /* Used on Valhall.
+         *
+         * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
+         * written by the previous stage (for a fragment shader) or by this
+         * stage (for a vertex shader). Bits are slots from gl_varying_slot.
+         *
+         * For modern APIs (GLES or VK), this should be 0.
+         */
+        uint32_t fixed_varying_mask;
+
         union {
                 struct {
                         bool static_rt_conv;
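
Note on the layout ABI: the standalone sketch below (not part of the patch) illustrates the offset arithmetic that bi_varying_base_bytes() implements. The SLOT_* values and the bitcount helper are simplified stand-ins for gl_varying_slot and util_bitcount(); only the arithmetic mirrors the patch.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for gl_varying_slot values */
enum { SLOT_POS = 0, SLOT_COL0 = 1, SLOT_TEX0 = 2, SLOT_VAR0 = 31 };

static unsigned bitcount(uint32_t x)
{
        unsigned n = 0;
        for (; x; x &= x - 1)
                n++;
        return n;
}

/* Special (fixed) slots first, tightly packed; general varyings after,
 * at 16 bytes per slot, as in bi_varying_base_bytes() */
static unsigned varying_base_bytes(uint32_t fixed_mask, unsigned location)
{
        if (location >= SLOT_VAR0)
                return 16 * (bitcount(fixed_mask) + (location - SLOT_VAR0));
        else
                return 16 * bitcount(fixed_mask & ((1u << location) - 1));
}

int main(void)
{
        /* A vertex shader writing POS, COL0, TEX0 and two general varyings:
         * the fixed mask covers COL0 and TEX0, with POS masked out as in
         * panfrost_create_shader_state() */
        uint32_t mask = (1u << SLOT_COL0) | (1u << SLOT_TEX0);

        printf("COL0 at %u\n", varying_base_bytes(mask, SLOT_COL0));     /* 0  */
        printf("TEX0 at %u\n", varying_base_bytes(mask, SLOT_TEX0));     /* 16 */
        printf("VAR0 at %u\n", varying_base_bytes(mask, SLOT_VAR0));     /* 32 */
        printf("VAR1 at %u\n", varying_base_bytes(mask, SLOT_VAR0 + 1)); /* 48 */
        return 0;
}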
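Note on the allocation sizing: this second sketch (again standalone, with illustrative names) mirrors the stride computation added to panfrost_emit_malloc_vertex(). Taking MAX2() of the VS output count and the FS input count covers separable shaders where the two stages disagree on the number of general varyings, and the fixed desktop GL slots are counted on top since the ABI packs them first. In the patch, cfg.vertex_attribute_stride is this size and cfg.vertex_packet_stride is this size plus 16; reading the extra 16 bytes as one more 16-byte record per vertex packet is an assumption, not something the patch states.

#include <stdint.h>

/* Mirrors the sizing in panfrost_emit_malloc_vertex(); names illustrative */
static unsigned
varying_alloc_bytes(unsigned vs_output_count, unsigned fs_input_count,
                    uint32_t fixed_varying_mask)
{
        unsigned slots = vs_output_count > fs_input_count
                       ? vs_output_count : fs_input_count;

        /* Fixed desktop GL slots are packed before the general ones;
         * __builtin_popcount is the GCC/Clang builtin standing in for
         * util_bitcount() */
        slots += (unsigned) __builtin_popcount(fixed_varying_mask);

        /* Worst case: 16 bytes (highp vec4) per slot */
        return slots * 16;
}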