diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 7209d28e853..442e34b2f10 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -6623,6 +6623,8 @@ bool nir_opt_load_skip_helpers(nir_shader *shader, nir_opt_load_skip_helpers_opt void nir_sweep(nir_shader *shader); +void nir_steal_function(nir_shader *nir, nir_function *f); + nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val); gl_system_value nir_system_value_from_intrinsic(nir_intrinsic_op intrin); diff --git a/src/compiler/nir/nir_opt_preamble.c b/src/compiler/nir/nir_opt_preamble.c index 8686926aab3..3084fda9519 100644 --- a/src/compiler/nir/nir_opt_preamble.c +++ b/src/compiler/nir/nir_opt_preamble.c @@ -167,7 +167,6 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx) case nir_intrinsic_load_fb_layers_v3d: case nir_intrinsic_load_fep_w_v3d: case nir_intrinsic_load_tcs_num_patches_amd: - case nir_intrinsic_load_sample_positions_pan: case nir_intrinsic_load_pipeline_stat_query_enabled_amd: case nir_intrinsic_load_prim_gen_query_enabled_amd: case nir_intrinsic_load_prim_xfb_query_enabled_amd: @@ -233,6 +232,8 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx) case nir_intrinsic_load_constant_agx: case nir_intrinsic_bindless_image_agx: case nir_intrinsic_bindless_sampler_agx: + case nir_intrinsic_load_texel_buf_conv_pan: + case nir_intrinsic_load_texel_buf_index_address_pan: return can_move_srcs(&instr->instr, ctx); /* Image/SSBO loads can be moved if they are CAN_REORDER and their @@ -240,6 +241,7 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx) */ case nir_intrinsic_image_load: case nir_intrinsic_image_samples_identical: + case nir_intrinsic_image_texel_address: case nir_intrinsic_bindless_image_load: case nir_intrinsic_load_global_bounded: case nir_intrinsic_load_ssbo: @@ -813,7 +815,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, if (num_candidates == 0) { free(ctx.states); - return false; + return nir_no_progress(impl); } def_state **candidates = malloc(sizeof(*candidates) * num_candidates); @@ -880,7 +882,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, if (num_candidates == 0) { free(ctx.states); free(candidates); - return false; + return nir_no_progress(impl); } /* Step 4: Figure out which candidates we're going to replace and assign an @@ -940,6 +942,12 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, if (analyze_speculation_for_cf_list(&ctx, &impl->body)) analyze_reconstructed(&ctx, impl); + if (!BITSET_TEST_RANGE(ctx.reconstructed_defs, 0, impl->ssa_alloc)) { + free(ctx.states); + free(ctx.reconstructed_defs); + return nir_no_progress(impl); + } + /* Step 5: Actually do the replacement. 
*/ struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL); @@ -980,11 +988,10 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, } } - nir_progress(true, impl, nir_metadata_control_flow); - ralloc_free(remap_table); free(ctx.states); free(ctx.reconstructed_defs); _mesa_set_destroy(ctx.reconstructed_ifs, NULL); - return true; + + return nir_progress(true, impl, nir_metadata_control_flow); } diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c index 0bba79450a5..25c7663ce87 100644 --- a/src/compiler/nir/nir_sweep.c +++ b/src/compiler/nir/nir_sweep.c @@ -185,6 +185,29 @@ sweep_function(nir_shader *nir, nir_function *f) sweep_impl(nir, f->impl); } +/** Steals a function from one shader into another. + * + * This knows nothing about dependencies so if this function calls other + * functions, you'll need a more complex pass that uses nir_function_clone() + * to steal them all. + */ +void +nir_steal_function(nir_shader *nir, nir_function *f) +{ +#ifndef NDEBUG + if (f->impl) { + nir_foreach_block(block, f->impl) { + nir_foreach_instr(instr, block) { + assert(instr->type != nir_instr_type_call); + } + } + } +#endif + exec_node_remove(&f->node); + exec_list_push_tail(&nir->functions, &f->node); + sweep_function(nir, f); +} + void nir_sweep(nir_shader *nir) { diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 558ee51abf1..ba77467d315 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -6504,7 +6504,7 @@ bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id) } static int -compare_u32(const void* a, const void* b, void* _) +compare_u32(const void* a, const void* b) { const uint32_t va = (uintptr_t)a; const uint32_t vb = (uintptr_t)b; @@ -6673,9 +6673,9 @@ bi_compile_variant_nir(nir_shader *nir, sorted[idx++] = (uintptr_t)entry.data; } - util_qsort_r(sorted, const_amount, sizeof(uint32_t), compare_u32, NULL); + qsort(sorted, const_amount, sizeof(uint32_t), compare_u32); uint32_t max_amount = MIN2(const_amount, ctx->inputs->fau_consts.max_amount); - uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0; + uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0; ralloc_free(sorted); bi_foreach_instr_global_safe(ctx, I) { @@ -6837,7 +6837,7 @@ bi_compile_variant(nir_shader *nir, /* Software invariant: Only a secondary shader can appear at a nonzero * offset, to keep the ABI simple. */ - assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); + assert((offset == 0) ^ (idvs == BI_IDVS_VARYING || idvs == BI_IDVS_PILOT)); struct pan_stats *stats = idvs == BI_IDVS_VARYING ? 
&info->stats_idvs_varying : &info->stats; @@ -6874,6 +6874,11 @@ bi_compile_variant(nir_shader *nir, info->vs.secondary_offset = offset; info->vs.secondary_preload = preload; info->vs.secondary_work_reg_count = ctx->info.work_reg_count; + } else if (idvs == BI_IDVS_PILOT) { + info->pilot.enable = (binary->size > offset); + info->pilot.offset = offset; + info->pilot.preload = preload; + info->pilot.work_reg_count = ctx->info.work_reg_count; } else { info->preload = preload; info->work_reg_count = ctx->info.work_reg_count; @@ -6996,6 +7001,26 @@ bifrost_compile_shader_nir(nir_shader *nir, info->tls_size = nir->scratch_size; + nir_shader *pilot = NULL; + nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); + if (entrypoint->preamble) { + pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options); + + pilot->info = nir->info; + pilot->info.stage = MESA_SHADER_COMPUTE; + pilot->info.workgroup_size[0] = 1; + pilot->info.workgroup_size[1] = 1; + pilot->info.workgroup_size[2] = 1; + memset(&pilot->info.cs, 0, sizeof(pilot->info.cs)); + + nir_function *pilot_main = entrypoint->preamble; + nir_steal_function(pilot, pilot_main); + entrypoint->preamble = NULL; + + pilot_main->is_preamble = false; + pilot_main->is_entrypoint = true; + } + pan_nir_collect_varyings(nir, info, PAN_MEDIUMP_VARY_32BIT); if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) { @@ -7010,6 +7035,9 @@ bifrost_compile_shader_nir(nir_shader *nir, bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE); } + if (pilot) + bi_compile_variant(pilot, inputs, binary, info, BI_IDVS_PILOT); + if (mesa_shader_stage_is_compute(nir->info.stage)) { /* Workgroups may be merged if the structure of the workgroup is * not software visible. This is true if neither shared memory diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index 6985d9f2191..fc4f9d6f16b 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1055,6 +1055,9 @@ enum bi_idvs_mode { /* IDVS2 in use. Compiling a deferred shader (v12+) */ BI_IDVS_ALL = 3, + + /* This is a pilot shader, and not a position or vertex shader */ + BI_IDVS_PILOT = 4, }; #define BI_MAX_REGS 64 diff --git a/src/panfrost/compiler/meson.build b/src/panfrost/compiler/meson.build index 4ce16f92dd7..8693dac41ac 100644 --- a/src/panfrost/compiler/meson.build +++ b/src/panfrost/compiler/meson.build @@ -16,6 +16,7 @@ libpanfrost_compiler_files = files( 'pan_nir_lower_vertex_id.c', 'pan_nir_lower_writeout.c', 'pan_nir_lower_xfb.c', + 'pan_nir_opt_pilot.c', ) subdir('bifrost') diff --git a/src/panfrost/compiler/pan_compiler.h b/src/panfrost/compiler/pan_compiler.h index e29c9158474..68d3ab256e8 100644 --- a/src/panfrost/compiler/pan_compiler.h +++ b/src/panfrost/compiler/pan_compiler.h @@ -315,6 +315,23 @@ struct pan_shader_info { } cs; }; + struct { + /* True if a pilot shader is used */ + bool enable; + /* If the pilot shader is enabled, number of work registers used by + * the pilot shader + */ + uint16_t work_reg_count; + /* If the pilot shader is enabled, the pilot shader's offset in the + * program binary. + */ + uint32_t offset; + /* If the pilot shader is enabled, bit mask of preloaded registers + * used by the varying shader + */ + uint64_t preload; + } pilot; + /* Does the shader contains a barrier? or (for fragment shaders) does it * require helper invocations, which demand the same ordering guarantees * of the hardware? 
These notions are unified in the hardware, so we diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h index a7afdf97724..fc3d46ec200 100644 --- a/src/panfrost/compiler/pan_nir.h +++ b/src/panfrost/compiler/pan_nir.h @@ -60,6 +60,26 @@ bool pan_nir_lower_framebuffer(nir_shader *shader, unsigned blend_shader_nr_samples, bool broken_ld_special); +struct pan_nir_opt_pilot_params { + unsigned gpu_id; + + /* Offset in the FAU where the address to the FAU memory lives. This is + * used for writing FAUs from the pilot shader. + */ + uint32_t fau_addr_offset_B; + + /* Offset in the FAU where the pilot data should go */ + uint32_t fau_pilot_data_offset_B; + + /* Amount of space in the FAU available for pilot data */ + uint32_t fau_pilot_data_space_B; + + /* Amount of actual pilot data consumed. This is an output parameter */ + uint32_t pilot_data_size_B; +}; + +bool pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p); + uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s); /* Specify the mediump lowering behavior for pan_nir_collect_varyings */ diff --git a/src/panfrost/compiler/pan_nir_lower_writeout.c b/src/panfrost/compiler/pan_nir_lower_writeout.c index bfe16713bc8..6e52d8a9e45 100644 --- a/src/panfrost/compiler/pan_nir_lower_writeout.c +++ b/src/panfrost/compiler/pan_nir_lower_writeout.c @@ -93,144 +93,151 @@ kill_depth_stencil_writes(nir_builder *b, nir_intrinsic_instr *intr, return true; } +static bool +lower_zs_store_impl(nir_function_impl *impl) +{ + nir_intrinsic_instr *stores[3] = {NULL}; + nir_intrinsic_instr *last_mask_store = NULL; + nir_block *mask_block = NULL; + unsigned writeout = 0; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location == FRAG_RESULT_DEPTH) { + stores[0] = intr; + writeout |= PAN_WRITEOUT_Z; + } else if (sem.location == FRAG_RESULT_STENCIL) { + stores[1] = intr; + writeout |= PAN_WRITEOUT_S; + } else if (sem.dual_source_blend_index) { + assert(!stores[2]); /* there should be only 1 source for dual blending */ + stores[2] = intr; + writeout |= PAN_WRITEOUT_2; + } else if (sem.location == FRAG_RESULT_SAMPLE_MASK) { + last_mask_store = intr; + mask_block = intr->instr.block; + } + } + } + + if (!writeout && !last_mask_store) + return nir_no_progress(impl); + + nir_block *common_block = mask_block; + + /* Ensure all stores are in the same block */ + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (!stores[i]) + continue; + + nir_block *block = stores[i]->instr.block; + + if (common_block) + assert(common_block == block); + else + common_block = block; + } + + /* move data stores in the common block to after the last mask store */ + if (last_mask_store) { + nir_cursor insert_point = nir_after_instr(&last_mask_store->instr); + nir_foreach_instr_safe(instr, mask_block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + /* stop when we've reached the last store to mask */ + if (intr == last_mask_store) + break; + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location >= FRAG_RESULT_DATA0 && + sem.location <= FRAG_RESULT_DATA7) { + nir_instr_move(insert_point, 
instr); + insert_point = nir_after_instr(instr); + } + } + } + + bool replaced = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + + if (sem.location < FRAG_RESULT_DATA0) + continue; + + if (sem.dual_source_blend_index) + continue; + + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); + + nir_builder b = + nir_builder_at(nir_after_block_before_jump(instr->block)); + + /* Trying to write depth twice results in the + * wrong blend shader being executed on + * Midgard */ + unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); + + pan_nir_emit_combined_store(&b, intr, this_store, stores); + + nir_instr_remove(instr); + + replaced = true; + } + } + + /* Insert a store to the depth RT (0xff) if needed */ + if (!replaced) { + nir_builder b = + nir_builder_at(nir_after_block_before_jump(common_block)); + + pan_nir_emit_combined_store(&b, NULL, writeout, stores); + } + + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (stores[i]) + nir_instr_remove(&stores[i]->instr); + } + + return nir_progress(true, impl, nir_metadata_control_flow); +} + bool pan_nir_lower_zs_store(nir_shader *nir) { bool progress = false; - if (nir->info.stage != MESA_SHADER_FRAGMENT) + if (nir->info.stage != MESA_SHADER_FRAGMENT) { + nir_shader_preserve_all_metadata(nir); return false; + } /* Remove all stencil/depth writes if early fragment test is forced. */ if (nir->info.fs.early_fragment_tests) progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes, nir_metadata_control_flow, NULL); - nir_foreach_function_impl(impl, nir) { - nir_intrinsic_instr *stores[3] = {NULL}; - nir_intrinsic_instr *last_mask_store = NULL; - nir_block *mask_block = NULL; - unsigned writeout = 0; - - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location == FRAG_RESULT_DEPTH) { - stores[0] = intr; - writeout |= PAN_WRITEOUT_Z; - } else if (sem.location == FRAG_RESULT_STENCIL) { - stores[1] = intr; - writeout |= PAN_WRITEOUT_S; - } else if (sem.dual_source_blend_index) { - assert(!stores[2]); /* there should be only 1 source for dual blending */ - stores[2] = intr; - writeout |= PAN_WRITEOUT_2; - } else if (sem.location == FRAG_RESULT_SAMPLE_MASK) { - last_mask_store = intr; - mask_block = intr->instr.block; - } - } - } - - if (!writeout && !last_mask_store) - continue; - - nir_block *common_block = mask_block; - - /* Ensure all stores are in the same block */ - for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { - if (!stores[i]) - continue; - - nir_block *block = stores[i]->instr.block; - - if (common_block) - assert(common_block == block); - else - common_block = block; - } - - /* move data stores in the common block to after the last mask store */ - if (last_mask_store) { - nir_cursor insert_point = nir_after_instr(&last_mask_store->instr); - nir_foreach_instr_safe(instr, mask_block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - /* stop when we've reached the 
last store to mask */ - if (intr == last_mask_store) - break; - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location >= FRAG_RESULT_DATA0 && - sem.location <= FRAG_RESULT_DATA7) { - nir_instr_move(insert_point, instr); - insert_point = nir_after_instr(instr); - } - } - } - - bool replaced = false; - - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - - if (sem.location < FRAG_RESULT_DATA0) - continue; - - if (sem.dual_source_blend_index) - continue; - - assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - - nir_builder b = - nir_builder_at(nir_after_block_before_jump(instr->block)); - - /* Trying to write depth twice results in the - * wrong blend shader being executed on - * Midgard */ - unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); - - pan_nir_emit_combined_store(&b, intr, this_store, stores); - - nir_instr_remove(instr); - - replaced = true; - } - } - - /* Insert a store to the depth RT (0xff) if needed */ - if (!replaced) { - nir_builder b = - nir_builder_at(nir_after_block_before_jump(common_block)); - - pan_nir_emit_combined_store(&b, NULL, writeout, stores); - } - - for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { - if (stores[i]) - nir_instr_remove(&stores[i]->instr); - } - - progress = nir_progress(true, impl, nir_metadata_control_flow); - } + nir_foreach_function_impl(impl, nir) + progress |= lower_zs_store_impl(impl); return progress; } diff --git a/src/panfrost/compiler/pan_nir_opt_pilot.c b/src/panfrost/compiler/pan_nir_opt_pilot.c new file mode 100644 index 00000000000..698e833997e --- /dev/null +++ b/src/panfrost/compiler/pan_nir_opt_pilot.c @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2025 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "panfrost/model/pan_model.h" +#include "pan_nir.h" + +static void +pilot_def_size(nir_def *def, unsigned *size, unsigned *alignment, + nir_preamble_class *class) +{ + unsigned bit_size = def->bit_size == 1 ? 
32 : def->bit_size;
+   unsigned bits = bit_size * def->num_components;
+   *size = align(bits / 8, 4);
+   *alignment = 4;
+   *class = nir_preamble_class_general;
+}
+
+static float
+pilot_instr_cost(nir_instr *instr, const void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      if (alu->def.bit_size == 64 || alu->src[0].src.ssa->bit_size == 64)
+         return 10.0;
+
+      /* Shrug */
+      return 2.0;
+   }
+
+   case nir_instr_type_tex:
+      return 20.0;
+
+   case nir_instr_type_intrinsic:
+      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_global:
+      case nir_intrinsic_load_global_constant:
+      case nir_intrinsic_load_ssbo:
+         return 20.0;
+      default:
+         /* Assume it's a sysval or something */
+         return 0.0;
+      }
+
+   case nir_instr_type_load_const:
+   case nir_instr_type_undef:
+      return 0.0;
+
+   default:
+      return 1.0;
+   }
+}
+
+static float
+pilot_rewrite_cost(nir_def *def, const void *data)
+{
+   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
+   unsigned bits = bit_size * def->num_components;
+   unsigned dwords = DIV_ROUND_UP(bits, 32);
+
+   bool needs_mov = false;
+   nir_foreach_use(use, def) {
+      nir_instr *parent_instr = nir_src_parent_instr(use);
+      switch (parent_instr->type) {
+      case nir_instr_type_tex:
+      case nir_instr_type_intrinsic:
+         needs_mov = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* It's not a mov but booleans require a conversion */
+   if (def->bit_size == 1)
+      needs_mov = true;
+
+   /* We want rewrites to always cost something or else we'll pull constants
+    * and undefs into the pilot shader.
+    */
+   return needs_mov ? dwords * 2.0 : 1.0;
+}
+
+static bool
+pilot_avoid_instr(const nir_instr *instr, const void *data)
+{
+   return false;
+}
+
+static bool
+lower_pilot_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   const struct pan_nir_opt_pilot_params *p = data;
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_subgroup_size: {
+      b->cursor = nir_before_instr(&intr->instr);
+      unsigned sg_size = pan_subgroup_size(pan_arch(p->gpu_id));
+      nir_def_replace(&intr->def, nir_imm_int(b, sg_size));
+      return true;
+   }
+
+   case nir_intrinsic_load_preamble: {
+      b->cursor = nir_before_instr(&intr->instr);
+
+      uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
+      assert(offset % 4 == 0);
+
+      unsigned bit_size = intr->def.bit_size == 1 ?
32 : intr->def.bit_size; + unsigned bits = bit_size * intr->def.num_components; + nir_def *val = nir_load_push_constant(b, + DIV_ROUND_UP(bits, 32), 32, nir_imm_int(b, offset)); + + val = nir_bitcast_vector(b, val, bit_size); + val = nir_trim_vector(b, val, intr->def.num_components); + + if (intr->def.bit_size == 1) + val = nir_b2b1(b, val); + + nir_def_replace(&intr->def, val); + return true; + } + + case nir_intrinsic_store_preamble: { + b->cursor = nir_before_instr(&intr->instr); + + nir_def *val = intr->src[0].ssa; + if (val->bit_size == 1) + val = nir_b2b32(b, val); + + nir_def *push_const_buf = nir_load_push_constant( + b, 1, 64, nir_imm_int(b, p->fau_addr_offset_B)); + + uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr); + + nir_store_global(b, val, nir_iadd_imm(b, push_const_buf, offset), + .align_mul = 4); + + nir_instr_remove(&intr->instr); + return true; + } + + default: + return false; + } +} + +bool +pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p) +{ + bool progress = false; + + const nir_opt_preamble_options preamble_opts = { + .subgroup_size_uniform = true, + .def_size = pilot_def_size, + .preamble_storage_size = { + [nir_preamble_class_general] = p->fau_pilot_data_space_B, + }, + .instr_cost_cb = pilot_instr_cost, + .rewrite_cost_cb = pilot_rewrite_cost, + .avoid_instr_cb = pilot_avoid_instr, + }; + NIR_PASS(progress, nir, nir_opt_preamble, &preamble_opts, + &p->pilot_data_size_B); + if (!progress) + return false; + + NIR_PASS(progress, nir, nir_shader_intrinsics_pass, lower_pilot_intr, + nir_metadata_control_flow, p); + + return progress; +} diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index e1be606f699..99c259dcffd 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -172,9 +172,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) if (result != VK_SUCCESS) return; - result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); + if (result != VK_SUCCESS) + return; + } if (compute_state_dirty(cmdbuf, CS) || compute_state_dirty(cmdbuf, DESC_STATE)) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index b0c56c5f0aa..195f19604c7 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -452,6 +452,89 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf) return VK_SUCCESS; } +static void +launch_pilot(struct panvk_cmd_buffer *cmdbuf, + const struct panvk_shader_variant *s, + struct panvk_shader_desc_state *desc_state, + uint64_t push_uniforms) +{ + if (!s->info.pilot.enable) + return; + + const struct pan_compute_dim job_size = { 1, 1, 1 }; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + cs_update_compute_ctx(b) { + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SRT_2), + desc_state->res_table); + + uint64_t fau_ptr = push_uniforms | ((uint64_t)s->fau.total_count << 56); + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_2), fau_ptr); + + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_2), + panvk_priv_mem_dev_addr(s->pilot_spd)); + + /* TSD_2 is set in update_tls() */ + + struct mali_compute_size_workgroup_packed wg_size; + pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) { + 
cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + cfg.allow_merging_workgroups = true; + } + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, WG_SIZE), wg_size.opaque[0]); + + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_X), 0); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Y), 0); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Z), 0); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), job_size.x); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Y), job_size.y); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), job_size.z); + } + + const struct cs_tracing_ctx *tracing_ctx = + &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing; + cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), + 1, MALI_TASK_AXIS_X, cs_shader_res_sel(2, 2, 2, 2)); + +#if PAN_ARCH >= 11 + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index add_val = cs_scratch_reg64(b, 2); + + cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); + cs_move64_to(b, add_val, 1); + panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true, + MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, + cs_defer_indirect()); +#else + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index iter_sb = cs_scratch_reg32(b, 2); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); + struct cs_index add_val = cs_scratch_reg64(b, 4); + + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); + cs_move64_to(b, add_val, 1); + + cs_match_iter_sb(b, x, iter_sb, cmp_scratch) { + panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true, + MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); + } +#endif + + ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point; +} + static bool has_depth_att(struct panvk_cmd_buffer *cmdbuf) { @@ -596,6 +679,8 @@ update_tls(struct panvk_cmd_buffer *cmdbuf) panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader); struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + struct cs_builder *cs_b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); if (!cmdbuf->state.gfx.tsd) { if (!state->desc.gpu) { @@ -616,6 +701,10 @@ update_tls(struct panvk_cmd_buffer *cmdbuf) cs_move64_to(b, cs_sr_reg64(b, IDVS, TSD_0), state->desc.gpu); #endif } + + cs_update_compute_ctx(cs_b) { + cs_move64_to(cs_b, cs_sr_reg64(b, COMPUTE, TSD_2), state->desc.gpu); + } } state->info.tls.size = @@ -1764,7 +1853,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf, } } - if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) { + bool fs_pilot = fs && fs->info.pilot.enable; + if (fs_user_dirty(cmdbuf) || + (fs_pilot && gfx_state_dirty(cmdbuf, DESC_STATE)) || + gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) { uint64_t fau_ptr = 0; if (fs) { @@ -1772,6 +1864,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf, if (result != VK_SUCCESS) return result; + if (fs->info.pilot.enable) + launch_pilot(cmdbuf, fs, &cmdbuf->state.gfx.fs.desc, + cmdbuf->state.gfx.fs.push_uniforms); + fau_ptr = cmdbuf->state.gfx.fs.push_uniforms | ((uint64_t)fs->fau.total_count << 56); } @@ -2310,10 +2406,6 @@ prepare_draw(struct panvk_cmd_buffer 
*cmdbuf, struct panvk_draw_info *draw) panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw); - result = prepare_push_uniforms(cmdbuf, draw); - if (result != VK_SUCCESS) - return result; - result = prepare_vs(cmdbuf, draw); if (result != VK_SUCCESS) return result; @@ -2322,6 +2414,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) if (result != VK_SUCCESS) return result; + result = prepare_push_uniforms(cmdbuf, draw); + if (result != VK_SUCCESS) + return result; + /* Assumes 16 byte slots. We could do better. */ uint32_t varying_size = get_varying_slots(cmdbuf) * 16; @@ -3069,6 +3165,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; + /* Wait on any pilot shaders */ + const struct panvk_cs_deps deps = { + .src[PANVK_SUBQUEUE_COMPUTE] = { + .wait_sb_mask = dev->csf.sb.all_iters_mask, + }, + .dst[PANVK_SUBQUEUE_FRAGMENT] = { + .wait_subqueue_mask = BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE), + }, + }; + panvk_per_arch(emit_barrier)(cmdbuf, deps); + /* Now initialize the fragment bits. */ cs_update_frag_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c index 50e09514f58..b55e92f22c5 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c @@ -103,9 +103,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info); - result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); + if (result != VK_SUCCESS) + return; + } struct pan_ptr copy_desc_job = {0}; diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c index e381fd614a8..7d524342373 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c @@ -1471,12 +1471,14 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); } - result = panvk_per_arch(cmd_prepare_push_uniforms)( - cmdbuf, vs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, vs, 1); + if (result != VK_SUCCESS) + return; + } - if (fs) { + if (fs && compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, fs, 1); if (result != VK_SUCCESS) @@ -1633,12 +1635,14 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); } - result = panvk_per_arch(cmd_prepare_push_uniforms)( - cmdbuf, vs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, vs, 1); + if (result != VK_SUCCESS) + return; + } - if (fs) { + if (fs && compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, fs, 1); if (result != VK_SUCCESS) diff --git a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h index 924af49e638..1f77174680d 100644 --- a/src/panfrost/vulkan/panvk_shader.h +++ 
b/src/panfrost/vulkan/panvk_shader.h @@ -348,6 +348,8 @@ struct panvk_shader_fau_info { BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS); BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS); uint32_t sysval_count; + uint32_t push_count; + uint32_t pilot_count; uint32_t total_count; }; @@ -416,6 +418,7 @@ struct panvk_shader_variant { #endif } spds; }; + struct panvk_priv_mem pilot_spd; #endif const char *nir_str; diff --git a/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c b/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c index 9428bc9f950..3553ecd7c02 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c @@ -17,18 +17,12 @@ panvk_per_arch(cmd_prepare_push_uniforms)( switch (shader->info.stage) { case MESA_SHADER_COMPUTE: - if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) - return VK_SUCCESS; push_ptr = &cmdbuf->state.compute.push_uniforms; break; case MESA_SHADER_VERTEX: - if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) - return VK_SUCCESS; push_ptr = &cmdbuf->state.gfx.vs.push_uniforms; break; case MESA_SHADER_FRAGMENT: - if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) - return VK_SUCCESS; push_ptr = &cmdbuf->state.gfx.fs.push_uniforms; break; default: @@ -78,6 +72,9 @@ panvk_per_arch(cmd_prepare_push_uniforms)( BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS) faus[fau++] = push_consts[w]; + /* These are populated by the pilot shader */ + fau += shader->fau.pilot_count; + for (uint32_t i = 0; i < shader->info.fau_consts_count; i += 2) { faus[fau++] = (uint64_t)shader->info.fau_consts[i + 1] << 32 | shader->info.fau_consts[i]; diff --git a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c index 91376e3ab83..b6a5d89e48a 100644 --- a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c +++ b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c @@ -569,7 +569,7 @@ load_resource_deref_desc(nir_builder *b, nir_deref_instr *deref, b, num_components, bit_size, nir_imm_int(b, pan_res_handle(VALHALL_RESOURCE_TABLE_IDX, set + 1)), set_offset, .range = ~0u, .align_mul = PANVK_DESCRIPTOR_SIZE, - .align_offset = desc_offset); + .align_offset = desc_offset, .access = ACCESS_CAN_SPECULATE); #endif } @@ -1162,6 +1162,9 @@ static bool collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, struct lower_desc_ctx *ctx) { + /* We'll also set can_speculate here. 
*/ + tex->can_speculate = true; + bool recorded = false; uint32_t plane = 0; int sampler_src_idx = @@ -1184,6 +1187,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, struct panvk_subdesc_info subdesc = get_sampler_subdesc_info(binding_layout->type, plane); + if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT) + tex->can_speculate = false; + record_binding(ctx, set, binding, subdesc, max_idx); recorded = true; } @@ -1203,6 +1209,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, struct panvk_subdesc_info subdesc = get_tex_subdesc_info(binding_layout->type, plane); + if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT) + tex->can_speculate = false; + record_binding(ctx, set, binding, subdesc, max_idx); recorded = true; } @@ -1210,11 +1219,66 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, return recorded; } +static bool +mark_load_can_speculate(nir_intrinsic_instr *load, + const struct lower_desc_ctx *ctx) +{ + assert(load->intrinsic == nir_intrinsic_load_deref); + + /* If it's already marked CAN_SPECULATE, there's nothing to do */ + enum gl_access_qualifier access = nir_intrinsic_access(load); + if (access & ACCESS_CAN_SPECULATE) + return false; + + nir_deref_instr *deref = nir_src_as_deref(load->src[0]); + if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo)) + return false; + + if (nir_deref_mode_is(deref, nir_var_mem_ssbo) && + ((access & ACCESS_VOLATILE) || !(access & ACCESS_CAN_REORDER))) + return false; + + while (true) { + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (parent == NULL) + break; + deref = parent; + } + assert(deref->deref_type == nir_deref_type_cast); + + nir_intrinsic_instr *desc_load = nir_src_as_intrinsic(deref->parent); + if (desc_load == NULL || + desc_load->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + nir_intrinsic_instr *desc_index = nir_src_as_intrinsic(desc_load->src[0]); + if (desc_index == NULL || + desc_index->intrinsic != nir_intrinsic_vulkan_resource_index) + return false; + + unsigned set = nir_intrinsic_desc_set(desc_index); + unsigned binding = nir_intrinsic_binding(desc_index); + const struct panvk_descriptor_set_layout *set_layout = + get_set_layout(set, ctx); + const struct panvk_descriptor_set_binding_layout *bind_layout = + &set_layout->bindings[binding]; + + if (bind_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT) + return false; + + access |= ACCESS_CAN_SPECULATE; + nir_intrinsic_set_access(load, access); + return true; +} + static bool collect_intr_desc_access(nir_builder *b, nir_intrinsic_instr *intrin, struct lower_desc_ctx *ctx) { switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + return mark_load_can_speculate(intrin, ctx); + case nir_intrinsic_vulkan_resource_index: { unsigned set, binding; @@ -1338,6 +1402,9 @@ panvk_per_arch(nir_lower_descriptors)( for (uint32_t i = 0; i < set_layout_count; i++) ctx.set_layouts[i] = to_panvk_descriptor_set_layout(set_layouts[i]); + /* Collect descriptor access. This also marks texture and UBO/SSBO loads + * as can_speculate based on descriptor binding flags. 
+ */ NIR_PASS(progress, nir, nir_shader_instructions_pass, collect_instr_desc_access, nir_metadata_all, &ctx); if (!progress) diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 60dd109f215..e271fc849b8 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -65,6 +65,12 @@ #include "vk_shader.h" #include "vk_util.h" +static inline bool +panvk_wants_pilot_shader(mesa_shader_stage stage, unsigned arch) +{ + return arch >= 9 && stage == MESA_SHADER_FRAGMENT; +} + #define FAU_WORD_COUNT 64 struct panvk_lower_sysvals_context { @@ -700,6 +706,9 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader) shader_use_sysval(shader, graphics, blend.constants); } + if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) + shader_use_sysval(shader, common, push_uniforms); + progress = false; NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant, nir_metadata_all, shader); @@ -717,8 +726,8 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader) /* 32 FAUs (256 bytes) are reserved for API push constants */ assert(shader->fau.sysval_count <= FAU_WORD_COUNT - 32 && "too many sysval FAUs"); - shader->fau.total_count = - shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts); + shader->fau.push_count = BITSET_COUNT(shader->fau.used_push_consts); + shader->fau.total_count = shader->fau.sysval_count + shader->fau.push_count; assert(shader->fau.total_count <= FAU_WORD_COUNT && "asking for more FAUs than the hardware has to offer"); @@ -925,6 +934,26 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir, lower_load_push_consts(nir, shader); + if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) { + /* Our pilots will be more efficient if we optimize first. */ + pan_optimize_nir(nir, input.gpu_id); + + struct pan_nir_opt_pilot_params pilot = { + .gpu_id = input.gpu_id, + .fau_addr_offset_B = shader_remapped_sysval_offset( + shader, sysval_offset(common, push_uniforms)), + .fau_pilot_data_offset_B = + FAU_WORD_SIZE * shader->fau.total_count, + .fau_pilot_data_space_B = + FAU_WORD_SIZE * (FAU_WORD_COUNT - shader->fau.total_count), + }; + pan_nir_opt_pilot(nir, &pilot); + + shader->fau.pilot_count = + DIV_ROUND_UP(pilot.pilot_data_size_B, FAU_WORD_SIZE); + shader->fau.total_count += shader->fau.pilot_count; + } + /* Allow the remaining FAU space to be filled with constants. */ input.fau_consts.max_amount = 2 * (FAU_WORD_COUNT - shader->fau.total_count); @@ -1208,6 +1237,23 @@ panvk_shader_upload(struct panvk_device *dev, } #endif } + + if (shader->info.pilot.enable) { + shader->pilot_spd = + panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM); + if (!panvk_priv_mem_check_alloc(shader->pilot_spd)) + return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + panvk_priv_mem_write_desc(shader->pilot_spd, 0, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_COMPUTE, + cfg.register_allocation = + pan_register_allocation(shader->info.pilot.work_reg_count); + cfg.binary = panvk_shader_variant_get_dev_addr(shader) + + shader->info.pilot.offset; + cfg.preload.r48_r63 = (shader->info.pilot.preload >> 48); + cfg.flush_to_zero_mode = shader_ftz_mode(shader); + } + } #endif return VK_SUCCESS; @@ -1237,6 +1283,7 @@ panvk_shader_variant_destroy(struct panvk_shader_variant *shader) panvk_pool_free_mem(&shader->spds.pos_triangles); #endif } + panvk_pool_free_mem(&shader->pilot_spd); #endif if (shader->own_bin)
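
For reference, the sketch below condenses how the new nir_steal_function() helper is meant to be used together with nir_opt_preamble(), following the bifrost_compile_shader_nir() hunk above: the preamble function is re-homed into a freshly created single-invocation compute shader, which is then compiled as the pilot binary. This is an illustrative sketch, not part of the patch; the helper name make_pilot_shader() is hypothetical.

#include <string.h>
#include "compiler/nir/nir.h"

/* Hypothetical helper, mirroring the pattern in bifrost_compile_shader_nir()
 * above; not part of this patch.
 */
static nir_shader *
make_pilot_shader(nir_shader *nir)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   if (!entrypoint->preamble)
      return NULL;

   /* The pilot runs as a 1x1x1 compute job ahead of the main shader. */
   nir_shader *pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options);
   pilot->info = nir->info;
   pilot->info.stage = MESA_SHADER_COMPUTE;
   pilot->info.workgroup_size[0] = 1;
   pilot->info.workgroup_size[1] = 1;
   pilot->info.workgroup_size[2] = 1;
   memset(&pilot->info.cs, 0, sizeof(pilot->info.cs));

   /* nir_steal_function() moves the function wholesale; it asserts that the
    * preamble does not call other functions, so no dependency handling is
    * needed here.
    */
   nir_function *pilot_main = entrypoint->preamble;
   nir_steal_function(pilot, pilot_main);
   entrypoint->preamble = NULL;

   pilot_main->is_preamble = false;
   pilot_main->is_entrypoint = true;

   return pilot;
}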