Merge branch 'panvk/pilot' into 'main'

Draft: Very draft! Panvk pilot shaders

See merge request mesa/mesa!39009
Faith Ekstrand 2025-12-20 01:47:24 +00:00
commit 600fdfbc09
18 changed files with 697 additions and 166 deletions

View file

@ -6623,6 +6623,8 @@ bool nir_opt_load_skip_helpers(nir_shader *shader, nir_opt_load_skip_helpers_opt
void nir_sweep(nir_shader *shader);
void nir_steal_function(nir_shader *nir, nir_function *f);
nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val);
gl_system_value nir_system_value_from_intrinsic(nir_intrinsic_op intrin);

View file

@ -167,7 +167,6 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
case nir_intrinsic_load_fb_layers_v3d:
case nir_intrinsic_load_fep_w_v3d:
case nir_intrinsic_load_tcs_num_patches_amd:
case nir_intrinsic_load_sample_positions_pan:
case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
case nir_intrinsic_load_prim_gen_query_enabled_amd:
case nir_intrinsic_load_prim_xfb_query_enabled_amd:
@ -233,6 +232,8 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
case nir_intrinsic_load_constant_agx:
case nir_intrinsic_bindless_image_agx:
case nir_intrinsic_bindless_sampler_agx:
case nir_intrinsic_load_texel_buf_conv_pan:
case nir_intrinsic_load_texel_buf_index_address_pan:
return can_move_srcs(&instr->instr, ctx);
/* Image/SSBO loads can be moved if they are CAN_REORDER and their
@ -240,6 +241,7 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
*/
case nir_intrinsic_image_load:
case nir_intrinsic_image_samples_identical:
case nir_intrinsic_image_texel_address:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_load_global_bounded:
case nir_intrinsic_load_ssbo:
@ -813,7 +815,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
if (num_candidates == 0) {
free(ctx.states);
return false;
return nir_no_progress(impl);
}
def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
@ -880,7 +882,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
if (num_candidates == 0) {
free(ctx.states);
free(candidates);
return false;
return nir_no_progress(impl);
}
/* Step 4: Figure out which candidates we're going to replace and assign an
@ -940,6 +942,12 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
if (analyze_speculation_for_cf_list(&ctx, &impl->body))
analyze_reconstructed(&ctx, impl);
if (!BITSET_TEST_RANGE(ctx.reconstructed_defs, 0, impl->ssa_alloc)) {
free(ctx.states);
free(ctx.reconstructed_defs);
return nir_no_progress(impl);
}
/* Step 5: Actually do the replacement. */
struct hash_table *remap_table =
_mesa_pointer_hash_table_create(NULL);
@ -980,11 +988,10 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
}
}
nir_progress(true, impl, nir_metadata_control_flow);
ralloc_free(remap_table);
free(ctx.states);
free(ctx.reconstructed_defs);
_mesa_set_destroy(ctx.reconstructed_ifs, NULL);
return true;
return nir_progress(true, impl, nir_metadata_control_flow);
}

View file

@ -185,6 +185,29 @@ sweep_function(nir_shader *nir, nir_function *f)
sweep_impl(nir, f->impl);
}
/** Steals a function from one shader into another.
*
* This knows nothing about dependencies so if this function calls other
* functions, you'll need a more complex pass that uses nir_function_clone()
* to steal them all.
*/
void
nir_steal_function(nir_shader *nir, nir_function *f)
{
#ifndef NDEBUG
if (f->impl) {
nir_foreach_block(block, f->impl) {
nir_foreach_instr(instr, block) {
assert(instr->type != nir_instr_type_call);
}
}
}
#endif
exec_node_remove(&f->node);
exec_list_push_tail(&nir->functions, &f->node);
sweep_function(nir, f);
}
void
nir_sweep(nir_shader *nir)
{

View file

@ -6504,7 +6504,7 @@ bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id)
}
static int
compare_u32(const void* a, const void* b, void* _)
compare_u32(const void* a, const void* b)
{
const uint32_t va = (uintptr_t)a;
const uint32_t vb = (uintptr_t)b;
@ -6673,7 +6673,7 @@ bi_compile_variant_nir(nir_shader *nir,
sorted[idx++] = (uintptr_t)entry.data;
}
util_qsort_r(sorted, const_amount, sizeof(uint32_t), compare_u32, NULL);
qsort(sorted, const_amount, sizeof(uint32_t), compare_u32);
uint32_t max_amount = MIN2(const_amount, ctx->inputs->fau_consts.max_amount);
uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0;
ralloc_free(sorted);
@ -6837,7 +6837,7 @@ bi_compile_variant(nir_shader *nir,
/* Software invariant: Only a secondary shader can appear at a nonzero
* offset, to keep the ABI simple. */
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING || idvs == BI_IDVS_PILOT));
struct pan_stats *stats =
idvs == BI_IDVS_VARYING ? &info->stats_idvs_varying : &info->stats;
@ -6874,6 +6874,11 @@ bi_compile_variant(nir_shader *nir,
info->vs.secondary_offset = offset;
info->vs.secondary_preload = preload;
info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
} else if (idvs == BI_IDVS_PILOT) {
info->pilot.enable = (binary->size > offset);
info->pilot.offset = offset;
info->pilot.preload = preload;
info->pilot.work_reg_count = ctx->info.work_reg_count;
} else {
info->preload = preload;
info->work_reg_count = ctx->info.work_reg_count;
@ -6996,6 +7001,26 @@ bifrost_compile_shader_nir(nir_shader *nir,
info->tls_size = nir->scratch_size;
nir_shader *pilot = NULL;
nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
if (entrypoint->preamble) {
pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options, NULL);
pilot->info = nir->info;
pilot->info.stage = MESA_SHADER_COMPUTE;
pilot->info.workgroup_size[0] = 1;
pilot->info.workgroup_size[1] = 1;
pilot->info.workgroup_size[2] = 1;
memset(&pilot->info.cs, 0, sizeof(pilot->info.cs));
nir_function *pilot_main = entrypoint->preamble;
nir_steal_function(pilot, pilot_main);
entrypoint->preamble = NULL;
pilot_main->is_preamble = false;
pilot_main->is_entrypoint = true;
}
pan_nir_collect_varyings(nir, info, PAN_MEDIUMP_VARY_32BIT);
if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) {
@ -7010,6 +7035,9 @@ bifrost_compile_shader_nir(nir_shader *nir,
bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
}
if (pilot)
bi_compile_variant(pilot, inputs, binary, info, BI_IDVS_PILOT);
if (mesa_shader_stage_is_compute(nir->info.stage)) {
/* Workgroups may be merged if the structure of the workgroup is
* not software visible. This is true if neither shared memory

View file

@ -1055,6 +1055,9 @@ enum bi_idvs_mode {
/* IDVS2 in use. Compiling a deferred shader (v12+) */
BI_IDVS_ALL = 3,
/* This is a pilot shader, and not a position or vertex shader */
BI_IDVS_PILOT = 4,
};
#define BI_MAX_REGS 64

View file

@ -16,6 +16,7 @@ libpanfrost_compiler_files = files(
'pan_nir_lower_vertex_id.c',
'pan_nir_lower_writeout.c',
'pan_nir_lower_xfb.c',
'pan_nir_opt_pilot.c',
)
subdir('bifrost')

View file

@ -315,6 +315,23 @@ struct pan_shader_info {
} cs;
};
struct {
/* True if a pilot shader is used */
bool enable;
/* If the pilot shader is enabled, number of work registers used by
* the pilot shader
*/
uint16_t work_reg_count;
/* If the pilot shader is enabled, the pilot shader's offset in the
* program binary.
*/
uint32_t offset;
/* If the pilot shader is enabled, bit mask of preloaded registers
* used by the pilot shader
*/
uint64_t preload;
} pilot;
/* Does the shader contains a barrier? or (for fragment shaders) does it
* require helper invocations, which demand the same ordering guarantees
* of the hardware? These notions are unified in the hardware, so we

View file

@ -60,6 +60,26 @@ bool pan_nir_lower_framebuffer(nir_shader *shader,
unsigned blend_shader_nr_samples,
bool broken_ld_special);
struct pan_nir_opt_pilot_params {
unsigned gpu_id;
/* Offset in the FAU where the address to the FAU memory lives. This is
* used for writing FAUs from the pilot shader.
*/
uint32_t fau_addr_offset_B;
/* Offset in the FAU where the pilot data should go */
uint32_t fau_pilot_data_offset_B;
/* Amount of space in the FAU available for pilot data */
uint32_t fau_pilot_data_space_B;
/* Amount of actual pilot data consumed. This is an output parameter */
uint32_t pilot_data_size_B;
};
bool pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p);
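
A rough usage sketch of the convention these fields describe (illustrative only, not part of this header; gpu_id, used_fau_words and push_uniforms_sysval_offset_B are hypothetical locals, and the 8-byte FAU word size is taken from the 32-FAU/256-byte note elsewhere in this MR): the caller hands the pass whatever FAU space is left after its own sysvals and push constants, then reads back how much of it the pilot consumed.

/* Illustrative sketch only -- the real call site lives in panvk. */
struct pan_nir_opt_pilot_params params = {
   .gpu_id = gpu_id,
   .fau_addr_offset_B = push_uniforms_sysval_offset_B,
   .fau_pilot_data_offset_B = used_fau_words * 8,
   .fau_pilot_data_space_B = (64 - used_fau_words) * 8,
};
if (pan_nir_opt_pilot(nir, &params)) {
   /* Reserve this many FAU words for data written by the pilot shader. */
   unsigned pilot_fau_words = DIV_ROUND_UP(params.pilot_data_size_B, 8);
}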
uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s);
/* Specify the mediump lowering behavior for pan_nir_collect_varyings */

View file

@ -93,20 +93,9 @@ kill_depth_stencil_writes(nir_builder *b, nir_intrinsic_instr *intr,
return true;
}
bool
pan_nir_lower_zs_store(nir_shader *nir)
static bool
lower_zs_store_impl(nir_function_impl *impl)
{
bool progress = false;
if (nir->info.stage != MESA_SHADER_FRAGMENT)
return false;
/* Remove all stencil/depth writes if early fragment test is forced. */
if (nir->info.fs.early_fragment_tests)
progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes,
nir_metadata_control_flow, NULL);
nir_foreach_function_impl(impl, nir) {
nir_intrinsic_instr *stores[3] = {NULL};
nir_intrinsic_instr *last_mask_store = NULL;
nir_block *mask_block = NULL;
@ -140,7 +129,7 @@ pan_nir_lower_zs_store(nir_shader *nir)
}
if (!writeout && !last_mask_store)
continue;
return nir_no_progress(impl);
nir_block *common_block = mask_block;
@ -229,8 +218,26 @@ pan_nir_lower_zs_store(nir_shader *nir)
nir_instr_remove(&stores[i]->instr);
}
progress = nir_progress(true, impl, nir_metadata_control_flow);
return nir_progress(true, impl, nir_metadata_control_flow);
}
bool
pan_nir_lower_zs_store(nir_shader *nir)
{
bool progress = false;
if (nir->info.stage != MESA_SHADER_FRAGMENT) {
nir_shader_preserve_all_metadata(nir);
return false;
}
/* Remove all stencil/depth writes if early fragment test is forced. */
if (nir->info.fs.early_fragment_tests)
progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes,
nir_metadata_control_flow, NULL);
nir_foreach_function_impl(impl, nir)
progress |= lower_zs_store_impl(impl);
return progress;
}

View file

@ -0,0 +1,194 @@
/*
* Copyright (C) 2025 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "panfrost/model/pan_model.h"
#include "pan_nir.h"
static void
pilot_def_size(nir_def *def, unsigned *size, unsigned *alignment,
nir_preamble_class *class)
{
unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
unsigned bits = bit_size * def->num_components;
*size = align(bits / 8, 4);
*alignment = 4;
*class = nir_preamble_class_general;
}
static float
pilot_instr_cost(nir_instr *instr, const void *data)
{
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
if (alu->def.bit_size == 64 || alu->src[0].src.ssa->bit_size == 64)
return 10.0;
/* Shrug */
return 2.0;
}
case nir_instr_type_tex:
return 20.0;
case nir_instr_type_intrinsic:
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_ssbo:
return 20.0;
default:
/* Assume it's a sysval or something */
return 0.0;
}
case nir_instr_type_load_const:
case nir_instr_type_undef:
return 0.0;
default:
return 1.0;
}
}
static float
pilot_rewrite_cost(nir_def *def, const void *data)
{
unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
unsigned bits = bit_size * def->num_components;
unsigned dwords = DIV_ROUND_UP(bits, 32);
bool needs_mov = false;
nir_foreach_use(use, def) {
nir_instr *parent_instr = nir_src_parent_instr(use);
switch (parent_instr->type) {
case nir_instr_type_tex:
case nir_instr_type_intrinsic:
needs_mov = true;
break;
default:
break;
}
}
/* It's not a mov but booleans require a conversion */
if (def->bit_size == 1)
needs_mov = true;
/* We want rewrites to always cost something or else we'll pull constants
* and undefs into the pilot shader.
*/
return needs_mov ? dwords * 2.0 : 1.0;
}
static bool
pilot_avoid_instr(const nir_instr *instr, const void *data)
{
return false;
}
static bool
lower_pilot_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
const struct pan_nir_opt_pilot_params *p = data;
switch (intr->intrinsic) {
case nir_intrinsic_load_subgroup_size: {
b->cursor = nir_before_instr(&intr->instr);
unsigned sg_size = pan_subgroup_size(pan_arch(p->gpu_id));
nir_def_replace(&intr->def, nir_imm_int(b, sg_size));
return true;
}
case nir_intrinsic_load_preamble: {
b->cursor = nir_before_instr(&intr->instr);
uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
assert(offset % 4 == 0);
unsigned bit_size = intr->def.bit_size == 1 ? 32 : intr->def.bit_size;
unsigned bits = bit_size * intr->def.num_components;
nir_def *val = nir_load_push_constant(b,
DIV_ROUND_UP(bits, 32), 32, nir_imm_int(b, offset));
val = nir_bitcast_vector(b, val, bit_size);
val = nir_trim_vector(b, val, intr->def.num_components);
if (intr->def.bit_size == 1)
val = nir_b2b1(b, val);
nir_def_replace(&intr->def, val);
return true;
}
case nir_intrinsic_store_preamble: {
b->cursor = nir_before_instr(&intr->instr);
nir_def *val = intr->src[0].ssa;
if (val->bit_size == 1)
val = nir_b2b32(b, val);
nir_def *push_const_buf = nir_load_push_constant(
b, 1, 64, nir_imm_int(b, p->fau_addr_offset_B));
uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
nir_store_global(b, val, nir_iadd_imm(b, push_const_buf, offset),
.align_mul = 4);
nir_instr_remove(&intr->instr);
return true;
}
default:
return false;
}
}
bool
pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p)
{
bool progress = false;
const nir_opt_preamble_options preamble_opts = {
.subgroup_size_uniform = true,
.def_size = pilot_def_size,
.preamble_storage_size = {
[nir_preamble_class_general] = p->fau_pilot_data_space_B,
},
.instr_cost_cb = pilot_instr_cost,
.rewrite_cost_cb = pilot_rewrite_cost,
.avoid_instr_cb = pilot_avoid_instr,
};
NIR_PASS(progress, nir, nir_opt_preamble, &preamble_opts,
&p->pilot_data_size_B);
if (!progress)
return false;
NIR_PASS(progress, nir, nir_shader_intrinsics_pass, lower_pilot_intr,
nir_metadata_control_flow, p);
return progress;
}

View file

@ -172,9 +172,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
if (result != VK_SUCCESS)
return;
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
if (result != VK_SUCCESS)
return;
}
if (compute_state_dirty(cmdbuf, CS) ||
compute_state_dirty(cmdbuf, DESC_STATE)) {

View file

@ -452,6 +452,89 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
return VK_SUCCESS;
}
static void
launch_pilot(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_shader_variant *s,
struct panvk_shader_desc_state *desc_state,
uint64_t push_uniforms)
{
if (!s->info.pilot.enable)
return;
const struct pan_compute_dim job_size = { 1, 1, 1 };
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
cs_update_compute_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SRT_2),
desc_state->res_table);
uint64_t fau_ptr = push_uniforms | ((uint64_t)s->fau.total_count << 56);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_2), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_2),
panvk_priv_mem_dev_addr(s->pilot_spd));
/* TSD_2 is set in update_tls() */
struct mali_compute_size_workgroup_packed wg_size;
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
cfg.workgroup_size_x = 1;
cfg.workgroup_size_y = 1;
cfg.workgroup_size_z = 1;
cfg.allow_merging_workgroups = true;
}
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, WG_SIZE), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_X), 0);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Y), 0);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Z), 0);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), job_size.x);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Y), job_size.y);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), job_size.z);
}
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1, MALI_TASK_AXIS_X, cs_shader_res_sel(2, 2, 2, 2));
#if PAN_ARCH >= 11
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index add_val = cs_scratch_reg64(b, 2);
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
cs_move64_to(b, add_val, 1);
panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true,
MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
cs_defer_indirect());
#else
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index iter_sb = cs_scratch_reg32(b, 2);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
struct cs_index add_val = cs_scratch_reg64(b, 4);
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
cs_move64_to(b, add_val, 1);
cs_match_iter_sb(b, x, iter_sb, cmp_scratch) {
panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true,
MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));
}
#endif
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
}
static bool
has_depth_att(struct panvk_cmd_buffer *cmdbuf)
{
@ -596,6 +679,8 @@ update_tls(struct panvk_cmd_buffer *cmdbuf)
panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader);
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
struct cs_builder *cs_b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
if (!cmdbuf->state.gfx.tsd) {
if (!state->desc.gpu) {
@ -616,6 +701,10 @@ update_tls(struct panvk_cmd_buffer *cmdbuf)
cs_move64_to(b, cs_sr_reg64(b, IDVS, TSD_0), state->desc.gpu);
#endif
}
cs_update_compute_ctx(cs_b) {
cs_move64_to(cs_b, cs_sr_reg64(cs_b, COMPUTE, TSD_2), state->desc.gpu);
}
}
state->info.tls.size =
@ -1764,7 +1853,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
}
}
if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
bool fs_pilot = fs && fs->info.pilot.enable;
if (fs_user_dirty(cmdbuf) ||
(fs_pilot && gfx_state_dirty(cmdbuf, DESC_STATE)) ||
gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
uint64_t fau_ptr = 0;
if (fs) {
@ -1772,6 +1864,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
if (result != VK_SUCCESS)
return result;
if (fs->info.pilot.enable)
launch_pilot(cmdbuf, fs, &cmdbuf->state.gfx.fs.desc,
cmdbuf->state.gfx.fs.push_uniforms);
fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
((uint64_t)fs->fau.total_count << 56);
}
@ -2310,10 +2406,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
result = prepare_push_uniforms(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
result = prepare_vs(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
@ -2322,6 +2414,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
if (result != VK_SUCCESS)
return result;
result = prepare_push_uniforms(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
/* Assumes 16 byte slots. We could do better. */
uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
@ -3069,6 +3165,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
/* Wait on any pilot shaders */
const struct panvk_cs_deps deps = {
.src[PANVK_SUBQUEUE_COMPUTE] = {
.wait_sb_mask = dev->csf.sb.all_iters_mask,
},
.dst[PANVK_SUBQUEUE_FRAGMENT] = {
.wait_subqueue_mask = BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE),
},
};
panvk_per_arch(emit_barrier)(cmdbuf, deps);
/* Now initialize the fragment bits. */
cs_update_frag_ctx(b) {
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),

View file

@ -103,9 +103,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
if (result != VK_SUCCESS)
return;
}
struct pan_ptr copy_desc_job = {0};

View file

@ -1471,12 +1471,14 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw)
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
}
if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, vs, 1);
if (result != VK_SUCCESS)
return;
}
if (fs) {
if (fs && gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, fs, 1);
if (result != VK_SUCCESS)
@ -1633,12 +1635,14 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
}
if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, vs, 1);
if (result != VK_SUCCESS)
return;
}
if (fs) {
if (fs && gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, fs, 1);
if (result != VK_SUCCESS)

View file

@ -348,6 +348,8 @@ struct panvk_shader_fau_info {
BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
uint32_t sysval_count;
uint32_t push_count;
uint32_t pilot_count;
uint32_t total_count;
};
@ -416,6 +418,7 @@ struct panvk_shader_variant {
#endif
} spds;
};
struct panvk_priv_mem pilot_spd;
#endif
const char *nir_str;

View file

@ -17,18 +17,12 @@ panvk_per_arch(cmd_prepare_push_uniforms)(
switch (shader->info.stage) {
case MESA_SHADER_COMPUTE:
if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.compute.push_uniforms;
break;
case MESA_SHADER_VERTEX:
if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.gfx.vs.push_uniforms;
break;
case MESA_SHADER_FRAGMENT:
if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.gfx.fs.push_uniforms;
break;
default:
@ -78,6 +72,9 @@ panvk_per_arch(cmd_prepare_push_uniforms)(
BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS)
faus[fau++] = push_consts[w];
/* These are populated by the pilot shader */
fau += shader->fau.pilot_count;
for (uint32_t i = 0; i < shader->info.fau_consts_count; i += 2) {
faus[fau++] = (uint64_t)shader->info.fau_consts[i + 1] << 32 |
shader->info.fau_consts[i];
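
Taken with the sysval and push-constant loops just above, this fixes the layout of the FAU buffer; a sketch of the resulting order (one 8-byte FAU word per slot, the word size being implied by the 32-FAU/256-byte reservation elsewhere in this MR):

/* FAU buffer layout as packed here (sketch):
 *
 *   [0 .. sysval_count)        remapped sysvals
 *   [.. + push_count)          API push constants
 *   [.. + pilot_count)         gap skipped here; filled at runtime by the
 *                              pilot shader's lowered store_preamble writes
 *   [.. + fau_consts_count/2)  compiler-promoted inline constants
 */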

View file

@ -569,7 +569,7 @@ load_resource_deref_desc(nir_builder *b, nir_deref_instr *deref,
b, num_components, bit_size,
nir_imm_int(b, pan_res_handle(VALHALL_RESOURCE_TABLE_IDX, set + 1)),
set_offset, .range = ~0u, .align_mul = PANVK_DESCRIPTOR_SIZE,
.align_offset = desc_offset);
.align_offset = desc_offset, .access = ACCESS_CAN_SPECULATE);
#endif
}
@ -1162,6 +1162,9 @@ static bool
collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
struct lower_desc_ctx *ctx)
{
/* We'll also set can_speculate here. */
tex->can_speculate = true;
bool recorded = false;
uint32_t plane = 0;
int sampler_src_idx =
@ -1184,6 +1187,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
struct panvk_subdesc_info subdesc =
get_sampler_subdesc_info(binding_layout->type, plane);
if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
tex->can_speculate = false;
record_binding(ctx, set, binding, subdesc, max_idx);
recorded = true;
}
@ -1203,6 +1209,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
struct panvk_subdesc_info subdesc =
get_tex_subdesc_info(binding_layout->type, plane);
if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
tex->can_speculate = false;
record_binding(ctx, set, binding, subdesc, max_idx);
recorded = true;
}
@ -1210,11 +1219,66 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
return recorded;
}
static bool
mark_load_can_speculate(nir_intrinsic_instr *load,
const struct lower_desc_ctx *ctx)
{
assert(load->intrinsic == nir_intrinsic_load_deref);
/* If it's already marked CAN_SPECULATE, there's nothing to do */
enum gl_access_qualifier access = nir_intrinsic_access(load);
if (access & ACCESS_CAN_SPECULATE)
return false;
nir_deref_instr *deref = nir_src_as_deref(load->src[0]);
if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo))
return false;
if (nir_deref_mode_is(deref, nir_var_mem_ssbo) &&
((access & ACCESS_VOLATILE) || !(access & ACCESS_CAN_REORDER)))
return false;
while (true) {
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent == NULL)
break;
deref = parent;
}
assert(deref->deref_type == nir_deref_type_cast);
nir_intrinsic_instr *desc_load = nir_src_as_intrinsic(deref->parent);
if (desc_load == NULL ||
desc_load->intrinsic != nir_intrinsic_load_vulkan_descriptor)
return false;
nir_intrinsic_instr *desc_index = nir_src_as_intrinsic(desc_load->src[0]);
if (desc_index == NULL ||
desc_index->intrinsic != nir_intrinsic_vulkan_resource_index)
return false;
unsigned set = nir_intrinsic_desc_set(desc_index);
unsigned binding = nir_intrinsic_binding(desc_index);
const struct panvk_descriptor_set_layout *set_layout =
get_set_layout(set, ctx);
const struct panvk_descriptor_set_binding_layout *bind_layout =
&set_layout->bindings[binding];
if (bind_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
return false;
access |= ACCESS_CAN_SPECULATE;
nir_intrinsic_set_access(load, access);
return true;
}
static bool
collect_intr_desc_access(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_desc_ctx *ctx)
{
switch (intrin->intrinsic) {
case nir_intrinsic_load_deref:
return mark_load_can_speculate(intrin, ctx);
case nir_intrinsic_vulkan_resource_index: {
unsigned set, binding;
@ -1338,6 +1402,9 @@ panvk_per_arch(nir_lower_descriptors)(
for (uint32_t i = 0; i < set_layout_count; i++)
ctx.set_layouts[i] = to_panvk_descriptor_set_layout(set_layouts[i]);
/* Collect descriptor access. This also marks texture and UBO/SSBO loads
* as can_speculate based on descriptor binding flags.
*/
NIR_PASS(progress, nir, nir_shader_instructions_pass,
collect_instr_desc_access, nir_metadata_all, &ctx);
if (!progress)

View file

@ -65,6 +65,12 @@
#include "vk_shader.h"
#include "vk_util.h"
static inline bool
panvk_wants_pilot_shader(mesa_shader_stage stage, unsigned arch)
{
return arch >= 9 && stage == MESA_SHADER_FRAGMENT;
}
#define FAU_WORD_COUNT 64
struct panvk_lower_sysvals_context {
@ -700,6 +706,9 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
shader_use_sysval(shader, graphics, blend.constants);
}
if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH))
shader_use_sysval(shader, common, push_uniforms);
progress = false;
NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
nir_metadata_all, shader);
@ -717,8 +726,8 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
/* 32 FAUs (256 bytes) are reserved for API push constants */
assert(shader->fau.sysval_count <= FAU_WORD_COUNT - 32 &&
"too many sysval FAUs");
shader->fau.total_count =
shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);
shader->fau.push_count = BITSET_COUNT(shader->fau.used_push_consts);
shader->fau.total_count = shader->fau.sysval_count + shader->fau.push_count;
assert(shader->fau.total_count <= FAU_WORD_COUNT &&
"asking for more FAUs than the hardware has to offer");
@ -925,6 +934,26 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
lower_load_push_consts(nir, shader);
if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) {
/* Our pilots will be more efficient if we optimize first. */
pan_optimize_nir(nir, input.gpu_id);
struct pan_nir_opt_pilot_params pilot = {
.gpu_id = input.gpu_id,
.fau_addr_offset_B = shader_remapped_sysval_offset(
shader, sysval_offset(common, push_uniforms)),
.fau_pilot_data_offset_B =
FAU_WORD_SIZE * shader->fau.total_count,
.fau_pilot_data_space_B =
FAU_WORD_SIZE * (FAU_WORD_COUNT - shader->fau.total_count),
};
pan_nir_opt_pilot(nir, &pilot);
shader->fau.pilot_count =
DIV_ROUND_UP(pilot.pilot_data_size_B, FAU_WORD_SIZE);
shader->fau.total_count += shader->fau.pilot_count;
}
/* Allow the remaining FAU space to be filled with constants. */
input.fau_consts.max_amount =
2 * (FAU_WORD_COUNT - shader->fau.total_count);
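
As a worked example of this accounting (hypothetical counts; 8-byte FAU words as implied by the 32-FAU/256-byte reservation above): with 10 sysval words and 4 push-constant words, fau.total_count is 14, so pilot data starts at byte 112 with (64 - 14) * 8 = 400 bytes of space. If the pilot pass then reports pilot_data_size_B = 36, pilot_count = DIV_ROUND_UP(36, 8) = 5, total_count grows to 19, and fau_consts.max_amount still allows 2 * (64 - 19) = 90 promoted 32-bit constants in the remaining words.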
@ -1208,6 +1237,23 @@ panvk_shader_upload(struct panvk_device *dev,
}
#endif
}
if (shader->info.pilot.enable) {
shader->pilot_spd =
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
if (!panvk_priv_mem_check_alloc(shader->pilot_spd))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
panvk_priv_mem_write_desc(shader->pilot_spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_COMPUTE;
cfg.register_allocation =
pan_register_allocation(shader->info.pilot.work_reg_count);
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
shader->info.pilot.offset;
cfg.preload.r48_r63 = (shader->info.pilot.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
}
#endif
return VK_SUCCESS;
@ -1237,6 +1283,7 @@ panvk_shader_variant_destroy(struct panvk_shader_variant *shader)
panvk_pool_free_mem(&shader->spds.pos_triangles);
#endif
}
panvk_pool_free_mem(&shader->pilot_spd);
#endif
if (shader->own_bin)