diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c index c88cfb629de..4024a10b114 100644 --- a/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -129,7 +129,7 @@ void bi_opt_push_ubo(bi_context *ctx) { struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); - bi_pick_ubo(&ctx->info->push, &analysis); + bi_pick_ubo(ctx->info.push, &analysis); ctx->ubo_mask = 0; @@ -165,7 +165,7 @@ bi_opt_push_ubo(bi_context *ctx) for (unsigned w = 0; w < channels; ++w) { /* FAU is grouped in pairs (2 x 4-byte) */ unsigned base = - pan_lookup_pushed_ubo(&ctx->info->push, ubo, + pan_lookup_pushed_ubo(ctx->info.push, ubo, (offset + 4 * w)); unsigned fau_idx = (base >> 1); diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c index e326a1f7e06..c5dbdebd2ee 100644 --- a/src/panfrost/bifrost/bi_pack.c +++ b/src/panfrost/bifrost/bi_pack.c @@ -702,11 +702,11 @@ bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission, unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0; - assert(loc < ARRAY_SIZE(ctx->info->bifrost.blend)); - assert(!ctx->info->bifrost.blend[loc].return_offset); - ctx->info->bifrost.blend[loc].return_offset = + assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend)); + assert(!ctx->info.bifrost->blend[loc].return_offset); + ctx->info.bifrost->blend[loc].return_offset = util_dynarray_num_elements(emission, uint8_t); - assert(!(ctx->info->bifrost.blend[loc].return_offset & 0x7)); + assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7)); } unsigned diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c index 55bd55eb676..e2b3e6caaf4 100644 --- a/src/panfrost/bifrost/bi_ra.c +++ b/src/panfrost/bifrost/bi_ra.c @@ -520,7 +520,7 @@ bi_register_allocate(bi_context *ctx) unsigned iter_count = 1000; /* max iterations */ /* Number of bytes of memory we've spilled into */ - unsigned spill_count = ctx->info->tls_size; + unsigned spill_count = ctx->info.tls_size; /* Try with reduced register pressure to improve thread count on v7 */ if (ctx->arch == 7) { @@ -528,7 +528,7 @@ bi_register_allocate(bi_context *ctx) l = bi_allocate_registers(ctx, &success, false); if (success) { - ctx->info->work_reg_count = 32; + ctx->info.work_reg_count = 32; } else { lcra_free(l); l = NULL; @@ -541,7 +541,7 @@ bi_register_allocate(bi_context *ctx) l = bi_allocate_registers(ctx, &success, true); if (success) { - ctx->info->work_reg_count = 64; + ctx->info.work_reg_count = 64; } else { signed spill_node = bi_choose_spill_node(ctx, l); lcra_free(l); @@ -559,7 +559,7 @@ bi_register_allocate(bi_context *ctx) assert(success); assert(l != NULL); - ctx->info->tls_size = spill_count; + ctx->info.tls_size = spill_count; bi_install_registers(ctx, l); lcra_free(l); diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 7a1714693b9..e9163f92551 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -414,7 +414,7 @@ bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos); unsigned uniform = pan_lookup_sysval(b->shader->sysval_to_id, - &b->shader->info->sysvals, + b->shader->info.sysvals, sysval); unsigned idx = (uniform * 16) + offset; @@ -535,7 +535,7 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt) } assert(rt < 8); - b->shader->info->bifrost.blend[rt].type = T; + b->shader->info.bifrost->blend[rt].type = T; } /* Blend shaders do not need to run ATEST since they are dependent on a @@ -601,7 +601,7 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) for (unsigned i = 0; i < count; ++i) bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i)); - b->shader->info->bifrost.blend_src1_type = + b->shader->info.bifrost->blend_src1_type = nir_intrinsic_src_type(instr); return; @@ -3190,7 +3190,7 @@ bi_print_stats(bi_context *ctx, unsigned size, FILE *fp) float cycles_bound = MAX2(cycles_arith, cycles_message); /* Thread count and register pressure are traded off only on v7 */ - bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32); + bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); unsigned nr_threads = full_threads ? 2 : 1; /* Dump stats */ @@ -3663,7 +3663,7 @@ bi_lower_branch(bi_block *block) } static void -bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary) +bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) { unsigned final_clause = bi_pack(ctx, binary); @@ -3673,15 +3673,15 @@ bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary) bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); unsigned first_deps = first_clause ? first_clause->dependencies : 0; - ctx->info->bifrost.wait_6 = (first_deps & (1 << 6)); - ctx->info->bifrost.wait_7 = (first_deps & (1 << 7)); + ctx->info.bifrost->wait_6 = (first_deps & (1 << 6)); + ctx->info.bifrost->wait_7 = (first_deps & (1 << 7)); /* Pad the shader with enough zero bytes to trick the prefetcher, * unless we're compiling an empty shader (in which case we don't pad * so the size remains 0) */ unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; - if (binary->size) { + if (binary->size - offset) { memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0, prefetch_size); } @@ -3743,25 +3743,27 @@ bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) NIR_PASS_V(nir, pan_nir_reorder_writeout); } -void -bifrost_compile_shader_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info) +static bi_context * +bi_compile_variant_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct hash_table_u64 *sysval_to_id, + struct bi_shader_info info, + enum bi_idvs_mode idvs) { - bifrost_debug = debug_get_option_bifrost_debug(); - - bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); - bi_context *ctx = rzalloc(NULL, bi_context); - ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx); + /* There may be another program in the dynarray, start at the end */ + unsigned offset = binary->size; + + ctx->sysval_to_id = sysval_to_id; ctx->inputs = inputs; ctx->nir = nir; - ctx->info = info; ctx->stage = nir->info.stage; ctx->quirks = bifrost_get_quirks(inputs->gpu_id); ctx->arch = inputs->gpu_id >> 12; + ctx->info = info; + ctx->idvs = idvs; /* If nothing is pushed, all UBOs need to be uploaded */ ctx->ubo_mask = ~0; @@ -3775,8 +3777,6 @@ bifrost_compile_shader_nir(nir_shader *nir, nir_print_shader(nir, stdout); } - info->tls_size = nir->scratch_size; - nir_foreach_function(func, nir) { if (!func->impl) continue; @@ -3875,33 +3875,69 @@ bifrost_compile_shader_nir(nir_shader *nir, /* Analyze after scheduling since we depend on instruction order. */ bi_analyze_helper_terminate(ctx); - /* A register is preloaded <==> it is live before the first block */ - bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); - info->preload = first_block->reg_live_in; - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) bi_print_shader(ctx, stdout); if (ctx->arch <= 8) { - bi_pack_clauses(ctx, binary); + bi_pack_clauses(ctx, binary, offset); } else { /* TODO: pack flat */ } - info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos); - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { - disassemble_bifrost(stdout, binary->data, binary->size, + disassemble_bifrost(stdout, binary->data + offset, binary->size - offset, bifrost_debug & BIFROST_DBG_VERBOSE); fflush(stdout); } if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) && !skip_internal) { - bi_print_stats(ctx, binary->size, stderr); + bi_print_stats(ctx, binary->size - offset, stderr); + } + + return ctx; +} + +static void +bi_compile_variant(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct hash_table_u64 *sysval_to_id, + struct pan_shader_info *info, + enum bi_idvs_mode idvs) +{ + struct bi_shader_info local_info = { + .push = &info->push, + .bifrost = &info->bifrost, + .tls_size = info->tls_size, + .sysvals = &info->sysvals + }; + + unsigned offset = binary->size; + + /* Software invariant: Only a secondary shader can appear at a nonzero + * offset, to keep the ABI simple. */ + assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); + + bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs); + + /* A register is preloaded <==> it is live before the first block */ + bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); + uint64_t preload = first_block->reg_live_in; + + info->ubo_mask |= ctx->ubo_mask; + info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); + + if (idvs == BI_IDVS_VARYING) { + info->vs.secondary_enable = (binary->size > offset); + info->vs.secondary_offset = offset; + info->vs.secondary_preload = preload; + info->vs.secondary_work_reg_count = ctx->info.work_reg_count; + } else { + info->preload = preload; + info->work_reg_count = ctx->info.work_reg_count; } - _mesa_hash_table_u64_destroy(ctx->sysval_to_id); ralloc_free(ctx); } @@ -3941,3 +3977,29 @@ bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs) /* Otherwise, IDVS is usually better */ return true; } + +void +bifrost_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info) +{ + bifrost_debug = debug_get_option_bifrost_debug(); + + bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); + struct hash_table_u64 *sysval_to_id = panfrost_init_sysvals(&info->sysvals, NULL); + + info->tls_size = nir->scratch_size; + info->vs.idvs = bi_should_idvs(nir, inputs); + + if (info->vs.idvs) { + bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION); + bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING); + } else { + bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE); + } + + info->ubo_mask &= BITSET_MASK(nir->info.num_ubos); + + _mesa_hash_table_u64_destroy(sysval_to_id); +} diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 01712a8462c..7bc88a9fd85 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -619,6 +619,15 @@ typedef struct bi_block { uint8_t pass_flags; } bi_block; +/* Subset of pan_shader_info needed per-variant, in order to support IDVS */ +struct bi_shader_info { + struct panfrost_ubo_push *push; + struct bifrost_shader_info *bifrost; + struct panfrost_sysvals *sysvals; + unsigned tls_size; + unsigned work_reg_count; +}; + /* State of index-driven vertex shading for current shader */ enum bi_idvs_mode { /* IDVS not in use */ @@ -634,7 +643,7 @@ enum bi_idvs_mode { typedef struct { const struct panfrost_compile_inputs *inputs; nir_shader *nir; - struct pan_shader_info *info; + struct bi_shader_info info; gl_shader_stage stage; struct list_head blocks; /* list of bi_block */ struct hash_table_u64 *sysval_to_id;