mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 11:28:05 +02:00
pan/bi: Specialize shaders for IDVS
We need to compile multiple variants and report them together in a common shader info. To do so, we split off per-variant shader infos and combine them at the end.
glmark2 is very happy: https://people.collabora.com/~alyssa/idvs-g52.txt
Highlights include -bshading with 41% higher fps and -bbump:bump-render=high-poly running 62% faster.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14154>
This commit is contained in:
parent
c59977c3bf
commit
1d21de788d
5 changed files with 113 additions and 42 deletions
|
|
@ -129,7 +129,7 @@ void
|
||||||
bi_opt_push_ubo(bi_context *ctx)
|
bi_opt_push_ubo(bi_context *ctx)
|
||||||
{
|
{
|
||||||
struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
|
struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
|
||||||
bi_pick_ubo(&ctx->info->push, &analysis);
|
bi_pick_ubo(ctx->info.push, &analysis);
|
||||||
|
|
||||||
ctx->ubo_mask = 0;
|
ctx->ubo_mask = 0;
|
||||||
|
|
||||||
|
|
@ -165,7 +165,7 @@ bi_opt_push_ubo(bi_context *ctx)
|
||||||
for (unsigned w = 0; w < channels; ++w) {
|
for (unsigned w = 0; w < channels; ++w) {
|
||||||
/* FAU is grouped in pairs (2 x 4-byte) */
|
/* FAU is grouped in pairs (2 x 4-byte) */
|
||||||
unsigned base =
|
unsigned base =
|
||||||
pan_lookup_pushed_ubo(&ctx->info->push, ubo,
|
pan_lookup_pushed_ubo(ctx->info.push, ubo,
|
||||||
(offset + 4 * w));
|
(offset + 4 * w));
|
||||||
|
|
||||||
unsigned fau_idx = (base >> 1);
|
unsigned fau_idx = (base >> 1);
|
||||||
|
|
|
||||||
|
|
@ -702,11 +702,11 @@ bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
|
||||||
|
|
||||||
|
|
||||||
unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0;
|
unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0;
|
||||||
assert(loc < ARRAY_SIZE(ctx->info->bifrost.blend));
|
assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend));
|
||||||
assert(!ctx->info->bifrost.blend[loc].return_offset);
|
assert(!ctx->info.bifrost->blend[loc].return_offset);
|
||||||
ctx->info->bifrost.blend[loc].return_offset =
|
ctx->info.bifrost->blend[loc].return_offset =
|
||||||
util_dynarray_num_elements(emission, uint8_t);
|
util_dynarray_num_elements(emission, uint8_t);
|
||||||
assert(!(ctx->info->bifrost.blend[loc].return_offset & 0x7));
|
assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned
|
unsigned
|
||||||
|
|
|
||||||
|
|
@ -520,7 +520,7 @@ bi_register_allocate(bi_context *ctx)
|
||||||
unsigned iter_count = 1000; /* max iterations */
|
unsigned iter_count = 1000; /* max iterations */
|
||||||
|
|
||||||
/* Number of bytes of memory we've spilled into */
|
/* Number of bytes of memory we've spilled into */
|
||||||
unsigned spill_count = ctx->info->tls_size;
|
unsigned spill_count = ctx->info.tls_size;
|
||||||
|
|
||||||
/* Try with reduced register pressure to improve thread count on v7 */
|
/* Try with reduced register pressure to improve thread count on v7 */
|
||||||
if (ctx->arch == 7) {
|
if (ctx->arch == 7) {
|
||||||
|
|
@ -528,7 +528,7 @@ bi_register_allocate(bi_context *ctx)
|
||||||
l = bi_allocate_registers(ctx, &success, false);
|
l = bi_allocate_registers(ctx, &success, false);
|
||||||
|
|
||||||
if (success) {
|
if (success) {
|
||||||
ctx->info->work_reg_count = 32;
|
ctx->info.work_reg_count = 32;
|
||||||
} else {
|
} else {
|
||||||
lcra_free(l);
|
lcra_free(l);
|
||||||
l = NULL;
|
l = NULL;
|
||||||
|
|
@ -541,7 +541,7 @@ bi_register_allocate(bi_context *ctx)
|
||||||
l = bi_allocate_registers(ctx, &success, true);
|
l = bi_allocate_registers(ctx, &success, true);
|
||||||
|
|
||||||
if (success) {
|
if (success) {
|
||||||
ctx->info->work_reg_count = 64;
|
ctx->info.work_reg_count = 64;
|
||||||
} else {
|
} else {
|
||||||
signed spill_node = bi_choose_spill_node(ctx, l);
|
signed spill_node = bi_choose_spill_node(ctx, l);
|
||||||
lcra_free(l);
|
lcra_free(l);
|
||||||
|
|
@ -559,7 +559,7 @@ bi_register_allocate(bi_context *ctx)
|
||||||
assert(success);
|
assert(success);
|
||||||
assert(l != NULL);
|
assert(l != NULL);
|
||||||
|
|
||||||
ctx->info->tls_size = spill_count;
|
ctx->info.tls_size = spill_count;
|
||||||
bi_install_registers(ctx, l);
|
bi_install_registers(ctx, l);
|
||||||
|
|
||||||
lcra_free(l);
|
lcra_free(l);
|
||||||
|
|
|
||||||
|
|
@ -414,7 +414,7 @@ bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
|
||||||
MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
|
MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
|
||||||
unsigned uniform =
|
unsigned uniform =
|
||||||
pan_lookup_sysval(b->shader->sysval_to_id,
|
pan_lookup_sysval(b->shader->sysval_to_id,
|
||||||
&b->shader->info->sysvals,
|
b->shader->info.sysvals,
|
||||||
sysval);
|
sysval);
|
||||||
unsigned idx = (uniform * 16) + offset;
|
unsigned idx = (uniform * 16) + offset;
|
||||||
|
|
||||||
|
|
@ -535,7 +535,7 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(rt < 8);
|
assert(rt < 8);
|
||||||
b->shader->info->bifrost.blend[rt].type = T;
|
b->shader->info.bifrost->blend[rt].type = T;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Blend shaders do not need to run ATEST since they are dependent on a
|
/* Blend shaders do not need to run ATEST since they are dependent on a
|
||||||
|
|
@ -601,7 +601,7 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
for (unsigned i = 0; i < count; ++i)
|
for (unsigned i = 0; i < count; ++i)
|
||||||
bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));
|
bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));
|
||||||
|
|
||||||
b->shader->info->bifrost.blend_src1_type =
|
b->shader->info.bifrost->blend_src1_type =
|
||||||
nir_intrinsic_src_type(instr);
|
nir_intrinsic_src_type(instr);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
|
@ -3190,7 +3190,7 @@ bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
|
||||||
float cycles_bound = MAX2(cycles_arith, cycles_message);
|
float cycles_bound = MAX2(cycles_arith, cycles_message);
|
||||||
|
|
||||||
/* Thread count and register pressure are traded off only on v7 */
|
/* Thread count and register pressure are traded off only on v7 */
|
||||||
bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32);
|
bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
|
||||||
unsigned nr_threads = full_threads ? 2 : 1;
|
unsigned nr_threads = full_threads ? 2 : 1;
|
||||||
|
|
||||||
/* Dump stats */
|
/* Dump stats */
|
||||||
|
|
@ -3663,7 +3663,7 @@ bi_lower_branch(bi_block *block)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary)
|
bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
|
||||||
{
|
{
|
||||||
unsigned final_clause = bi_pack(ctx, binary);
|
unsigned final_clause = bi_pack(ctx, binary);
|
||||||
|
|
||||||
|
|
@ -3673,15 +3673,15 @@ bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary)
|
||||||
bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
|
bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
|
||||||
|
|
||||||
unsigned first_deps = first_clause ? first_clause->dependencies : 0;
|
unsigned first_deps = first_clause ? first_clause->dependencies : 0;
|
||||||
ctx->info->bifrost.wait_6 = (first_deps & (1 << 6));
|
ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
|
||||||
ctx->info->bifrost.wait_7 = (first_deps & (1 << 7));
|
ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
|
||||||
|
|
||||||
/* Pad the shader with enough zero bytes to trick the prefetcher,
|
/* Pad the shader with enough zero bytes to trick the prefetcher,
|
||||||
* unless we're compiling an empty shader (in which case we don't pad
|
* unless we're compiling an empty shader (in which case we don't pad
|
||||||
* so the size remains 0) */
|
* so the size remains 0) */
|
||||||
unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
|
unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
|
||||||
|
|
||||||
if (binary->size) {
|
if (binary->size - offset) {
|
||||||
memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
|
memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
|
||||||
0, prefetch_size);
|
0, prefetch_size);
|
||||||
}
|
}
|
||||||
|
|
@ -3743,25 +3743,27 @@ bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
|
||||||
NIR_PASS_V(nir, pan_nir_reorder_writeout);
|
NIR_PASS_V(nir, pan_nir_reorder_writeout);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
static bi_context *
|
||||||
bifrost_compile_shader_nir(nir_shader *nir,
|
bi_compile_variant_nir(nir_shader *nir,
|
||||||
const struct panfrost_compile_inputs *inputs,
|
const struct panfrost_compile_inputs *inputs,
|
||||||
struct util_dynarray *binary,
|
struct util_dynarray *binary,
|
||||||
struct pan_shader_info *info)
|
struct hash_table_u64 *sysval_to_id,
|
||||||
|
struct bi_shader_info info,
|
||||||
|
enum bi_idvs_mode idvs)
|
||||||
{
|
{
|
||||||
bifrost_debug = debug_get_option_bifrost_debug();
|
|
||||||
|
|
||||||
bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend);
|
|
||||||
|
|
||||||
bi_context *ctx = rzalloc(NULL, bi_context);
|
bi_context *ctx = rzalloc(NULL, bi_context);
|
||||||
ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);
|
|
||||||
|
|
||||||
|
/* There may be another program in the dynarray, start at the end */
|
||||||
|
unsigned offset = binary->size;
|
||||||
|
|
||||||
|
ctx->sysval_to_id = sysval_to_id;
|
||||||
ctx->inputs = inputs;
|
ctx->inputs = inputs;
|
||||||
ctx->nir = nir;
|
ctx->nir = nir;
|
||||||
ctx->info = info;
|
|
||||||
ctx->stage = nir->info.stage;
|
ctx->stage = nir->info.stage;
|
||||||
ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
|
ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
|
||||||
ctx->arch = inputs->gpu_id >> 12;
|
ctx->arch = inputs->gpu_id >> 12;
|
||||||
|
ctx->info = info;
|
||||||
|
ctx->idvs = idvs;
|
||||||
|
|
||||||
/* If nothing is pushed, all UBOs need to be uploaded */
|
/* If nothing is pushed, all UBOs need to be uploaded */
|
||||||
ctx->ubo_mask = ~0;
|
ctx->ubo_mask = ~0;
|
||||||
|
|
@ -3775,8 +3777,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
|
||||||
nir_print_shader(nir, stdout);
|
nir_print_shader(nir, stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
info->tls_size = nir->scratch_size;
|
|
||||||
|
|
||||||
nir_foreach_function(func, nir) {
|
nir_foreach_function(func, nir) {
|
||||||
if (!func->impl)
|
if (!func->impl)
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -3875,33 +3875,69 @@ bifrost_compile_shader_nir(nir_shader *nir,
|
||||||
/* Analyze after scheduling since we depend on instruction order. */
|
/* Analyze after scheduling since we depend on instruction order. */
|
||||||
bi_analyze_helper_terminate(ctx);
|
bi_analyze_helper_terminate(ctx);
|
||||||
|
|
||||||
/* A register is preloaded <==> it is live before the first block */
|
|
||||||
bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
|
|
||||||
info->preload = first_block->reg_live_in;
|
|
||||||
|
|
||||||
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
|
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
|
||||||
bi_print_shader(ctx, stdout);
|
bi_print_shader(ctx, stdout);
|
||||||
|
|
||||||
if (ctx->arch <= 8) {
|
if (ctx->arch <= 8) {
|
||||||
bi_pack_clauses(ctx, binary);
|
bi_pack_clauses(ctx, binary, offset);
|
||||||
} else {
|
} else {
|
||||||
/* TODO: pack flat */
|
/* TODO: pack flat */
|
||||||
}
|
}
|
||||||
|
|
||||||
info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);
|
|
||||||
|
|
||||||
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
|
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
|
||||||
disassemble_bifrost(stdout, binary->data, binary->size,
|
disassemble_bifrost(stdout, binary->data + offset, binary->size - offset,
|
||||||
bifrost_debug & BIFROST_DBG_VERBOSE);
|
bifrost_debug & BIFROST_DBG_VERBOSE);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
|
if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
|
||||||
!skip_internal) {
|
!skip_internal) {
|
||||||
bi_print_stats(ctx, binary->size, stderr);
|
bi_print_stats(ctx, binary->size - offset, stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
bi_compile_variant(nir_shader *nir,
|
||||||
|
const struct panfrost_compile_inputs *inputs,
|
||||||
|
struct util_dynarray *binary,
|
||||||
|
struct hash_table_u64 *sysval_to_id,
|
||||||
|
struct pan_shader_info *info,
|
||||||
|
enum bi_idvs_mode idvs)
|
||||||
|
{
|
||||||
|
struct bi_shader_info local_info = {
|
||||||
|
.push = &info->push,
|
||||||
|
.bifrost = &info->bifrost,
|
||||||
|
.tls_size = info->tls_size,
|
||||||
|
.sysvals = &info->sysvals
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned offset = binary->size;
|
||||||
|
|
||||||
|
/* Software invariant: Only a secondary shader can appear at a nonzero
|
||||||
|
* offset, to keep the ABI simple. */
|
||||||
|
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
|
||||||
|
|
||||||
|
bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs);
|
||||||
|
|
||||||
|
/* A register is preloaded <==> it is live before the first block */
|
||||||
|
bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
|
||||||
|
uint64_t preload = first_block->reg_live_in;
|
||||||
|
|
||||||
|
info->ubo_mask |= ctx->ubo_mask;
|
||||||
|
info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
|
||||||
|
|
||||||
|
if (idvs == BI_IDVS_VARYING) {
|
||||||
|
info->vs.secondary_enable = (binary->size > offset);
|
||||||
|
info->vs.secondary_offset = offset;
|
||||||
|
info->vs.secondary_preload = preload;
|
||||||
|
info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
|
||||||
|
} else {
|
||||||
|
info->preload = preload;
|
||||||
|
info->work_reg_count = ctx->info.work_reg_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mesa_hash_table_u64_destroy(ctx->sysval_to_id);
|
|
||||||
ralloc_free(ctx);
|
ralloc_free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3941,3 +3977,29 @@ bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
|
||||||
/* Otherwise, IDVS is usually better */
|
/* Otherwise, IDVS is usually better */
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
bifrost_compile_shader_nir(nir_shader *nir,
|
||||||
|
const struct panfrost_compile_inputs *inputs,
|
||||||
|
struct util_dynarray *binary,
|
||||||
|
struct pan_shader_info *info)
|
||||||
|
{
|
||||||
|
bifrost_debug = debug_get_option_bifrost_debug();
|
||||||
|
|
||||||
|
bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend);
|
||||||
|
struct hash_table_u64 *sysval_to_id = panfrost_init_sysvals(&info->sysvals, NULL);
|
||||||
|
|
||||||
|
info->tls_size = nir->scratch_size;
|
||||||
|
info->vs.idvs = bi_should_idvs(nir, inputs);
|
||||||
|
|
||||||
|
if (info->vs.idvs) {
|
||||||
|
bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION);
|
||||||
|
bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING);
|
||||||
|
} else {
|
||||||
|
bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE);
|
||||||
|
}
|
||||||
|
|
||||||
|
info->ubo_mask &= BITSET_MASK(nir->info.num_ubos);
|
||||||
|
|
||||||
|
_mesa_hash_table_u64_destroy(sysval_to_id);
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -619,6 +619,15 @@ typedef struct bi_block {
|
||||||
uint8_t pass_flags;
|
uint8_t pass_flags;
|
||||||
} bi_block;
|
} bi_block;
|
||||||
|
|
||||||
|
/* Subset of pan_shader_info needed per-variant, in order to support IDVS */
|
||||||
|
struct bi_shader_info {
|
||||||
|
struct panfrost_ubo_push *push;
|
||||||
|
struct bifrost_shader_info *bifrost;
|
||||||
|
struct panfrost_sysvals *sysvals;
|
||||||
|
unsigned tls_size;
|
||||||
|
unsigned work_reg_count;
|
||||||
|
};
|
||||||
|
|
||||||
/* State of index-driven vertex shading for current shader */
|
/* State of index-driven vertex shading for current shader */
|
||||||
enum bi_idvs_mode {
|
enum bi_idvs_mode {
|
||||||
/* IDVS not in use */
|
/* IDVS not in use */
|
||||||
|
|
@ -634,7 +643,7 @@ enum bi_idvs_mode {
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const struct panfrost_compile_inputs *inputs;
|
const struct panfrost_compile_inputs *inputs;
|
||||||
nir_shader *nir;
|
nir_shader *nir;
|
||||||
struct pan_shader_info *info;
|
struct bi_shader_info info;
|
||||||
gl_shader_stage stage;
|
gl_shader_stage stage;
|
||||||
struct list_head blocks; /* list of bi_block */
|
struct list_head blocks; /* list of bi_block */
|
||||||
struct hash_table_u64 *sysval_to_id;
|
struct hash_table_u64 *sysval_to_id;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue