mesa/src/intel/compiler/brw/brw_compile_cs.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

288 lines
8.9 KiB
C++
Raw Normal View History

/*
* Copyright © 2010 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_shader.h"
#include "brw_analysis.h"
#include "brw_builder.h"
#include "brw_generator.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include <memory>
static void
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
{
block->dwords = dwords;
block->regs = DIV_ROUND_UP(dwords, 8);
block->size = block->regs * 32;
}
extern "C" void
brw_cs_fill_push_const_info(const struct intel_device_info *devinfo,
struct brw_cs_prog_data *cs_prog_data,
int subgroup_id_index)
{
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
unsigned cross_thread_dwords, per_thread_dwords;
if (devinfo->verx10 < 125 && subgroup_id_index >= 0) {
/* Fill all but the last register with cross-thread payload */
cross_thread_dwords = 8 * (subgroup_id_index / 8);
per_thread_dwords = prog_data->push_sizes[0] / 4 - cross_thread_dwords;
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
} else {
/* Fill all data using cross-thread payload */
cross_thread_dwords = prog_data->push_sizes[0] / 4;
per_thread_dwords = 0u;
}
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
cs_prog_data->push.per_thread.size == 0);
assert(cs_prog_data->push.cross_thread.dwords +
cs_prog_data->push.per_thread.dwords ==
prog_data->push_sizes[0] / 4);
}
static bool
run_cs(brw_shader &s, bool allow_spilling)
{
assert(mesa_shader_stage_is_compute(s.stage));
s.payload_ = new brw_cs_thread_payload(s);
brw_from_nir(&s);
if (s.failed)
return false;
s.emit_cs_terminate();
brw_calculate_cfg(s);
brw_optimize(s);
s.assign_curb_setup();
brw_lower_3src_null_dest(s);
brw_workaround_emit_dummy_mov_instruction(s);
brw_allocate_registers(s, allow_spilling);
brw_workaround_source_arf_before_eot(s);
return !s.failed;
}
static bool
instr_uses_sampler(nir_builder *b, nir_instr *instr, void *cb_data)
{
if (instr->type != nir_instr_type_tex)
return false;
switch (nir_instr_as_tex(instr)->op) {
case nir_texop_tex:
case nir_texop_txd:
case nir_texop_txf:
case nir_texop_txl:
case nir_texop_txb:
case nir_texop_txf_ms:
case nir_texop_txf_ms_mcs_intel:
case nir_texop_lod:
case nir_texop_tg4:
case nir_texop_texture_samples:
return true;
default:
return false;
}
}
static bool
brw_nir_uses_sampler(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, instr_uses_sampler,
nir_metadata_all,
NULL);
}
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
struct brw_compile_cs_params *params)
{
const struct intel_device_info *devinfo = compiler->devinfo;
struct nir_shader *nir = params->base.nir;
const struct brw_cs_prog_key *key = params->key;
struct brw_cs_prog_data *prog_data = params->prog_data;
const bool debug_enabled =
brw_should_print_shader(nir, params->base.debug_flag ?
params->base.debug_flag : DEBUG_CS,
params->base.source_hash);
brw_prog_data_init(&prog_data->base, &params->base);
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
if (!nir->info.workgroup_size_variable) {
prog_data->local_size[0] = nir->info.workgroup_size[0];
prog_data->local_size[1] = nir->info.workgroup_size[1];
prog_data->local_size[2] = nir->info.workgroup_size[2];
}
brw_pass_tracker pt_ = {
.nir = nir,
.dispatch_width = 0,
.compiler = compiler,
.key = &key->base,
.archiver = params->base.archiver,
}, *pt = &pt_;
BRW_NIR_SNAPSHOT("first");
brw_postprocess_nir_opts(pt);
brw: Do most of NIR postprocessing before cloning for SIMD variants We were doing a lot of NIR work repeatedly for each SIMD variant of compute and mesh shaders. Instead, do it once before cloning, and just do one final optimization loop and out-of-SSA for each. fossil-db results on Arc B580: Totals: Instrs: 233771096 -> 233794024 (+0.01%); split: -0.01%, +0.02% Subgroup size: 15922768 -> 15922736 (-0.00%); split: +0.00%, -0.00% Send messages: 12095619 -> 12098234 (+0.02%); split: -0.00%, +0.02% Loop count: 137562 -> 137523 (-0.03%) Cycle count: 32600323744 -> 32667411252 (+0.21%); split: -0.06%, +0.27% Spill count: 540908 -> 542027 (+0.21%); split: -0.07%, +0.28% Fill count: 700938 -> 698983 (-0.28%); split: -0.73%, +0.45% Scratch Memory Size: 37266432 -> 37304320 (+0.10%); split: -0.10%, +0.20% Max live registers: 72691728 -> 72692987 (+0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 67690309 -> 67688352 (-0.00%); split: -0.01%, +0.00% Totals from 3576 (0.45% of 789301) affected shaders: Instrs: 6932956 -> 6955884 (+0.33%); split: -0.41%, +0.74% Subgroup size: 88816 -> 88784 (-0.04%); split: +0.09%, -0.13% Send messages: 329168 -> 331783 (+0.79%); split: -0.02%, +0.81% Loop count: 8753 -> 8714 (-0.45%) Cycle count: 15153678820 -> 15220766328 (+0.44%); split: -0.14%, +0.58% Spill count: 213751 -> 214870 (+0.52%); split: -0.18%, +0.71% Fill count: 282616 -> 280661 (-0.69%); split: -1.82%, +1.13% Scratch Memory Size: 13056000 -> 13093888 (+0.29%); split: -0.27%, +0.56% Max live registers: 834757 -> 836016 (+0.15%); split: -0.11%, +0.26% Non SSA regs after NIR: 995033 -> 993076 (-0.20%); split: -0.48%, +0.28% Looking at a few of the shaders with substantial instruction count increases, it appears that it is largely due to more loops being unrolled, which is probably actually a good thing. The compile time impact of this patch appears to be negligable. However, doing postprocessing before SIMD cloning allows us to examine the postprocessed SSA-form NIR for improvements in an upcoming patch. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36750>
2025-08-11 14:54:09 -07:00
brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo,
.prog_data = prog_data,
.required_width = brw_required_dispatch_width(&nir->info),
};
brw: Skip compilation of larger SIMDs when pressure is too high This allows us to skip the entire backend compilation process for large SIMD widths when register pressure is high enough that we'd likely decide to prefer a smaller one in the end anyway. The hope is to make the same decisions as before, but with less CPU overhead. We are making mostly the same decisions as before: | API / Platform | Total Shaders | Changed | % Identical -------------------------------------------------- | VK / Arc A770 | 905,525 | 1,157 | 99.872% | | VK / Arc B580 | 788,127 | 53 | 99.993% | | VK / Panther | 786,333 | 13 | 99.998% | | GL / Arc A770 | 308,618 | 269 | 99.913% | | GL / Arc B580 | 264,066 | 13 | 99.995% | | GL / Panther | 273,212 | 0 | 100.000% | Improves compile times on my i7-12700K: | Game | Arc B580 | Arc A770 | --------------------------------------------------- | Assassins Creed: Odyssey | -13.47% | -10.98% | | Borderlands 3 (DX12) | -10.05% | -11.31% | | Dark Souls 3 | -21.06% | -21.08% | | Oblivion Remastered | -11.10% | -9.82% | | Phasmophobia | -32.73% | -31.00% | | Red Dead Redemption 2 | -20.10% | -14.38% | | Total War: Warhammer III | -10.11% | -14.44% | | Wolfenstein Youngblood | -15.91% | -13.47% | | Shadow of the Tomb Raider | -30.23% | -25.86% | It seems to have nearly no effect on compile times on Xe3 unfortunately, as only 1,014 shaders in fossil-db even fail SIMD32 compilation in the first place, and we want to let most of the "might succeed" cases through to the backend for throughput analysis. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36750>
2025-08-11 02:39:49 -07:00
unsigned pressure[SIMD_COUNT];
brw_nir_quick_pressure_estimate(nir, devinfo, pressure);
for (unsigned i = 0; i < SIMD_COUNT; i++) {
simd_state.beyond_threshold[i] =
pressure[i] > compiler->register_pressure_threshold;
}
prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir);
std::unique_ptr<brw_shader> v[3];
for (unsigned i = 0; i < 3; i++) {
const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
if (!brw_simd_should_compile(simd_state, simd))
continue;
const unsigned dispatch_width = 8u << simd;
nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
pt_ = {
.nir = shader,
.dispatch_width = dispatch_width,
.compiler = compiler,
.archiver = params->base.archiver,
};
BRW_NIR_SNAPSHOT("first");
brw_nir_apply_key(pt, &key->base, dispatch_width);
BRW_NIR_PASS(brw_nir_lower_simd, dispatch_width);
brw_nir_optimize(pt);
/* brw_nir_optimize undoes late lowerings. */
BRW_NIR_PASS(nir_opt_algebraic_late);
brw_postprocess_nir_out_of_ssa(pt, debug_enabled);
const brw_shader_params shader_params = {
.compiler = compiler,
.mem_ctx = params->base.mem_ctx,
.nir = shader,
.key = &key->base,
.prog_data = &prog_data->base,
.dispatch_width = dispatch_width,
.needs_register_pressure = params->base.stats != NULL,
.log_data = params->base.log_data,
.debug_enabled = debug_enabled,
.archiver = params->base.archiver,
};
v[simd] = std::make_unique<brw_shader>(&shader_params);
const bool allow_spilling = simd == 0 ||
(!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) ||
nir->info.workgroup_size_variable;
if (devinfo->ver < 30 || nir->info.workgroup_size_variable) {
ASSERTED const int first = brw_simd_first_compiled(simd_state);
assert(allow_spilling == (first < 0 || nir->info.workgroup_size_variable));
}
if (run_cs(*v[simd], allow_spilling)) {
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers &&
!nir->info.workgroup_size_variable)
break;
} else {
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
if (simd > 0) {
brw_shader_perf_log(compiler, params->base.log_data,
"SIMD%u shader failed to compile: %s\n",
dispatch_width, v[simd]->fail_msg);
}
}
}
const int selected_simd = brw_simd_select(simd_state);
if (selected_simd < 0) {
params->base.error_str =
ralloc_asprintf(params->base.mem_ctx,
"Can't compile shader: "
"SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
simd_state.error[0], simd_state.error[1],
simd_state.error[2]);
return NULL;
}
assert(selected_simd < 3);
if (!nir->info.workgroup_size_variable)
prog_data->prog_mask = 1 << selected_simd;
brw_generator g(compiler, &params->base, &prog_data->base,
MESA_SHADER_COMPUTE);
if (unlikely(debug_enabled)) {
char *name = ralloc_asprintf(params->base.mem_ctx,
"%s compute shader %s",
nir->info.label ?
nir->info.label : "unnamed",
nir->info.name);
g.enable_debug(name);
}
uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
struct genisa_stats *stats = params->base.stats;
for (unsigned simd = 0; simd < 3; simd++) {
if (prog_data->prog_mask & (1u << simd)) {
assert(v[simd]);
prog_data->prog_offset[simd] = g.generate_code(*v[simd], stats);
if (stats)
stats->max_dispatch_width = max_dispatch_width;
stats = stats ? stats + 1 : NULL;
prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
v[simd]->grf_used);
max_dispatch_width = 8u << simd;
}
}
g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
}