mesa/src/intel/compiler/brw_compile_cs.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

306 lines
9.8 KiB
C++
Raw Normal View History

/*
* Copyright © 2010 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_shader.h"
#include "brw_analysis.h"
#include "brw_builder.h"
#include "brw_generator.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include <memory>
static void
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
{
block->dwords = dwords;
block->regs = DIV_ROUND_UP(dwords, 8);
block->size = block->regs * 32;
}
static void
cs_fill_push_const_info(const struct intel_device_info *devinfo,
struct brw_cs_prog_data *cs_prog_data)
{
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data);
/* The thread ID should be stored in the last param dword */
assert(subgroup_id_index == -1 ||
subgroup_id_index == (int)prog_data->nr_params - 1);
unsigned cross_thread_dwords, per_thread_dwords;
if (subgroup_id_index >= 0) {
/* Fill all but the last register with cross-thread payload */
cross_thread_dwords = 8 * (subgroup_id_index / 8);
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
} else {
/* Fill all data using cross-thread payload */
cross_thread_dwords = prog_data->nr_params;
per_thread_dwords = 0u;
}
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
cs_prog_data->push.per_thread.size == 0);
assert(cs_prog_data->push.cross_thread.dwords +
cs_prog_data->push.per_thread.dwords ==
prog_data->nr_params);
}
static bool
run_cs(brw_shader &s, bool allow_spilling)
{
assert(mesa_shader_stage_is_compute(s.stage));
s.payload_ = new brw_cs_thread_payload(s);
brw_from_nir(&s);
if (s.failed)
return false;
s.emit_cs_terminate();
brw_calculate_cfg(s);
brw_optimize(s);
s.assign_curb_setup();
brw_lower_3src_null_dest(s);
brw_workaround_emit_dummy_mov_instruction(s);
brw_allocate_registers(s, allow_spilling);
brw_workaround_source_arf_before_eot(s);
return !s.failed;
}
static bool
instr_uses_sampler(nir_builder *b, nir_instr *instr, void *cb_data)
{
if (instr->type != nir_instr_type_tex)
return false;
switch (nir_instr_as_tex(instr)->op) {
case nir_texop_tex:
case nir_texop_txd:
case nir_texop_txf:
case nir_texop_txl:
case nir_texop_txb:
case nir_texop_txf_ms:
case nir_texop_txf_ms_mcs_intel:
case nir_texop_lod:
case nir_texop_tg4:
case nir_texop_texture_samples:
return true;
default:
return false;
}
}
static bool
brw_nir_uses_sampler(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, instr_uses_sampler,
nir_metadata_all,
NULL);
}
static inline uint32_t *
brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data,
unsigned nr_new_params)
{
unsigned old_nr_params = prog_data->nr_params;
prog_data->nr_params += nr_new_params;
prog_data->param = reralloc(ralloc_parent(prog_data->param),
prog_data->param, uint32_t,
prog_data->nr_params);
return prog_data->param + old_nr_params;
}
static void
brw_adjust_uniforms(brw_shader &s)
{
if (s.devinfo->verx10 >= 125)
return;
assert(mesa_shader_stage_is_compute(s.stage));
if (brw_get_subgroup_id_param_index(s.devinfo, s.prog_data) == -1) {
/* Add uniforms for builtins after regular NIR uniforms. */
assert(s.uniforms == s.prog_data->nr_params);
/* Subgroup ID must be the last uniform on the list. This will make
* easier later to split between cross thread and per thread
* uniforms.
*/
uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1);
*param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
}
s.uniforms = s.prog_data->nr_params;
}
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
struct brw_compile_cs_params *params)
{
const struct intel_device_info *devinfo = compiler->devinfo;
struct nir_shader *nir = params->base.nir;
const struct brw_cs_prog_key *key = params->key;
struct brw_cs_prog_data *prog_data = params->prog_data;
const bool debug_enabled =
brw_should_print_shader(nir, params->base.debug_flag ?
params->base.debug_flag : DEBUG_CS,
params->base.source_hash);
brw_prog_data_init(&prog_data->base, &params->base);
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
if (!nir->info.workgroup_size_variable) {
prog_data->local_size[0] = nir->info.workgroup_size[0];
prog_data->local_size[1] = nir->info.workgroup_size[1];
prog_data->local_size[2] = nir->info.workgroup_size[2];
}
brw: Do most of NIR postprocessing before cloning for SIMD variants We were doing a lot of NIR work repeatedly for each SIMD variant of compute and mesh shaders. Instead, do it once before cloning, and just do one final optimization loop and out-of-SSA for each. fossil-db results on Arc B580: Totals: Instrs: 233771096 -> 233794024 (+0.01%); split: -0.01%, +0.02% Subgroup size: 15922768 -> 15922736 (-0.00%); split: +0.00%, -0.00% Send messages: 12095619 -> 12098234 (+0.02%); split: -0.00%, +0.02% Loop count: 137562 -> 137523 (-0.03%) Cycle count: 32600323744 -> 32667411252 (+0.21%); split: -0.06%, +0.27% Spill count: 540908 -> 542027 (+0.21%); split: -0.07%, +0.28% Fill count: 700938 -> 698983 (-0.28%); split: -0.73%, +0.45% Scratch Memory Size: 37266432 -> 37304320 (+0.10%); split: -0.10%, +0.20% Max live registers: 72691728 -> 72692987 (+0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 67690309 -> 67688352 (-0.00%); split: -0.01%, +0.00% Totals from 3576 (0.45% of 789301) affected shaders: Instrs: 6932956 -> 6955884 (+0.33%); split: -0.41%, +0.74% Subgroup size: 88816 -> 88784 (-0.04%); split: +0.09%, -0.13% Send messages: 329168 -> 331783 (+0.79%); split: -0.02%, +0.81% Loop count: 8753 -> 8714 (-0.45%) Cycle count: 15153678820 -> 15220766328 (+0.44%); split: -0.14%, +0.58% Spill count: 213751 -> 214870 (+0.52%); split: -0.18%, +0.71% Fill count: 282616 -> 280661 (-0.69%); split: -1.82%, +1.13% Scratch Memory Size: 13056000 -> 13093888 (+0.29%); split: -0.27%, +0.56% Max live registers: 834757 -> 836016 (+0.15%); split: -0.11%, +0.26% Non SSA regs after NIR: 995033 -> 993076 (-0.20%); split: -0.48%, +0.28% Looking at a few of the shaders with substantial instruction count increases, it appears that it is largely due to more loops being unrolled, which is probably actually a good thing. The compile time impact of this patch appears to be negligable. However, doing postprocessing before SIMD cloning allows us to examine the postprocessed SSA-form NIR for improvements in an upcoming patch. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36750>
2025-08-11 14:54:09 -07:00
brw_postprocess_nir_opts(nir, compiler, key->base.robust_flags);
brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo,
.prog_data = prog_data,
.required_width = brw_required_dispatch_width(&nir->info),
};
prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir);
std::unique_ptr<brw_shader> v[3];
for (unsigned i = 0; i < 3; i++) {
const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
if (!brw_simd_should_compile(simd_state, simd))
continue;
const unsigned dispatch_width = 8u << simd;
nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
brw_debug_archive_nir(params->base.archiver, shader, dispatch_width, "first");
brw_nir_apply_key(shader, compiler, &key->base,
dispatch_width);
NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
brw: Do most of NIR postprocessing before cloning for SIMD variants We were doing a lot of NIR work repeatedly for each SIMD variant of compute and mesh shaders. Instead, do it once before cloning, and just do one final optimization loop and out-of-SSA for each. fossil-db results on Arc B580: Totals: Instrs: 233771096 -> 233794024 (+0.01%); split: -0.01%, +0.02% Subgroup size: 15922768 -> 15922736 (-0.00%); split: +0.00%, -0.00% Send messages: 12095619 -> 12098234 (+0.02%); split: -0.00%, +0.02% Loop count: 137562 -> 137523 (-0.03%) Cycle count: 32600323744 -> 32667411252 (+0.21%); split: -0.06%, +0.27% Spill count: 540908 -> 542027 (+0.21%); split: -0.07%, +0.28% Fill count: 700938 -> 698983 (-0.28%); split: -0.73%, +0.45% Scratch Memory Size: 37266432 -> 37304320 (+0.10%); split: -0.10%, +0.20% Max live registers: 72691728 -> 72692987 (+0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 67690309 -> 67688352 (-0.00%); split: -0.01%, +0.00% Totals from 3576 (0.45% of 789301) affected shaders: Instrs: 6932956 -> 6955884 (+0.33%); split: -0.41%, +0.74% Subgroup size: 88816 -> 88784 (-0.04%); split: +0.09%, -0.13% Send messages: 329168 -> 331783 (+0.79%); split: -0.02%, +0.81% Loop count: 8753 -> 8714 (-0.45%) Cycle count: 15153678820 -> 15220766328 (+0.44%); split: -0.14%, +0.58% Spill count: 213751 -> 214870 (+0.52%); split: -0.18%, +0.71% Fill count: 282616 -> 280661 (-0.69%); split: -1.82%, +1.13% Scratch Memory Size: 13056000 -> 13093888 (+0.29%); split: -0.27%, +0.56% Max live registers: 834757 -> 836016 (+0.15%); split: -0.11%, +0.26% Non SSA regs after NIR: 995033 -> 993076 (-0.20%); split: -0.48%, +0.28% Looking at a few of the shaders with substantial instruction count increases, it appears that it is largely due to more loops being unrolled, which is probably actually a good thing. The compile time impact of this patch appears to be negligable. However, doing postprocessing before SIMD cloning allows us to examine the postprocessed SSA-form NIR for improvements in an upcoming patch. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36750>
2025-08-11 14:54:09 -07:00
brw_nir_optimize(shader, devinfo);
brw_postprocess_nir_out_of_ssa(shader, dispatch_width,
params->base.archiver, debug_enabled);
const brw_shader_params shader_params = {
.compiler = compiler,
.mem_ctx = params->base.mem_ctx,
.nir = shader,
.key = &key->base,
.prog_data = &prog_data->base,
.dispatch_width = dispatch_width,
.needs_register_pressure = params->base.stats != NULL,
.log_data = params->base.log_data,
.debug_enabled = debug_enabled,
.archiver = params->base.archiver,
};
v[simd] = std::make_unique<brw_shader>(&shader_params);
brw_adjust_uniforms(*v[simd]);
const bool allow_spilling = simd == 0 ||
(!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) ||
nir->info.workgroup_size_variable;
if (devinfo->ver < 30 || nir->info.workgroup_size_variable) {
ASSERTED const int first = brw_simd_first_compiled(simd_state);
assert(allow_spilling == (first < 0 || nir->info.workgroup_size_variable));
}
if (run_cs(*v[simd], allow_spilling)) {
cs_fill_push_const_info(compiler->devinfo, prog_data);
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers &&
!nir->info.workgroup_size_variable)
break;
} else {
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
if (simd > 0) {
brw_shader_perf_log(compiler, params->base.log_data,
"SIMD%u shader failed to compile: %s\n",
dispatch_width, v[simd]->fail_msg);
}
}
}
const int selected_simd = brw_simd_select(simd_state);
if (selected_simd < 0) {
params->base.error_str =
ralloc_asprintf(params->base.mem_ctx,
"Can't compile shader: "
"SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
simd_state.error[0], simd_state.error[1],
simd_state.error[2]);
return NULL;
}
assert(selected_simd < 3);
if (!nir->info.workgroup_size_variable)
prog_data->prog_mask = 1 << selected_simd;
brw_generator g(compiler, &params->base, &prog_data->base,
MESA_SHADER_COMPUTE);
if (unlikely(debug_enabled)) {
char *name = ralloc_asprintf(params->base.mem_ctx,
"%s compute shader %s",
nir->info.label ?
nir->info.label : "unnamed",
nir->info.name);
g.enable_debug(name);
}
uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
struct brw_compile_stats *stats = params->base.stats;
for (unsigned simd = 0; simd < 3; simd++) {
if (prog_data->prog_mask & (1u << simd)) {
assert(v[simd]);
prog_data->prog_offset[simd] = g.generate_code(*v[simd], stats);
if (stats)
stats->max_dispatch_width = max_dispatch_width;
stats = stats ? stats + 1 : NULL;
prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
v[simd]->grf_used);
max_dispatch_width = 8u << simd;
}
}
g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
}