mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-01 07:30:09 +01:00
intel/brw/xe3+: Optimize CS/TASK/MESH compile time optimistically assuming SIMD32.
This is similar in principle to the previous commit "intel/brw/xe3+:
brw_compile_fs() implementation for Xe3+." but applied to compute-like
shader stages. It changes the implementation of brw_compile_cs/task/mesh()
to reduce compile time and take advantage of wider dispatch modes more
aggressively than the original logic, since as of Xe3 SIMD32 builds
succeed without spills in most cases thanks to VRT.
The new "optimistic" SIMD selection logic starts with the SIMD width
that is potentially highest performance and only compiles additional
narrower variants if that fails (typically due to spilling), while the
old "pessimistic" logic did the opposite: It started with the
narrowest SIMD width and compiled additional variants with increasing
register pressure until one of them failed to compile.
In typical non-spilling cases where we formerly compiled SIMD16 and
SIMD32 variants of the same compute shader, this change will halve the
number of backend compilations required to build it.
XXX - Possibly don't do this in cases with variable workgroup size
until effect on runtime performance can be measured directly.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
v2: Don't do this for now in cases with variable workgroup size, still
compile every possible variant in such cases.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
This commit is contained in:
parent
7e1362e9c0
commit
6513bf65c3
4 changed files with 51 additions and 24 deletions
|
|
@ -135,6 +135,7 @@ const unsigned *
|
|||
brw_compile_cs(const struct brw_compiler *compiler,
|
||||
struct brw_compile_cs_params *params)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
struct nir_shader *nir = params->base.nir;
|
||||
const struct brw_cs_prog_key *key = params->key;
|
||||
struct brw_cs_prog_data *prog_data = params->prog_data;
|
||||
|
|
@ -166,7 +167,9 @@ brw_compile_cs(const struct brw_compiler *compiler,
|
|||
|
||||
std::unique_ptr<fs_visitor> v[3];
|
||||
|
||||
for (unsigned simd = 0; simd < 3; simd++) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
|
||||
|
||||
if (!brw_simd_should_compile(simd_state, simd))
|
||||
continue;
|
||||
|
||||
|
|
@ -192,16 +195,25 @@ brw_compile_cs(const struct brw_compiler *compiler,
|
|||
params->base.stats != NULL,
|
||||
debug_enabled);
|
||||
|
||||
const int first = brw_simd_first_compiled(simd_state);
|
||||
if (first >= 0)
|
||||
v[simd]->import_uniforms(v[first].get());
|
||||
const bool allow_spilling = simd == 0 ||
|
||||
(!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) ||
|
||||
nir->info.workgroup_size_variable;
|
||||
|
||||
const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
|
||||
if (devinfo->ver < 30 || nir->info.workgroup_size_variable) {
|
||||
const int first = brw_simd_first_compiled(simd_state);
|
||||
if (first >= 0)
|
||||
v[simd]->import_uniforms(v[first].get());
|
||||
assert(allow_spilling == (first < 0 || nir->info.workgroup_size_variable));
|
||||
}
|
||||
|
||||
if (run_cs(*v[simd], allow_spilling)) {
|
||||
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
||||
|
||||
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
|
||||
|
||||
if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers &&
|
||||
!nir->info.workgroup_size_variable)
|
||||
break;
|
||||
} else {
|
||||
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
|
||||
if (simd > 0) {
|
||||
|
|
|
|||
|
|
@ -398,6 +398,7 @@ const unsigned *
|
|||
brw_compile_task(const struct brw_compiler *compiler,
|
||||
struct brw_compile_task_params *params)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
struct nir_shader *nir = params->base.nir;
|
||||
const struct brw_task_prog_key *key = params->key;
|
||||
struct brw_task_prog_data *prog_data = params->prog_data;
|
||||
|
|
@ -441,7 +442,9 @@ brw_compile_task(const struct brw_compiler *compiler,
|
|||
|
||||
std::unique_ptr<fs_visitor> v[3];
|
||||
|
||||
for (unsigned simd = 0; simd < 3; simd++) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
|
||||
|
||||
if (!brw_simd_should_compile(simd_state, simd))
|
||||
continue;
|
||||
|
||||
|
|
@ -467,11 +470,16 @@ brw_compile_task(const struct brw_compiler *compiler,
|
|||
v[simd]->import_uniforms(v[first].get());
|
||||
}
|
||||
|
||||
const bool allow_spilling = !brw_simd_any_compiled(simd_state);
|
||||
if (run_task_mesh(*v[simd], allow_spilling))
|
||||
const bool allow_spilling = simd == 0 ||
|
||||
(!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
|
||||
if (run_task_mesh(*v[simd], allow_spilling)) {
|
||||
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
|
||||
else
|
||||
|
||||
if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
|
||||
break;
|
||||
} else {
|
||||
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
|
||||
}
|
||||
}
|
||||
|
||||
int selected_simd = brw_simd_select(simd_state);
|
||||
|
|
@ -1688,6 +1696,7 @@ const unsigned *
|
|||
brw_compile_mesh(const struct brw_compiler *compiler,
|
||||
struct brw_compile_mesh_params *params)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
struct nir_shader *nir = params->base.nir;
|
||||
const struct brw_mesh_prog_key *key = params->key;
|
||||
struct brw_mesh_prog_data *prog_data = params->prog_data;
|
||||
|
|
@ -1742,7 +1751,9 @@ brw_compile_mesh(const struct brw_compiler *compiler,
|
|||
|
||||
std::unique_ptr<fs_visitor> v[3];
|
||||
|
||||
for (int simd = 0; simd < 3; simd++) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
|
||||
|
||||
if (!brw_simd_should_compile(simd_state, simd))
|
||||
continue;
|
||||
|
||||
|
|
@ -1780,11 +1791,16 @@ brw_compile_mesh(const struct brw_compiler *compiler,
|
|||
v[simd]->import_uniforms(v[first].get());
|
||||
}
|
||||
|
||||
const bool allow_spilling = !brw_simd_any_compiled(simd_state);
|
||||
if (run_task_mesh(*v[simd], allow_spilling))
|
||||
const bool allow_spilling = simd == 0 ||
|
||||
(!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
|
||||
if (run_task_mesh(*v[simd], allow_spilling)) {
|
||||
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
|
||||
else
|
||||
|
||||
if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
|
||||
break;
|
||||
} else {
|
||||
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
|
||||
}
|
||||
}
|
||||
|
||||
int selected_simd = brw_simd_select(simd_state);
|
||||
|
|
|
|||
|
|
@ -87,23 +87,23 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
|
|||
const auto prog_data = get_prog_data(state);
|
||||
const unsigned width = 8u << simd;
|
||||
|
||||
if (state.required_width && state.required_width != width) {
|
||||
state.error[simd] = "Different than required dispatch width";
|
||||
return false;
|
||||
}
|
||||
|
||||
/* For shaders with variable size workgroup, in most cases we can compile
|
||||
* all the variants (exceptions are bindless dispatch & ray queries), since
|
||||
* the choice will happen only at dispatch time.
|
||||
*/
|
||||
const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;
|
||||
|
||||
if (!workgroup_size_variable) {
|
||||
if (!workgroup_size_variable && !state.required_width) {
|
||||
if (state.spilled[simd]) {
|
||||
state.error[simd] = "Would spill";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (state.required_width && state.required_width != width) {
|
||||
state.error[simd] = "Different than required dispatch width";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cs_prog_data) {
|
||||
const unsigned workgroup_size = cs_prog_data->local_size[0] *
|
||||
cs_prog_data->local_size[1] *
|
||||
|
|
@ -112,8 +112,7 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
|
|||
unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
|
||||
|
||||
const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
|
||||
if (simd > min_simd && state.compiled[simd - 1] &&
|
||||
workgroup_size <= (width / 2)) {
|
||||
if (simd > min_simd && workgroup_size <= (width / 2)) {
|
||||
state.error[simd] = "Workgroup size already fits in smaller SIMD";
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -193,7 +193,7 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8)
|
|||
ASSERT_EQ(prog_data->prog_mask, 1u << SIMD16 | 1u << SIMD32);
|
||||
|
||||
const unsigned wg_8_1_1[] = { 8, 1, 1 };
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), -1);
|
||||
|
||||
const unsigned wg_16_1_1[] = { 16, 1, 1 };
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
|
||||
|
|
@ -240,10 +240,10 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16)
|
|||
ASSERT_EQ(prog_data->prog_mask, 1u << SIMD32);
|
||||
|
||||
const unsigned wg_8_1_1[] = { 8, 1, 1 };
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD32);
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), -1);
|
||||
|
||||
const unsigned wg_16_1_1[] = { 16, 1, 1 };
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD32);
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), -1);
|
||||
|
||||
const unsigned wg_32_1_1[] = { 32, 1, 1 };
|
||||
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD32);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue