intel/compiler: Use SIMD selection helpers for variable workgroup size

Variable workgroup size works by compiling as many SIMD variants as
possible and then selecting the right one during dispatch (when the
actual workgroup size is passed to us).

Instead of replicating the logic in a separate function, reuse the
same logic that is used for regular SIMD selection, and move the
function for that next to the remaining SIMD selection functions.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13249>
This commit is contained in:
Caio Marcelo de Oliveira Filho 2021-10-11 07:49:40 -07:00 committed by Marge Bot
parent 7dda0cf2b8
commit 4e7b71e00c
4 changed files with 138 additions and 36 deletions

View file

@ -10193,40 +10193,6 @@ brw_compile_cs(const struct brw_compiler *compiler,
return ret;
}
/**
 * Pick the dispatch width (8, 16 or 32) to use for a workgroup of
 * \p group_size invocations.
 *
 * \p cs_prog_data->prog_mask tells which variants exist (bit 0 = SIMD8,
 * bit 1 = SIMD16, bit 2 = SIMD32) and must not be empty;
 * \p cs_prog_data->prog_spilled uses the same bit layout.  The limit on the
 * group size for each width is derived from
 * \p devinfo->max_cs_workgroup_threads.
 */
static unsigned
brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
                                const struct brw_cs_prog_data *cs_prog_data,
                                unsigned group_size)
{
   const unsigned bit8 = 1u << 0;
   const unsigned bit16 = 1u << 1;
   const unsigned bit32 = 1u << 2;

   const unsigned available = cs_prog_data->prog_mask;
   assert(available != 0);

   const unsigned has8 = available & bit8;
   const unsigned has16 = available & bit16;
   const unsigned has32 = available & bit32;

   /* The DO32 debug option wins whenever a SIMD32 variant exists. */
   if (INTEL_DEBUG(DEBUG_DO32) && has32)
      return 32;

   const uint32_t thread_limit = devinfo->max_cs_workgroup_threads;

   if (has8 && group_size <= 8 * thread_limit) {
      /* Prefer SIMD16 if can do without spilling.  Matches logic in
       * brw_simd_selection.cpp.
       */
      return (has16 && !(cs_prog_data->prog_spilled & bit16)) ? 16 : 8;
   }

   if (has16 && group_size <= 16 * thread_limit)
      return 16;

   /* Nothing narrower fits, so SIMD32 must exist and must be wide enough. */
   assert(has32);
   assert(group_size <= 32 * thread_limit);
   return 32;
}
struct brw_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
@ -10238,9 +10204,13 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
override_local_size ? override_local_size :
prog_data->local_size;
const int simd =
override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) :
brw_simd_select(prog_data);
assert(simd >= 0 && simd < 3);
info.group_size = sizes[0] * sizes[1] * sizes[2];
info.simd_size =
brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
info.simd_size = 8u << simd;
info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
const uint32_t remainder = info.group_size & (info.simd_size - 1);

View file

@ -47,6 +47,10 @@ void brw_simd_mark_compiled(unsigned simd,
int brw_simd_select(const struct brw_cs_prog_data *prog_data);
/* Select a SIMD variant for the workgroup size given in sizes[0..2], using
 * the compilation results already recorded in prog_data (prog_mask and
 * prog_spilled).  prog_data itself is not modified.  The result is whatever
 * brw_simd_select() returns for that configuration, which may be -1.
 */
int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
const unsigned *sizes);
#ifdef __cplusplus
} /* extern "C" */
#endif

View file

@ -161,3 +161,42 @@ brw_simd_select(const struct brw_cs_prog_data *prog_data)
else
return -1;
}
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
const unsigned *sizes)
{
assert(sizes);
if (prog_data->local_size[0] == sizes[0] &&
prog_data->local_size[1] == sizes[1] &&
prog_data->local_size[2] == sizes[2])
return brw_simd_select(prog_data);
void *mem_ctx = ralloc_context(NULL);
struct brw_cs_prog_data cloned = *prog_data;
for (unsigned i = 0; i < 3; i++)
cloned.local_size[i] = sizes[i];
cloned.prog_mask = 0;
cloned.prog_spilled = 0;
const char *error[3] = {0};
for (unsigned simd = 0; simd < 3; simd++) {
/* We are not recompiling, so use original results of prog_mask and
* prog_spilled as they will already contain all possible compilations.
*/
if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
0 /* required_dispatch_width */, &error[simd]) &&
test_bit(prog_data->prog_mask, simd)) {
brw_simd_mark_compiled(simd, &cloned, test_bit(prog_data->prog_spilled, simd));
}
}
ralloc_free(mem_ctx);
return brw_simd_select(&cloned);
}

View file

@ -145,6 +145,15 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariable)
brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
const unsigned wg_8_1_1[] = { 8, 1, 1 };
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
const unsigned wg_16_1_1[] = { 16, 1, 1 };
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
const unsigned wg_32_1_1[] = { 32, 1, 1 };
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
}
TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
@ -161,6 +170,86 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
brw_simd_mark_compiled(SIMD32, prog_data, spilled);
ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
const unsigned wg_8_1_1[] = { 8, 1, 1 };
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
const unsigned wg_16_1_1[] = { 16, 1, 1 };
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
const unsigned wg_32_1_1[] = { 32, 1, 1 };
ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
}
/* Only SIMD16 and SIMD32 get marked as compiled (neither spilled); SIMD16
 * is expected to be selected for 8, 16 and 32 invocations.
 */
TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8)
{
   for (unsigned axis = 0; axis < 3; axis++)
      prog_data->local_size[axis] = 0;

   /* SIMD8 is allowed, but is never marked as compiled. */
   ASSERT_TRUE(should_compile(SIMD8));

   ASSERT_TRUE(should_compile(SIMD16));
   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);

   ASSERT_TRUE(should_compile(SIMD32));
   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);

   const unsigned expected_mask = 1u << SIMD16 | 1u << SIMD32;
   ASSERT_EQ(prog_data->prog_mask, expected_mask);

   const unsigned x8[] = { 8, 1, 1 };
   const unsigned x16[] = { 16, 1, 1 };
   const unsigned x32[] = { 32, 1, 1 };

   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x8), SIMD16);
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x16), SIMD16);
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x32), SIMD16);
}
/* Only SIMD8 and SIMD32 get marked as compiled (neither spilled); SIMD8 is
 * expected to be selected for 8, 16 and 32 invocations.
 */
TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD16)
{
   for (unsigned axis = 0; axis < 3; axis++)
      prog_data->local_size[axis] = 0;

   ASSERT_TRUE(should_compile(SIMD8));
   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);

   /* SIMD16 is allowed, but is never marked as compiled. */
   ASSERT_TRUE(should_compile(SIMD16));

   ASSERT_TRUE(should_compile(SIMD32));
   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);

   const unsigned expected_mask = 1u << SIMD8 | 1u << SIMD32;
   ASSERT_EQ(prog_data->prog_mask, expected_mask);

   const unsigned x8[] = { 8, 1, 1 };
   const unsigned x16[] = { 16, 1, 1 };
   const unsigned x32[] = { 32, 1, 1 };

   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x8), SIMD8);
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x16), SIMD8);
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x32), SIMD8);
}
/* Only SIMD32 gets marked as compiled (not spilled); it is expected to be
 * selected for 8, 16 and 32 invocations.
 */
TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16)
{
   for (unsigned axis = 0; axis < 3; axis++)
      prog_data->local_size[axis] = 0;

   /* SIMD8 and SIMD16 are allowed, but are never marked as compiled. */
   ASSERT_TRUE(should_compile(SIMD8));
   ASSERT_TRUE(should_compile(SIMD16));

   ASSERT_TRUE(should_compile(SIMD32));
   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);

   const unsigned expected_mask = 1u << SIMD32;
   ASSERT_EQ(prog_data->prog_mask, expected_mask);

   const unsigned x8[] = { 8, 1, 1 };
   const unsigned x16[] = { 16, 1, 1 };
   const unsigned x32[] = { 32, 1, 1 };

   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x8), SIMD32);
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x16), SIMD32);
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, x32), SIMD32);
}
TEST_F(SIMDSelectionCS, SpillAtSIMD8)