diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c66b3cd8ff6..ccd2f039340 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -10193,40 +10193,6 @@ brw_compile_cs(const struct brw_compiler *compiler,
    return ret;
 }
 
-static unsigned
-brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
-                                const struct brw_cs_prog_data *cs_prog_data,
-                                unsigned group_size)
-{
-   const unsigned mask = cs_prog_data->prog_mask;
-   assert(mask != 0);
-
-   static const unsigned simd8 = 1 << 0;
-   static const unsigned simd16 = 1 << 1;
-   static const unsigned simd32 = 1 << 2;
-
-   if (INTEL_DEBUG(DEBUG_DO32) && (mask & simd32))
-      return 32;
-
-   const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
-
-   if ((mask & simd8) && group_size <= 8 * max_threads) {
-      /* Prefer SIMD16 if can do without spilling. Matches logic in
-       * brw_simd_selection.cpp.
-       */
-      if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
-         return 16;
-      return 8;
-   }
-
-   if ((mask & simd16) && group_size <= 16 * max_threads)
-      return 16;
-
-   assert(mask & simd32);
-   assert(group_size <= 32 * max_threads);
-   return 32;
-}
-
 struct brw_cs_dispatch_info
 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                          const struct brw_cs_prog_data *prog_data,
@@ -10238,9 +10204,13 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
       override_local_size ? override_local_size :
                             prog_data->local_size;
 
+   const int simd =
+      override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) :
+                            brw_simd_select(prog_data);
+   assert(simd >= 0 && simd < 3);
+
    info.group_size = sizes[0] * sizes[1] * sizes[2];
-   info.simd_size =
-      brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
+   info.simd_size = 8u << simd;
    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
 
    const uint32_t remainder = info.group_size & (info.simd_size - 1);
diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h
index d166a29e0d5..c4334ce3ff7 100644
--- a/src/intel/compiler/brw_private.h
+++ b/src/intel/compiler/brw_private.h
@@ -47,6 +47,10 @@ void brw_simd_mark_compiled(unsigned simd,
 
 int brw_simd_select(const struct brw_cs_prog_data *prog_data);
 
+int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
+                                       const struct brw_cs_prog_data *prog_data,
+                                       const unsigned *sizes);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/intel/compiler/brw_simd_selection.c b/src/intel/compiler/brw_simd_selection.c
index 551e882e1a8..2ecf568ad57 100644
--- a/src/intel/compiler/brw_simd_selection.c
+++ b/src/intel/compiler/brw_simd_selection.c
@@ -161,3 +161,42 @@ brw_simd_select(const struct brw_cs_prog_data *prog_data)
    else
       return -1;
 }
+
+int
+brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
+                                   const struct brw_cs_prog_data *prog_data,
+                                   const unsigned *sizes)
+{
+   assert(sizes);
+
+   if (prog_data->local_size[0] == sizes[0] &&
+       prog_data->local_size[1] == sizes[1] &&
+       prog_data->local_size[2] == sizes[2])
+      return brw_simd_select(prog_data);
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   struct brw_cs_prog_data cloned = *prog_data;
+   for (unsigned i = 0; i < 3; i++)
+      cloned.local_size[i] = sizes[i];
+
+   cloned.prog_mask = 0;
+   cloned.prog_spilled = 0;
+
+   const char *error[3] = {0};
+
+   for (unsigned simd = 0; simd < 3; simd++) {
+      /* We are not recompiling, so use original results of prog_mask and
+       * prog_spilled as they will already contain all possible compilations.
+       */
+      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
+                                  0 /* required_dispatch_width */, &error[simd]) &&
+          test_bit(prog_data->prog_mask, simd)) {
+         brw_simd_mark_compiled(simd, &cloned, test_bit(prog_data->prog_spilled, simd));
+      }
+   }
+
+   ralloc_free(mem_ctx);
+
+   return brw_simd_select(&cloned);
+}
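
Note (illustration, not part of the patch): a minimal sketch of how a caller
might use the new helper when the workgroup size is only known at dispatch
time. The `sizes`, `devinfo`, and `prog_data` variables below are hypothetical
placeholders; only brw_simd_select_for_workgroup_size() itself comes from this
patch, and the `8u << simd` mapping mirrors the brw_fs.cpp hunk above.

   /* Illustrative sketch: pick a SIMD width for a 64x1x1 dispatch whose
    * size was not known when the shader was compiled.
    */
   const unsigned sizes[3] = { 64, 1, 1 };
   const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
   assert(simd >= 0);                      /* at least one SIMD variant was compiled */
   const unsigned simd_size = 8u << simd;  /* 0 -> SIMD8, 1 -> SIMD16, 2 -> SIMD32 */
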
diff --git a/src/intel/compiler/test_simd_selection.cpp b/src/intel/compiler/test_simd_selection.cpp
index f1be0bf185f..7344c57aca4 100644
--- a/src/intel/compiler/test_simd_selection.cpp
+++ b/src/intel/compiler/test_simd_selection.cpp
@@ -145,6 +145,15 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariable)
    brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
 
    ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
 }
 
 TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
@@ -161,6 +170,86 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
    brw_simd_mark_compiled(SIMD32, prog_data, spilled);
 
    ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD16)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD32);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD32);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD32);
 }
 
 TEST_F(SIMDSelectionCS, SpillAtSIMD8)
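
Note (illustration, not part of the patch): as a quick sanity check on the
arithmetic in the brw_cs_get_dispatch_info() hunk above, a hypothetical
40x1x1 workgroup dispatched at SIMD16 works out as follows; the numbers are
assumptions chosen only to exercise the formulas shown in that hunk.

   /* Hypothetical 40x1x1 workgroup with SIMD16 selected (simd == 1). */
   const unsigned group_size = 40 * 1 * 1;                               /* 40 invocations  */
   const unsigned simd_size  = 8u << 1;                                  /* 16 per thread   */
   const unsigned threads    = (group_size + simd_size - 1) / simd_size; /* DIV_ROUND_UP: 3 */
   const unsigned remainder  = group_size & (simd_size - 1);             /* 40 & 15 = 8     */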