Variable workgroup size works by compiling as many SIMD variants as possible and then selecting the right one at dispatch time (when the actual workgroup size is passed to us). Instead of replicating that logic in a separate function, reuse the same logic as the regular SIMD selection, and move the function for it next to the remaining SIMD selection functions.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13249>
202 lines · 6.4 KiB · C
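To make the selection flow concrete, here is a minimal sketch of a driver-side caller under the scheme the commit message describes: attempt every SIMD variant that passes the checks, record the results, then pick one. It assumes the declarations from brw_private.h; compile_cs_variant() and its signature are hypothetical stand-ins for the real per-variant compile entry point, not part of this file.

/* Hypothetical per-variant compile; stands in for the real compiler entry
 * point. Returns true on success and reports whether the variant spilled. */
extern bool compile_cs_variant(struct brw_cs_prog_data *prog_data,
                               unsigned dispatch_width, bool *spilled);

static int
compile_all_and_select(void *mem_ctx,
                       const struct intel_device_info *devinfo,
                       struct brw_cs_prog_data *prog_data,
                       unsigned required_width)
{
   for (unsigned simd = 0; simd < 3; simd++) {
      const char *error = NULL;

      if (!brw_simd_should_compile(mem_ctx, simd, devinfo, prog_data,
                                   required_width, &error))
         continue; /* `error` explains why this width was skipped. */

      bool spilled = false;
      if (compile_cs_variant(prog_data, 8u << simd, &spilled))
         brw_simd_mark_compiled(simd, prog_data, spilled);
   }

   /* With a fixed workgroup size the choice can be made now; with a
    * variable size, brw_simd_select_for_workgroup_size() makes it at
    * dispatch, once the actual sizes are known. */
   return brw_simd_select(prog_data);
}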
/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"
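
/* Returns the SIMD width this shader is required to be compiled with, based
 * on an API-required subgroup size or an explicit compute subgroup size, or
 * 0 when any width may be chosen.
 */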
unsigned
brw_required_dispatch_width(const struct shader_info *info,
                            enum brw_subgroup_size_type subgroup_size_type)
{
   unsigned required = 0;

   if ((int)subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      required = (unsigned)subgroup_size_type;
   }

   if (gl_shader_stage_is_compute(info->stage) && info->cs.subgroup_size > 0) {
      assert(required == 0 || required == info->cs.subgroup_size);
      required = info->cs.subgroup_size;
   }

   return required;
}

static inline bool
test_bit(unsigned mask, unsigned bit) {
   return mask & (1u << bit);
}
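
/* Decides whether the SIMD variant with index `simd` (0, 1, 2 for SIMD8,
 * SIMD16, SIMD32) is worth compiling, and on failure stores a human-readable
 * reason in *error (allocated out of mem_ctx).
 */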
bool
brw_simd_should_compile(void *mem_ctx,
                        unsigned simd,
                        const struct intel_device_info *devinfo,
                        struct brw_cs_prog_data *prog_data,
                        unsigned required,
                        const char **error)
{
   assert(!test_bit(prog_data->prog_mask, simd));
   assert(error);

   const unsigned width = 8u << simd;

   /* For shaders with a variable workgroup size, we will always compile all
    * the variants, since the choice happens only at dispatch time.
    */
   const bool workgroup_size_variable = prog_data->local_size[0] == 0;

   if (!workgroup_size_variable) {
      if (test_bit(prog_data->prog_spilled, simd)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because would spill", width);
         return false;
      }

      const unsigned workgroup_size = prog_data->local_size[0] *
                                      prog_data->local_size[1] *
                                      prog_data->local_size[2];

      unsigned max_threads = devinfo->max_cs_workgroup_threads;

      if (required && required != width) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because required dispatch width is %u",
            width, required);
         return false;
      }

      /* TODO: Ignore SIMD larger than the workgroup if a previous SIMD
       * already passed.
       */

      if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u can't fit all %u invocations in %u threads",
            width, workgroup_size, max_threads);
         return false;
      }

      /* SIMD32 is only enabled when it is needed, unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32) {
         if (!INTEL_DEBUG(DEBUG_DO32) && prog_data->prog_mask) {
            *error = ralloc_strdup(
               mem_ctx, "SIMD32 skipped because not required");
            return false;
         }
      }
   }

   const bool env_skip[3] = {
      INTEL_DEBUG(DEBUG_NO8),
      INTEL_DEBUG(DEBUG_NO16),
      INTEL_DEBUG(DEBUG_NO32),
   };

   if (unlikely(env_skip[simd])) {
      *error = ralloc_asprintf(
         mem_ctx, "SIMD%u skipped because INTEL_DEBUG=no%u",
         width, width);
      return false;
   }

   return true;
}
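
/* Records that the variant with index `simd` was successfully compiled and
 * whether it spilled registers.
 */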
void
brw_simd_mark_compiled(unsigned simd, struct brw_cs_prog_data *prog_data, bool spilled)
{
   assert(!test_bit(prog_data->prog_mask, simd));

   prog_data->prog_mask |= 1u << simd;

   /* If a SIMD spilled, all the larger ones would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < 3; i++)
         prog_data->prog_spilled |= 1u << i;
   }
}
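
/* Picks the variant to dispatch: the widest compiled SIMD that did not
 * spill, otherwise the narrowest compiled one, or -1 if nothing compiled.
 */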
int
brw_simd_select(const struct brw_cs_prog_data *prog_data)
{
   assert((prog_data->prog_mask & ~0x7u) == 0);
   const unsigned not_spilled_mask =
      prog_data->prog_mask & ~prog_data->prog_spilled;

   /* Util functions index bits from 1 instead of 0, adjust before return. */
   if (not_spilled_mask)
      return util_last_bit(not_spilled_mask) - 1;
   else if (prog_data->prog_mask)
      return ffs(prog_data->prog_mask) - 1;
   else
      return -1;
}
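
/* Variant of brw_simd_select() for variable workgroup size: re-runs the
 * brw_simd_should_compile() checks against the actual dispatch-time sizes on
 * a throwaway clone of the prog_data, then selects among the variants that
 * both pass the checks and were actually compiled.
 */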
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   assert(sizes);

   if (prog_data->local_size[0] == sizes[0] &&
       prog_data->local_size[1] == sizes[1] &&
       prog_data->local_size[2] == sizes[2])
      return brw_simd_select(prog_data);

   void *mem_ctx = ralloc_context(NULL);

   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];

   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   const char *error[3] = {0};

   for (unsigned simd = 0; simd < 3; simd++) {
      /* We are not recompiling, so use the original results of prog_mask and
       * prog_spilled, as they already contain all possible compilations.
       */
      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
                                  0 /* required_dispatch_width */, &error[simd]) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd, &cloned, test_bit(prog_data->prog_spilled, simd));
      }
   }

   ralloc_free(mem_ctx);

   return brw_simd_select(&cloned);
}