mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-05 11:10:10 +01:00
clover: Calculate optimal work group size when it's not specified by the user.
Inspired by a patch sent to the mailing list by Tom Stellard, but using a different algorithm to calculate the optimal block size that has been found to be considerably more effective. Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
This commit is contained in:
parent
67a3037444
commit
bf045bf9b4
5 changed files with 176 additions and 15 deletions
|
|
@ -4,6 +4,7 @@ CPP_SOURCES := \
|
|||
util/algorithm.hpp \
|
||||
util/compat.cpp \
|
||||
util/compat.hpp \
|
||||
util/factor.hpp \
|
||||
util/functional.hpp \
|
||||
util/lazy.hpp \
|
||||
util/pointer.hpp \
|
||||
|
|
|
|||
|
|
@ -159,7 +159,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
|
|||
break;
|
||||
|
||||
case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
|
||||
buf.as_vector<size_t>() = kern.block_size();
|
||||
buf.as_vector<size_t>() = kern.required_block_size();
|
||||
break;
|
||||
|
||||
case CL_KERNEL_LOCAL_MEM_SIZE:
|
||||
|
|
@ -220,6 +220,24 @@ namespace {
|
|||
if (!d_grid_size || any_of(is_zero(), grid_size))
|
||||
throw error(CL_INVALID_GLOBAL_WORK_SIZE);
|
||||
|
||||
return grid_size;
|
||||
}
|
||||
|
||||
std::vector<size_t>
|
||||
validate_grid_offset(const command_queue &q, cl_uint dims,
|
||||
const size_t *d_grid_offset) {
|
||||
if (d_grid_offset)
|
||||
return range(d_grid_offset, dims);
|
||||
else
|
||||
return std::vector<size_t>(dims, 0);
|
||||
}
|
||||
|
||||
std::vector<size_t>
|
||||
validate_block_size(const command_queue &q, const kernel &kern,
|
||||
cl_uint dims, const size_t *d_grid_size,
|
||||
const size_t *d_block_size) {
|
||||
auto grid_size = range(d_grid_size, dims);
|
||||
|
||||
if (d_block_size) {
|
||||
auto block_size = range(d_block_size, dims);
|
||||
|
||||
|
|
@ -233,15 +251,12 @@ namespace {
|
|||
if (fold(multiplies(), 1u, block_size) >
|
||||
q.dev.max_threads_per_block())
|
||||
throw error(CL_INVALID_WORK_GROUP_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t>
|
||||
pad_vector(const size_t *p, unsigned n, size_t x) {
|
||||
if (p)
|
||||
return { p, p + n };
|
||||
else
|
||||
return { n, x };
|
||||
return block_size;
|
||||
|
||||
} else {
|
||||
return kern.optimal_block_size(q, grid_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -254,13 +269,13 @@ clEnqueueNDRangeKernel(cl_command_queue d_q, cl_kernel d_kern,
|
|||
auto &q = obj(d_q);
|
||||
auto &kern = obj(d_kern);
|
||||
auto deps = objs<wait_list_tag>(d_deps, num_deps);
|
||||
auto grid_size = validate_grid_size(q, dims, d_grid_size);
|
||||
auto grid_offset = validate_grid_offset(q, dims, d_grid_offset);
|
||||
auto block_size = validate_block_size(q, kern, dims,
|
||||
d_grid_size, d_block_size);
|
||||
|
||||
validate_common(q, kern, deps);
|
||||
validate_grid(q, dims, d_grid_size, d_block_size);
|
||||
|
||||
auto grid_offset = pad_vector(d_grid_offset, dims, 0);
|
||||
auto grid_size = pad_vector(d_grid_size, dims, 1);
|
||||
auto block_size = pad_vector(d_block_size, dims, 1);
|
||||
hard_event *hev = new hard_event(
|
||||
q, CL_COMMAND_NDRANGE_KERNEL, deps,
|
||||
[=, &kern, &q](event &) {
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
#include "core/kernel.hpp"
|
||||
#include "core/resource.hpp"
|
||||
#include "util/factor.hpp"
|
||||
#include "util/u_math.h"
|
||||
#include "pipe/p_context.h"
|
||||
|
||||
|
|
@ -126,7 +127,15 @@ kernel::name() const {
|
|||
}
|
||||
|
||||
std::vector<size_t>
|
||||
kernel::block_size() const {
|
||||
kernel::optimal_block_size(const command_queue &q,
|
||||
const std::vector<size_t> &grid_size) const {
|
||||
return factor::find_grid_optimal_factor<size_t>(
|
||||
q.dev.max_threads_per_block(), q.dev.max_block_size(),
|
||||
grid_size);
|
||||
}
|
||||
|
||||
std::vector<size_t>
|
||||
kernel::required_block_size() const {
|
||||
return { 0, 0, 0 };
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -121,7 +121,12 @@ namespace clover {
|
|||
size_t mem_private() const;
|
||||
|
||||
const std::string &name() const;
|
||||
std::vector<size_t> block_size() const;
|
||||
|
||||
std::vector<size_t>
|
||||
optimal_block_size(const command_queue &q,
|
||||
const std::vector<size_t> &grid_size) const;
|
||||
std::vector<size_t>
|
||||
required_block_size() const;
|
||||
|
||||
argument_range args();
|
||||
const_argument_range args() const;
|
||||
|
|
|
|||
131
src/gallium/state_trackers/clover/util/factor.hpp
Normal file
131
src/gallium/state_trackers/clover/util/factor.hpp
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
//
|
||||
// Copyright 2013 Francisco Jerez
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a
|
||||
// copy of this software and associated documentation files (the "Software"),
|
||||
// to deal in the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
//
|
||||
|
||||
#ifndef CLOVER_UTIL_FACTOR_HPP
|
||||
#define CLOVER_UTIL_FACTOR_HPP
|
||||
|
||||
#include "util/range.hpp"
|
||||
|
||||
namespace clover {
|
||||
namespace factor {
|
||||
///
|
||||
/// Calculate all prime integer factors of \p x.
|
||||
///
|
||||
/// If \p limit is non-zero, terminate early as soon as enough
|
||||
/// factors have been collected to reach the product \p limit.
|
||||
///
|
||||
template<typename T>
|
||||
std::vector<T>
|
||||
find_integer_prime_factors(T x, T limit = 0)
|
||||
{
|
||||
const T max_d = (limit > 0 && limit < x ? limit : x);
|
||||
const T min_x = x / max_d;
|
||||
std::vector<T> factors;
|
||||
|
||||
for (T d = 2; d <= max_d && x > min_x; d++) {
|
||||
if (x % d == 0) {
|
||||
for (; x % d == 0; x /= d);
|
||||
factors.push_back(d);
|
||||
}
|
||||
}
|
||||
|
||||
return factors;
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
///
|
||||
/// Walk the power set of prime factors of the n-dimensional
|
||||
/// integer array \p grid subject to the constraints given by
|
||||
/// \p limits.
|
||||
///
|
||||
template<typename T>
|
||||
std::pair<T, std::vector<T>>
|
||||
next_grid_factor(const std::pair<T, std::vector<T>> &limits,
|
||||
const std::vector<T> &grid,
|
||||
const std::vector<std::vector<T>> &factors,
|
||||
std::pair<T, std::vector<T>> block,
|
||||
unsigned d = 0, unsigned i = 0) {
|
||||
if (d >= factors.size()) {
|
||||
// We're done.
|
||||
return {};
|
||||
|
||||
} else if (i >= factors[d].size()) {
|
||||
// We're done with this grid dimension, try the next.
|
||||
return next_grid_factor(limits, grid, factors,
|
||||
std::move(block), d + 1, 0);
|
||||
|
||||
} else {
|
||||
T f = factors[d][i];
|
||||
|
||||
// Try the next power of this factor.
|
||||
block.first *= f;
|
||||
block.second[d] *= f;
|
||||
|
||||
if (block.first <= limits.first &&
|
||||
block.second[d] <= limits.second[d] &&
|
||||
grid[d] % block.second[d] == 0) {
|
||||
// We've found a valid grid divisor.
|
||||
return block;
|
||||
|
||||
} else {
|
||||
// Overflow, back off to the zeroth power,
|
||||
while (block.second[d] % f == 0) {
|
||||
block.second[d] /= f;
|
||||
block.first /= f;
|
||||
}
|
||||
|
||||
// ...and carry to the next factor.
|
||||
return next_grid_factor(limits, grid, factors,
|
||||
std::move(block), d, i + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Find the divisor of the integer array \p grid that gives the
|
||||
/// highest possible product not greater than \p product_limit
|
||||
/// subject to the constraints given by \p coord_limit.
|
||||
///
|
||||
template<typename T>
|
||||
std::vector<T>
|
||||
find_grid_optimal_factor(T product_limit,
|
||||
const std::vector<T> &coord_limit,
|
||||
const std::vector<T> &grid) {
|
||||
const std::vector<std::vector<T>> factors =
|
||||
map(find_integer_prime_factors<T>, grid, coord_limit);
|
||||
const auto limits = std::make_pair(product_limit, coord_limit);
|
||||
auto best = std::make_pair(T(1), std::vector<T>(grid.size(), T(1)));
|
||||
|
||||
for (auto block = best;
|
||||
block.first != 0 && best.first != product_limit;
|
||||
block = detail::next_grid_factor(limits, grid, factors, block)) {
|
||||
if (block.first > best.first)
|
||||
best = block;
|
||||
}
|
||||
|
||||
return best.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
Loading…
Add table
Reference in a new issue