mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-02 04:48:26 +02:00
This is done by grepping for ALIGN( and replacing it with align(; docs, *.xml and blake3 are excluded. Signed-off-by: Yonggang Luo <luoyonggang@gmail.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38365>
352 lines
12 KiB
C
352 lines
12 KiB
C
/*
|
|
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "rkt_task.h"
|
|
#include "rkt_ml.h"
|
|
|
|
static unsigned
|
|
calc_entries_per_slice(struct rkt_operation *operation)
|
|
{
|
|
unsigned bpe = sizeof(uint8_t);
|
|
unsigned atomics_per_entry = CBUF_ENTRY_SIZE / FEATURE_ATOMIC_SIZE;
|
|
unsigned total_c_atomics =
|
|
DIV_ROUND_UP(operation->input_channels * bpe, FEATURE_ATOMIC_SIZE);
|
|
unsigned last_c_atomics = total_c_atomics % atomics_per_entry;
|
|
unsigned int_c_entries =
|
|
(total_c_atomics / atomics_per_entry) * operation->input_width;
|
|
unsigned frac_c_entries =
|
|
(last_c_atomics == 3)
|
|
? operation->input_width
|
|
: DIV_ROUND_UP(last_c_atomics * operation->input_width,
|
|
atomics_per_entry);
|
|
|
|
return int_c_entries + frac_c_entries;
|
|
}
|
|
|
|
static unsigned
|
|
calc_input_banks(struct rkt_operation *operation)
|
|
{
|
|
unsigned entries_per_slice = calc_entries_per_slice(operation);
|
|
return DIV_ROUND_UP(entries_per_slice * operation->input_height,
|
|
CBUF_ENTRIES_PER_BANK);
|
|
}
|
|
|
|
static unsigned
|
|
calc_weights_banks(struct rkt_operation *operation)
|
|
{
|
|
unsigned bpe = sizeof(uint8_t);
|
|
unsigned bytes = operation->weights_width * operation->weights_height *
|
|
operation->input_channels * bpe;
|
|
unsigned entries;
|
|
unsigned banks;
|
|
|
|
if (!operation->depthwise)
|
|
bytes *= operation->output_channels;
|
|
entries = DIV_ROUND_UP(bytes, CBUF_ENTRY_SIZE);
|
|
banks = DIV_ROUND_UP(entries, CBUF_ENTRIES_PER_BANK);
|
|
|
|
/* Why do we need an extra bank? The calc above might be wrong on this HW */
|
|
banks++;
|
|
|
|
return banks;
|
|
}
|
|
|
|
static unsigned
|
|
calc_line_stride(unsigned width)
|
|
{
|
|
return width * ATOMIC_K_SIZE * sizeof(uint8_t);
|
|
}
|
|
|
|
static void
|
|
calc_explicit_padding(const struct rkt_operation *operation,
|
|
unsigned *pad_top, unsigned *pad_bottom,
|
|
unsigned *pad_left, unsigned *pad_right)
|
|
{
|
|
if (operation->padding_same && operation->weights_width > 1) {
|
|
/* Convert from implicit to explicit padding */
|
|
unsigned pad_along_width =
|
|
MAX2((operation->output_width - 1) * operation->stride +
|
|
operation->weights_width - operation->input_width,
|
|
0);
|
|
unsigned pad_along_height =
|
|
MAX2((operation->output_height - 1) * operation->stride +
|
|
operation->weights_height - operation->input_height,
|
|
0);
|
|
*pad_left = pad_along_height / 2;
|
|
*pad_right = pad_along_height - *pad_left;
|
|
*pad_top = pad_along_width / 2;
|
|
*pad_bottom = pad_along_width - *pad_top;
|
|
} else {
|
|
*pad_left = 0;
|
|
*pad_right = 0;
|
|
*pad_top = 0;
|
|
*pad_bottom = 0;
|
|
}
|
|
}
|
|
|
|
static void
|
|
fill_task(struct rkt_ml_subgraph *subgraph,
|
|
struct rkt_operation *operation,
|
|
struct split_task *task)
|
|
{
|
|
task->stride_x = operation->stride;
|
|
task->stride_y = operation->stride;
|
|
|
|
task->input_width = operation->input_width;
|
|
if (task->input_width == 8 &&
|
|
(operation->addition_input || operation->add_tensor != -1))
|
|
task->input_width *= 2;
|
|
|
|
task->input_height = operation->input_height;
|
|
task->input_channels =
|
|
align(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE),
|
|
FEATURE_ATOMIC_SIZE);
|
|
task->input_channels_real = operation->input_channels;
|
|
task->input_zero_point = operation->input_zero_point;
|
|
task->input_scale = operation->input_scale;
|
|
|
|
task->output_width = operation->output_width;
|
|
task->output_height = operation->output_height;
|
|
|
|
task->output_channels_real = operation->output_channels;
|
|
task->output_channels = align(MAX2(operation->output_channels, 32), 32);
|
|
if (operation->depthwise) {
|
|
if (task->output_channels_real <= 32)
|
|
task->output_channels *= 2;
|
|
task->output_channels = align(task->output_channels, 64);
|
|
}
|
|
|
|
task->output_zero_point = operation->output_zero_point;
|
|
task->output_scale = operation->output_scale;
|
|
|
|
if (task->input_channels_real == 1 &&
|
|
(task->output_channels_real > 1 ||
|
|
(operation->addition_input || operation->add_tensor != -1))) {
|
|
task->input_width = MAX2(task->input_width, FEATURE_ATOMIC_SIZE);
|
|
task->input_line_stride =
|
|
MAX2(calc_line_stride(operation->input_width) / FEATURE_ATOMIC_SIZE,
|
|
FEATURE_ATOMIC_SIZE);
|
|
|
|
if (operation->input_channels == 32 && operation->input_width == 80) {
|
|
task->input_line_stride *= 4;
|
|
task->input_surface_stride = (float)task->input_line_stride *
|
|
(((float)task->input_height / 4) - 1);
|
|
} else
|
|
task->input_surface_stride =
|
|
(float)task->input_line_stride * (((float)task->input_height) - 1);
|
|
} else {
|
|
task->input_line_stride = calc_line_stride(operation->input_width) / 4;
|
|
task->input_surface_stride =
|
|
(float)task->input_line_stride * (((float)task->input_height / 4) - 1);
|
|
}
|
|
|
|
if (task->input_width == 8 &&
|
|
(operation->addition_input || operation->add_tensor != -1)) {
|
|
task->input_line_stride /= 2;
|
|
task->input_surface_stride = 112;
|
|
}
|
|
|
|
int output_line_stride = calc_line_stride(operation->output_width);
|
|
task->output_surface_stride = output_line_stride * task->output_height;
|
|
task->output_surface_stride /= FEATURE_ATOMIC_SIZE;
|
|
|
|
if (task->input_channels_real == 1)
|
|
task->input_data_entries = task->input_width * task->input_height;
|
|
else if (task->input_width == 40 && task->input_channels_real == 40)
|
|
task->input_data_entries = 40;
|
|
else
|
|
task->input_data_entries = DIV_ROUND_UP(
|
|
task->input_width * 2 *
|
|
DIV_ROUND_UP(task->input_channels_real, FEATURE_ATOMIC_SIZE),
|
|
8);
|
|
|
|
task->weights_width = operation->weights_width;
|
|
task->weights_height = operation->weights_height;
|
|
task->weights_zero_point = operation->weights_zero_point;
|
|
task->weights_scale = operation->weights_scale;
|
|
|
|
if (operation->depthwise)
|
|
task->weights_kernels = 1;
|
|
else
|
|
task->weights_kernels = align(operation->output_channels, 2);
|
|
|
|
task->surfaces_per_row = task->output_width * task->output_height * 2;
|
|
if (operation->depthwise)
|
|
task->surfaces_per_row *= 2;
|
|
}
|
|
|
|
void
|
|
rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
|
|
struct rkt_operation *operation)
|
|
{
|
|
/* Function mostly taken from NVDLA */
|
|
unsigned entries_per_slice = calc_entries_per_slice(operation);
|
|
unsigned input_banks_required = calc_input_banks(operation);
|
|
unsigned weights_banks_required = calc_weights_banks(operation);
|
|
unsigned available_weights_banks = weights_banks_required;
|
|
unsigned available_input_banks = CBUF_BANKS - weights_banks_required;
|
|
unsigned pad_top;
|
|
unsigned pad_bottom;
|
|
unsigned pad_left;
|
|
unsigned pad_right;
|
|
|
|
calc_explicit_padding(operation, &pad_top, &pad_bottom, &pad_left,
|
|
&pad_right);
|
|
|
|
if (weights_banks_required + 1 < CBUF_BANKS) {
|
|
/* Full weights, partial input */
|
|
operation->reuse_weights_cbuf = true;
|
|
} else {
|
|
/* Partial weights, partial input */
|
|
operation->reuse_weights_cbuf = false;
|
|
available_input_banks = 7;
|
|
available_weights_banks = CBUF_BANKS - available_input_banks;
|
|
}
|
|
|
|
if (input_banks_required <= available_input_banks) {
|
|
/* Full weights, full input */
|
|
|
|
struct split_task task = {0};
|
|
|
|
task.num = 0;
|
|
fill_task(subgraph, operation, &task);
|
|
task.input_banks = input_banks_required;
|
|
task.weights_banks = CBUF_BANKS - task.input_banks;
|
|
task.input_height = operation->input_height;
|
|
|
|
task.pad_top = pad_top;
|
|
task.pad_bottom = pad_bottom;
|
|
task.pad_left = pad_left;
|
|
task.pad_right = pad_right;
|
|
|
|
task.atomic_count = task.output_width * task.output_height;
|
|
|
|
util_dynarray_append(&operation->tasks, task);
|
|
|
|
return;
|
|
}
|
|
|
|
struct split_task task = {0};
|
|
unsigned available_slices =
|
|
(CBUF_ENTRIES_PER_BANK * available_input_banks) / entries_per_slice;
|
|
|
|
task.num = 0;
|
|
fill_task(subgraph, operation, &task);
|
|
task.input_banks = available_input_banks;
|
|
task.weights_banks = available_weights_banks;
|
|
|
|
task.top_slice = 0;
|
|
task.bottom_slice = available_slices - 1;
|
|
|
|
task.pad_top = pad_top;
|
|
task.pad_left = pad_left;
|
|
task.pad_right = pad_right;
|
|
|
|
util_dynarray_append(&operation->tasks, task);
|
|
|
|
for (unsigned slice = operation->weights_height - pad_top - 1;
|
|
slice < operation->input_height;) {
|
|
memset(&task, 0, sizeof(task));
|
|
|
|
struct split_task *prev_task = util_dynarray_element(
|
|
&operation->tasks, struct split_task,
|
|
util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
|
|
|
|
while (slice <= prev_task->bottom_slice) {
|
|
slice += operation->stride;
|
|
}
|
|
if (slice > prev_task->bottom_slice) {
|
|
slice -= operation->stride;
|
|
}
|
|
|
|
task.num = util_dynarray_num_elements(&operation->tasks, struct split_task);
|
|
fill_task(subgraph, operation, &task);
|
|
task.top_slice = MIN2(slice, prev_task->bottom_slice) -
|
|
(operation->weights_height - 1) + operation->stride;
|
|
task.bottom_slice = task.top_slice + available_slices - 1;
|
|
task.pad_left = pad_left;
|
|
task.pad_right = pad_right;
|
|
|
|
// check if current task is the last one
|
|
if (task.bottom_slice >= operation->input_height - 1) {
|
|
task.bottom_slice = operation->input_height - 1;
|
|
task.pad_bottom = pad_bottom;
|
|
util_dynarray_append(&operation->tasks, task);
|
|
break;
|
|
}
|
|
|
|
slice = task.top_slice + operation->weights_height - 1;
|
|
util_dynarray_append(&operation->tasks, task);
|
|
}
|
|
|
|
struct split_task *last_task = util_dynarray_element(
|
|
&operation->tasks, struct split_task,
|
|
util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
|
|
if (last_task->top_slice >= operation->input_height ||
|
|
last_task->bottom_slice >= (operation->input_height + pad_bottom)) {
|
|
(void)util_dynarray_pop_ptr(&operation->tasks, struct split_task);
|
|
}
|
|
|
|
// determine overlap slices between 2 split chunks
|
|
for (int i = 1;
|
|
i < util_dynarray_num_elements(&operation->tasks, struct split_task);
|
|
i++) {
|
|
struct split_task *prev_task =
|
|
util_dynarray_element(&operation->tasks, struct split_task, i - 1);
|
|
struct split_task *cur_task =
|
|
util_dynarray_element(&operation->tasks, struct split_task, i);
|
|
|
|
if (prev_task->bottom_slice >= cur_task->top_slice) {
|
|
cur_task->num_overlap_slices =
|
|
prev_task->bottom_slice - cur_task->top_slice + 1;
|
|
prev_task->num_retain_slices = cur_task->num_overlap_slices;
|
|
} else {
|
|
cur_task->num_overlap_slices = 0;
|
|
prev_task->num_retain_slices = 0;
|
|
}
|
|
}
|
|
|
|
unsigned output_height_processed = 0;
|
|
for (int i = 0;
|
|
i < util_dynarray_num_elements(&operation->tasks, struct split_task);
|
|
i++) {
|
|
struct split_task *cur_task =
|
|
util_dynarray_element(&operation->tasks, struct split_task, i);
|
|
|
|
unsigned slice = cur_task->top_slice + (operation->weights_height - 1) -
|
|
cur_task->pad_top;
|
|
|
|
while (slice <= cur_task->bottom_slice + cur_task->pad_bottom) {
|
|
slice += operation->stride;
|
|
cur_task->convolutions++;
|
|
}
|
|
|
|
cur_task->bottom_slice =
|
|
MIN2(cur_task->bottom_slice, operation->input_height - 1);
|
|
|
|
cur_task->input_height = cur_task->bottom_slice - cur_task->top_slice + 1;
|
|
|
|
cur_task->output_width = (cur_task->input_width + cur_task->pad_left +
|
|
cur_task->pad_right - operation->weights_width) /
|
|
operation->stride +
|
|
1;
|
|
cur_task->output_height =
|
|
(cur_task->input_height + cur_task->pad_top + cur_task->pad_bottom -
|
|
operation->weights_height) /
|
|
operation->stride +
|
|
1;
|
|
cur_task->atomic_count = cur_task->output_width * cur_task->output_height;
|
|
|
|
cur_task->input_offset =
|
|
calc_line_stride(operation->input_width) * cur_task->top_slice;
|
|
cur_task->output_offset =
|
|
calc_line_stride(operation->output_width) * output_height_processed;
|
|
|
|
cur_task->input_banks = available_input_banks;
|
|
cur_task->weights_banks = available_weights_banks;
|
|
|
|
output_height_processed += cur_task->output_height;
|
|
}
|
|
}
|