mesa/src/gallium/drivers/rocket/rkt_task.c
Yonggang Luo ecb0ccf603 treewide: Replace calling to function ALIGN with align
This is done by grep ALIGN( to align(

docs,*.xml,blake3 is excluded

Signed-off-by: Yonggang Luo <luoyonggang@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38365>
2025-11-12 21:58:40 +00:00

352 lines
12 KiB
C

/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "rkt_task.h"
#include "rkt_ml.h"
/*
 * Number of CBUF entries taken by one input slice (one row of the input
 * feature map) of the given operation.
 *
 * The channel dimension is first expressed in atomics of
 * FEATURE_ATOMIC_SIZE bytes.  Every group of atomics that completely fills
 * an entry costs one entry per input column; the atomics left over after
 * that are accounted for separately below.
 */
static unsigned
calc_entries_per_slice(struct rkt_operation *operation)
{
   const unsigned bytes_per_element = sizeof(uint8_t);
   const unsigned width = operation->input_width;
   const unsigned atomics_in_entry = CBUF_ENTRY_SIZE / FEATURE_ATOMIC_SIZE;
   const unsigned channel_atomics = DIV_ROUND_UP(
      operation->input_channels * bytes_per_element, FEATURE_ATOMIC_SIZE);

   /* Entries that are completely filled by channel data. */
   unsigned whole_entries = (channel_atomics / atomics_in_entry) * width;

   /* Atomics that do not fill a whole entry on their own. */
   unsigned leftover_atomics = channel_atomics % atomics_in_entry;
   unsigned partial_entries;

   if (leftover_atomics == 3) {
      /* A remainder of exactly three atomics is charged one entry per
       * input column instead of being packed. */
      partial_entries = width;
   } else {
      partial_entries =
         DIV_ROUND_UP(leftover_atomics * width, atomics_in_entry);
   }

   return whole_entries + partial_entries;
}
/*
 * Number of CBUF banks needed to hold the whole input of the operation:
 * entries per slice times the input height, rounded up to whole banks.
 */
static unsigned
calc_input_banks(struct rkt_operation *operation)
{
   unsigned total_entries =
      calc_entries_per_slice(operation) * operation->input_height;

   return DIV_ROUND_UP(total_entries, CBUF_ENTRIES_PER_BANK);
}
/*
 * Number of CBUF banks reserved for the weights of the operation.
 *
 * The weight size in bytes is kernel width * kernel height * input
 * channels, multiplied by the number of output channels unless the
 * operation is depthwise.  That size is rounded up to entries and then to
 * banks, and one additional bank is added on top.
 */
static unsigned
calc_weights_banks(struct rkt_operation *operation)
{
   const unsigned bytes_per_element = sizeof(uint8_t);
   unsigned weight_bytes = operation->weights_width *
                           operation->weights_height *
                           operation->input_channels * bytes_per_element;

   if (!operation->depthwise)
      weight_bytes *= operation->output_channels;

   unsigned weight_entries = DIV_ROUND_UP(weight_bytes, CBUF_ENTRY_SIZE);
   unsigned weight_banks =
      DIV_ROUND_UP(weight_entries, CBUF_ENTRIES_PER_BANK);

   /* Why do we need an extra bank? The calc above might be wrong on this HW */
   return weight_banks + 1;
}
/*
 * Size in bytes of one line that is `width` elements wide, where each
 * element spans ATOMIC_K_SIZE one-byte values.
 */
static unsigned
calc_line_stride(unsigned width)
{
   const unsigned bytes_per_element = sizeof(uint8_t);

   return width * ATOMIC_K_SIZE * bytes_per_element;
}
/*
 * Convert the implicit "SAME" padding of an operation into explicit
 * per-edge padding.
 *
 * When the operation uses SAME padding and its kernel is wider than one
 * element, the total padding along each axis is
 *
 *    max((output - 1) * stride + kernel - input, 0)
 *
 * and it is split so that the leading edge (top / left) receives the
 * rounded-down half and the trailing edge (bottom / right) the rest.
 * In every other case all four outputs are set to zero.
 *
 * pad_top and pad_bottom are derived from the height terms, pad_left and
 * pad_right from the width terms.
 */
static void
calc_explicit_padding(const struct rkt_operation *operation,
                      unsigned *pad_top, unsigned *pad_bottom,
                      unsigned *pad_left, unsigned *pad_right)
{
   *pad_top = 0;
   *pad_bottom = 0;
   *pad_left = 0;
   *pad_right = 0;

   if (!operation->padding_same || operation->weights_width <= 1)
      return;

   /* Do the arithmetic in signed integers: with unsigned operands a
    * negative result would wrap around to a huge value and the clamp to
    * zero below would never trigger. */
   int width_deficit =
      ((int)operation->output_width - 1) * (int)operation->stride +
      (int)operation->weights_width - (int)operation->input_width;
   int height_deficit =
      ((int)operation->output_height - 1) * (int)operation->stride +
      (int)operation->weights_height - (int)operation->input_height;

   unsigned pad_along_width = (unsigned)MAX2(width_deficit, 0);
   unsigned pad_along_height = (unsigned)MAX2(height_deficit, 0);

   /* Horizontal edges come from the width padding ... */
   *pad_left = pad_along_width / 2;
   *pad_right = pad_along_width - *pad_left;

   /* ... and vertical edges from the height padding. */
   *pad_top = pad_along_height / 2;
   *pad_bottom = pad_along_height - *pad_top;
}
/*
 * Fill in the fields of `task` that are derived directly from `operation`:
 * strides, input/output dimensions and quantization parameters, input
 * line/surface strides, output surface stride, input data entries, weights
 * parameters and surfaces_per_row.
 *
 * `subgraph` is not referenced in this function.
 *
 * Fields that depend on how the operation is split (banks, slices,
 * padding, offsets, ...) are not written here.
 */
static void
fill_task(struct rkt_ml_subgraph *subgraph,
struct rkt_operation *operation,
struct split_task *task)
{
/* The same stride is used on both axes. */
task->stride_x = operation->stride;
task->stride_y = operation->stride;
task->input_width = operation->input_width;
/* An 8-wide input that takes part in an addition is treated as twice as
 * wide. */
if (task->input_width == 8 &&
(operation->addition_input || operation->add_tensor != -1))
task->input_width *= 2;
task->input_height = operation->input_height;
/* Channel count is at least FEATURE_ATOMIC_SIZE and aligned to it; the
 * original count is kept in input_channels_real. */
task->input_channels =
align(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE),
FEATURE_ATOMIC_SIZE);
task->input_channels_real = operation->input_channels;
task->input_zero_point = operation->input_zero_point;
task->input_scale = operation->input_scale;
task->output_width = operation->output_width;
task->output_height = operation->output_height;
task->output_channels_real = operation->output_channels;
/* Output channels are at least 32 and aligned to 32. */
task->output_channels = align(MAX2(operation->output_channels, 32), 32);
if (operation->depthwise) {
/* Depthwise: double the count when there are at most 32 real output
 * channels, then align to 64. */
if (task->output_channels_real <= 32)
task->output_channels *= 2;
task->output_channels = align(task->output_channels, 64);
}
task->output_zero_point = operation->output_zero_point;
task->output_scale = operation->output_scale;
/* Single-channel input that either feeds more than one output channel or
 * takes part in an addition gets its own stride computation. */
if (task->input_channels_real == 1 &&
(task->output_channels_real > 1 ||
(operation->addition_input || operation->add_tensor != -1))) {
task->input_width = MAX2(task->input_width, FEATURE_ATOMIC_SIZE);
task->input_line_stride =
MAX2(calc_line_stride(operation->input_width) / FEATURE_ATOMIC_SIZE,
FEATURE_ATOMIC_SIZE);
/* NOTE(review): this branch is only entered when
 * task->input_channels_real == 1, and that field was copied from
 * operation->input_channels above, so the input_channels == 32 test
 * below looks unreachable -- confirm. */
if (operation->input_channels == 32 && operation->input_width == 80) {
task->input_line_stride *= 4;
task->input_surface_stride = (float)task->input_line_stride *
(((float)task->input_height / 4) - 1);
} else
task->input_surface_stride =
(float)task->input_line_stride * (((float)task->input_height) - 1);
} else {
/* General case: a quarter of the byte line stride, and a surface
 * stride based on a quarter of the input height minus one. */
task->input_line_stride = calc_line_stride(operation->input_width) / 4;
task->input_surface_stride =
(float)task->input_line_stride * (((float)task->input_height / 4) - 1);
}
/* NOTE(review): task->input_width was already doubled above when it was 8
 * under this same addition condition, so unless the MAX2 above produced 8
 * this test cannot hold -- confirm whether operation->input_width was
 * intended.  The value 112 is hard-coded; its origin is not visible here. */
if (task->input_width == 8 &&
(operation->addition_input || operation->add_tensor != -1)) {
task->input_line_stride /= 2;
task->input_surface_stride = 112;
}
/* Output surface stride: bytes of the whole output plane divided by
 * FEATURE_ATOMIC_SIZE. */
int output_line_stride = calc_line_stride(operation->output_width);
task->output_surface_stride = output_line_stride * task->output_height;
task->output_surface_stride /= FEATURE_ATOMIC_SIZE;
/* Input data entries: one per pixel for single-channel input, a fixed 40
 * for the 40x40-channel special case, otherwise derived from the width
 * and the number of channel atomics. */
if (task->input_channels_real == 1)
task->input_data_entries = task->input_width * task->input_height;
else if (task->input_width == 40 && task->input_channels_real == 40)
task->input_data_entries = 40;
else
task->input_data_entries = DIV_ROUND_UP(
task->input_width * 2 *
DIV_ROUND_UP(task->input_channels_real, FEATURE_ATOMIC_SIZE),
8);
task->weights_width = operation->weights_width;
task->weights_height = operation->weights_height;
task->weights_zero_point = operation->weights_zero_point;
task->weights_scale = operation->weights_scale;
/* Depthwise uses a single kernel; otherwise the number of kernels is the
 * output channel count aligned to 2. */
if (operation->depthwise)
task->weights_kernels = 1;
else
task->weights_kernels = align(operation->output_channels, 2);
/* Twice the number of output pixels, doubled again for depthwise. */
task->surfaces_per_row = task->output_width * task->output_height * 2;
if (operation->depthwise)
task->surfaces_per_row *= 2;
}
/*
 * Split `operation` into one or more split_task entries and append them to
 * operation->tasks.
 *
 * If the whole input fits in the CBUF banks that remain after the weights,
 * a single task covering the full input is emitted.  Otherwise the input is
 * cut along its height into groups of slices that fit in the available
 * input banks, and per-task overlap, convolution count, dimensions and
 * offsets are computed afterwards.
 *
 * Also sets operation->reuse_weights_cbuf.
 */
void
rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
struct rkt_operation *operation)
{
/* Function mostly taken from NVDLA */
unsigned entries_per_slice = calc_entries_per_slice(operation);
unsigned input_banks_required = calc_input_banks(operation);
unsigned weights_banks_required = calc_weights_banks(operation);
/* Start by giving the weights everything they need and the input
 * whatever is left. */
unsigned available_weights_banks = weights_banks_required;
unsigned available_input_banks = CBUF_BANKS - weights_banks_required;
unsigned pad_top;
unsigned pad_bottom;
unsigned pad_left;
unsigned pad_right;
calc_explicit_padding(operation, &pad_top, &pad_bottom, &pad_left,
&pad_right);
if (weights_banks_required + 1 < CBUF_BANKS) {
/* Full weights, partial input */
operation->reuse_weights_cbuf = true;
} else {
/* Partial weights, partial input */
operation->reuse_weights_cbuf = false;
/* Fixed split: 7 banks for input, the remainder for weights. */
available_input_banks = 7;
available_weights_banks = CBUF_BANKS - available_input_banks;
}
if (input_banks_required <= available_input_banks) {
/* Full weights, full input */
struct split_task task = {0};
task.num = 0;
fill_task(subgraph, operation, &task);
task.input_banks = input_banks_required;
/* Weights get every bank not used by the input. */
task.weights_banks = CBUF_BANKS - task.input_banks;
task.input_height = operation->input_height;
task.pad_top = pad_top;
task.pad_bottom = pad_bottom;
task.pad_left = pad_left;
task.pad_right = pad_right;
task.atomic_count = task.output_width * task.output_height;
util_dynarray_append(&operation->tasks, task);
return;
}
/* The input does not fit: emit the first task, which starts at slice 0
 * and spans as many slices as fit in the available input banks.  Its
 * pad_bottom stays at zero from the initializer. */
struct split_task task = {0};
unsigned available_slices =
(CBUF_ENTRIES_PER_BANK * available_input_banks) / entries_per_slice;
task.num = 0;
fill_task(subgraph, operation, &task);
task.input_banks = available_input_banks;
task.weights_banks = available_weights_banks;
task.top_slice = 0;
task.bottom_slice = available_slices - 1;
task.pad_top = pad_top;
task.pad_left = pad_left;
task.pad_right = pad_right;
util_dynarray_append(&operation->tasks, task);
/* Emit the following tasks.  `slice` tracks the bottom slice of a kernel
 * window, starting with the first window (which includes the top
 * padding). */
for (unsigned slice = operation->weights_height - pad_top - 1;
slice < operation->input_height;) {
memset(&task, 0, sizeof(task));
struct split_task *prev_task = util_dynarray_element(
&operation->tasks, struct split_task,
util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
/* Step window by window past the previous task's bottom slice, then
 * go back one step. */
while (slice <= prev_task->bottom_slice) {
slice += operation->stride;
}
if (slice > prev_task->bottom_slice) {
slice -= operation->stride;
}
task.num = util_dynarray_num_elements(&operation->tasks, struct split_task);
fill_task(subgraph, operation, &task);
/* The new task starts at the top of the window that follows the one
 * ending at `slice` (or at the previous bottom slice, whichever is
 * smaller). */
task.top_slice = MIN2(slice, prev_task->bottom_slice) -
(operation->weights_height - 1) + operation->stride;
task.bottom_slice = task.top_slice + available_slices - 1;
task.pad_left = pad_left;
task.pad_right = pad_right;
// check if current task is the last one
if (task.bottom_slice >= operation->input_height - 1) {
/* Clamp to the last input slice; only the last task gets bottom
 * padding. */
task.bottom_slice = operation->input_height - 1;
task.pad_bottom = pad_bottom;
util_dynarray_append(&operation->tasks, task);
break;
}
slice = task.top_slice + operation->weights_height - 1;
util_dynarray_append(&operation->tasks, task);
}
/* Drop the last task if it starts past the input or ends past the input
 * plus its bottom padding. */
struct split_task *last_task = util_dynarray_element(
&operation->tasks, struct split_task,
util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
if (last_task->top_slice >= operation->input_height ||
last_task->bottom_slice >= (operation->input_height + pad_bottom)) {
(void)util_dynarray_pop_ptr(&operation->tasks, struct split_task);
}
// determine overlap slices between 2 split chunks
for (int i = 1;
i < util_dynarray_num_elements(&operation->tasks, struct split_task);
i++) {
struct split_task *prev_task =
util_dynarray_element(&operation->tasks, struct split_task, i - 1);
struct split_task *cur_task =
util_dynarray_element(&operation->tasks, struct split_task, i);
if (prev_task->bottom_slice >= cur_task->top_slice) {
/* Slices shared by both tasks: the current one records them as
 * overlap, the previous one as retained. */
cur_task->num_overlap_slices =
prev_task->bottom_slice - cur_task->top_slice + 1;
prev_task->num_retain_slices = cur_task->num_overlap_slices;
} else {
cur_task->num_overlap_slices = 0;
prev_task->num_retain_slices = 0;
}
}
/* Final pass: per-task convolution count, dimensions, offsets and banks.
 * output_height_processed accumulates the output rows produced by the
 * tasks handled so far. */
unsigned output_height_processed = 0;
for (int i = 0;
i < util_dynarray_num_elements(&operation->tasks, struct split_task);
i++) {
struct split_task *cur_task =
util_dynarray_element(&operation->tasks, struct split_task, i);
/* Count the kernel windows whose bottom slice lies within the task,
 * including its bottom padding. */
unsigned slice = cur_task->top_slice + (operation->weights_height - 1) -
cur_task->pad_top;
while (slice <= cur_task->bottom_slice + cur_task->pad_bottom) {
slice += operation->stride;
cur_task->convolutions++;
}
cur_task->bottom_slice =
MIN2(cur_task->bottom_slice, operation->input_height - 1);
/* Override the full-operation sizes set by fill_task() with the sizes
 * of this task's slice range. */
cur_task->input_height = cur_task->bottom_slice - cur_task->top_slice + 1;
cur_task->output_width = (cur_task->input_width + cur_task->pad_left +
cur_task->pad_right - operation->weights_width) /
operation->stride +
1;
cur_task->output_height =
(cur_task->input_height + cur_task->pad_top + cur_task->pad_bottom -
operation->weights_height) /
operation->stride +
1;
cur_task->atomic_count = cur_task->output_width * cur_task->output_height;
/* Offsets are line strides times the first input slice and the
 * number of output rows already produced, respectively. */
cur_task->input_offset =
calc_line_stride(operation->input_width) * cur_task->top_slice;
cur_task->output_offset =
calc_line_stride(operation->output_width) * output_height_processed;
cur_task->input_banks = available_input_banks;
cur_task->weights_banks = available_weights_banks;
output_height_processed += cur_task->output_height;
}
}