etnaviv/ml: Fix reshuffle TP jobs on V8

What we had didn't work on V8, but with these fixes for V8, these jobs
still run well on V7.

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
This commit is contained in:
Tomeu Vizoso 2024-10-23 10:41:30 +02:00 committed by Marge Bot
parent f186844545
commit 93298a873b
2 changed files with 153 additions and 207 deletions

View file

@ -12,6 +12,16 @@
#define MAX_CONFIG_BOS 4
/*
* SWAP - swap value of @a and @b
*/
#define SWAP(a, b) \
do { \
__typeof(a) __tmp = (a); \
(a) = (b); \
(b) = __tmp; \
} while (0)
enum etna_job_type {
ETNA_JOB_TYPE_NN,
ETNA_JOB_TYPE_TP,

View file

@ -344,49 +344,72 @@ create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_o
return bo;
}
static void
set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used)
static unsigned
split_reshuffle(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims, unsigned *out_dims, unsigned *pad_x_out, unsigned *pad_y_out)
{
map->in_image_x_size = operation->input_width;
unsigned remaining_out_size, remaining_in_size;
unsigned dim_to_split = 0;
if (operation->padding_same && operation->input_channels > 1) {
map->in_image_y_size = operation->input_height;
map->in_image_z_size = operation->input_channels / tp_cores_used;
} else if (operation->padding_same && operation->input_channels == 1) {
switch(operation->input_width) {
case 3:
case 5:
map->in_image_y_size = operation->input_height;
break;
case 8:
switch(operation->weight_width) {
case 3:
map->in_image_y_size = operation->input_height;
break;
case 5:
map->in_image_y_size = 5;
break;
if (out_dims[1] >= out_dims[dim_to_split])
dim_to_split = 1;
if (out_dims[2] >= out_dims[dim_to_split])
dim_to_split = 2;
remaining_in_size = in_dims[dim_to_split];
remaining_out_size = out_dims[dim_to_split];
for (unsigned i = 0; i <= tp_core; i++) {
unsigned size = DIV_ROUND_UP(remaining_out_size, (tp_cores_used - i));
unsigned pad_x = 0;
unsigned pad_y = 0;
if (operation->padding_same) {
if (operation->weight_width == 5) {
if (i == 0 || dim_to_split != 0)
pad_x++;
if (i == 0 || dim_to_split != 1)
pad_y++;
}
break;
case 80:
case 112:
switch(operation->weight_width) {
case 3:
map->in_image_y_size = operation->input_height / tp_cores_used + 2;
break;
case 5:
map->in_image_y_size = operation->input_height / tp_cores_used + 1;
break;
}
break;
default:
unreachable("Unsupported input width");
if (operation->input_width % 2)
if (i == 0 || dim_to_split != 0)
pad_x++;
if (operation->input_height % 2)
if (i == 0 || dim_to_split != 1)
pad_y++;
}
map->in_image_z_size = operation->input_channels;
} else {
map->in_image_y_size = operation->input_height / tp_cores_used;
map->in_image_z_size = operation->input_channels;
if (i < tp_cores_used - 1) {
in_dims[dim_to_split] = size;
if (dim_to_split != 2)
in_dims[dim_to_split] *= operation->stride;
if (dim_to_split == 0)
in_dims[dim_to_split] -= pad_x;
else if (dim_to_split == 1)
in_dims[dim_to_split] -= pad_y;
remaining_in_size -= in_dims[dim_to_split];
} else
in_dims[dim_to_split] = remaining_in_size;
if (i == tp_core) {
if (pad_x_out)
*pad_x_out = pad_x;
if (pad_y_out)
*pad_y_out = pad_y;
}
out_dims[dim_to_split] = size;
remaining_out_size -= size;
}
return dim_to_split;
}
static struct etna_bo *
@ -394,10 +417,15 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
unsigned tp_core, unsigned tp_cores_used)
{
struct etna_context *ctx = etna_context(subgraph->base.context);
unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
sizeof(struct etna_tp_params),
DRM_ETNA_GEM_CACHE_WC);
sizeof(struct etna_tp_params),
DRM_ETNA_GEM_CACHE_WC);
unsigned input_width = operation->input_width;
unsigned input_height = operation->input_height;
unsigned output_width = operation->output_width;
unsigned output_height = operation->output_height;
unsigned in_dims[3];
unsigned out_dims[3];
etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
@ -405,178 +433,101 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
set_default_tp_config(map);
set_input_size(operation, map, tp_cores_used);
map->in_image_stride = operation->input_width;
map->in_image_slice = operation->input_width * operation->input_height;
if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) {
if (operation->weight_width == 5 && operation->input_width < 8) {
map->in_window_x_start = 0xfffe;
map->in_window_y_start = 0xfffe;
} else {
map->in_window_x_start = 0xffff;
map->in_window_y_start = 0xffff;
}
} else {
map->in_window_x_start = 0x0;
map->in_window_y_start = 0x0;
if (input_height > input_width) {
SWAP(input_width, input_height);
SWAP(output_width, output_height);
}
map->in_window_x_end = operation->input_width - 1;
map->in_window_y_end = (operation->input_height / tp_cores_used) - 1;
map->in_tile_x_size = operation->input_width;
map->in_tile_x_inc = operation->input_width;
in_dims[0] = input_width;
in_dims[1] = input_height;
in_dims[2] = operation->input_channels;
if (operation->input_width <= 8 && operation->input_channels == 1) {
map->in_tile_y_size = operation->input_height;
map->in_tile_y_inc = operation->input_height;
} else {
map->in_tile_y_size = operation->input_height / tp_cores_used;
map->in_tile_y_inc = operation->input_height / tp_cores_used;
}
out_dims[0] = output_width;
out_dims[1] = output_height;
out_dims[2] = operation->input_channels;
if (operation->padding_same) {
switch(operation->weight_width) {
case 3:
map->in_window_x_end += 2;
if (operation->input_width < 8) {
map->in_tile_x_size += 3;
map->in_tile_y_size += 1;
map->in_tile_y_inc += 1;
} else {
map->in_tile_x_size += 2;
}
break;
case 5:
map->in_window_x_end += 3;
if (operation->input_width < 8) {
map->in_tile_x_size += 5;
} else {
map->in_tile_x_size += 4;
}
break;
default:
unreachable("Unsupported weight size");
}
unsigned pad_x = 0;
unsigned pad_y = 0;
unsigned split_dim = split_reshuffle(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims, &pad_x, &pad_y);
if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5)
map->in_tile_x_size = operation->input_width / tp_cores_used + 2;
map->in_image_x_size = in_dims[0];
map->in_image_y_size = in_dims[1];
map->in_image_z_size = in_dims[2];
if (operation->input_width > 8 && operation->input_channels == 1) {
switch(operation->weight_width) {
case 3:
map->in_window_y_end = (operation->input_height / tp_cores_used) + 1;
break;
case 5:
map->in_window_y_end = (operation->input_height / tp_cores_used);
break;
default:
unreachable("Unsupported weight size");
}
} else
map->in_window_y_end = map->in_window_x_end;
ML_DBG("map->in_image_z_size %d in_dims[2] %d split_dim %d\n", map->in_image_z_size, in_dims[2], split_dim);
map->in_tile_x_inc = map->in_tile_x_size;
map->in_image_stride = operation->input_height;
map->in_image_slice = input_width * input_height;
if (operation->input_channels > 1) {
map->in_tile_y_size = map->in_tile_x_size;
map->in_tile_y_inc = map->in_tile_x_size;
} else {
map->in_tile_y_size += 2;
map->in_tile_y_inc += 2;
}
} else {
if (operation->input_width < 8) {
map->in_window_x_end += 1;
map->in_window_y_end += 1;
map->in_tile_x_size += 1;
map->in_tile_y_size += 1;
map->in_tile_x_inc += 1;
map->in_tile_y_inc += 1;
}
}
map->in_window_x_start = 0x0 - pad_x;
map->in_window_y_start = 0x0 - pad_y;
unsigned out_loop_0_count = 0x2;
map->in_window_x_end = out_dims[0] * out_loop_0_count - 1 - pad_x;
map->in_window_y_end = out_dims[1] * 2 - 1 - pad_y;
map->in_tile_x_size = out_dims[0] * out_loop_0_count;
map->in_tile_x_inc = map->in_tile_x_size;
map->in_tile_y_size = out_dims[1] * 2;
map->in_tile_y_inc = out_dims[1] * 2;
struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
if (operation->padding_same)
map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
else
map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core;
unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor);
map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);
offset = etna_ml_get_offset(subgraph, operation->output_tensor);
map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
if (operation->padding_same)
map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
else
map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core;
for (unsigned i = 0; i < tp_core; i++) {
unsigned in_dims[3];
unsigned out_dims[3];
unsigned in_offset = 0;
unsigned out_offset = 0;
in_dims[0] = input_width;
in_dims[1] = input_height;
in_dims[2] = operation->input_channels;
out_dims[0] = output_width;
out_dims[1] = output_height;
out_dims[2] = operation->input_channels;
unsigned split_dim = split_reshuffle(subgraph, operation, i, tp_cores_used, in_dims, out_dims, NULL, NULL);
switch(split_dim) {
case 0:
in_offset = in_dims[0];
out_offset = out_dims[0];
break;
case 1:
in_offset = map->in_image_stride * in_dims[1];
out_offset = output_height * out_dims[1];
break;
case 2:
in_offset = map->in_image_slice * in_dims[2];
out_offset = out_dims[2] * map->in_tile_x_size * map->in_tile_y_size;
break;
default:
break;
}
map->in_image_base_address += in_offset;
map->out_image_base_address += out_offset;
}
map->out_loop_1_reset = 0x1;
map->out_loop_2_reset = 0x0;
map->out_loop_3_reset = 0x1;
map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2);
map->out_loop_0_inc = output_width * output_height;
map->out_loop_1_inc = 0x1;
map->out_loop_0_count = 0x2;
map->out_loop_1_count = round(operation->input_width / 2.0);
map->out_loop_2_count = 0x2;
map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used);
if (operation->padding_same) {
switch(operation->weight_width) {
case 3:
map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2);
map->out_loop_1_count += 1;
break;
case 5:
map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2);
map->out_loop_1_count += 2;
break;
default:
unreachable("Unsupported weight size");
}
if (operation->input_channels == 1)
map->out_loop_3_count += 1;
else
map->out_loop_3_count = map->out_loop_1_count;
}
map->out_loop_0_count = out_loop_0_count;
map->out_loop_1_count = out_dims[0];
map->out_loop_2_count = out_loop_0_count;
map->out_loop_3_count = out_dims[1];
map->out_loop_2_inc = map->out_loop_0_inc * 2;
map->out_loop_3_inc = map->out_loop_1_count;
map->out_loop_3_inc = output_width;
map->out_loop_6_inc = map->out_loop_0_inc * 4;
if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) {
if (tp_core > 0) {
map->in_image_y_size -= 2;
map->in_window_y_end -= 2;
map->in_tile_y_size -= 2;
map->in_tile_y_inc -= 2;
map->out_loop_3_count -= 1;
}
if (tp_core == tp_core_count - 1) {
map->in_image_y_size -= 2;
}
if (tp_core > 0) {
map->in_image_base_address += operation->input_width * 2;
map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1);
}
}
unsigned alu_size = operation->input_width;
if (operation->padding_same) {
alu_size += 1;
if (operation->weight_width == 5)
alu_size += 1;
if (operation->input_width == 5)
alu_size += 1;
}
map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size);
map->in_zp = operation->input_zero_point;
map->out_zp = operation->input_zero_point;
@ -588,21 +539,6 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
map->out_image_circular_buf_size = 0x0;
map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
if (map->in_image_y_size < 2) {
map->in_image_y_size = operation->input_width;
map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used;
map->in_window_y_end = operation->input_width;
map->in_tile_y_size = operation->input_width + 1;
map->in_tile_y_inc = operation->input_width + 1;
map->out_loop_3_count += 1;
map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
map->out_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
}
etna_bo_cpu_fini(bo);
return bo;