mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 11:08:03 +02:00
etnaviv/ml: Fix reshuffle TP jobs on V8
What we had didn't work on V8, but with these fixes for V8, these jobs still run well on V7. Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
This commit is contained in:
parent
f186844545
commit
93298a873b
2 changed files with 153 additions and 207 deletions
|
|
@ -12,6 +12,16 @@
|
|||
|
||||
#define MAX_CONFIG_BOS 4
|
||||
|
||||
/*
|
||||
* SWAP - swap value of @a and @b
|
||||
*/
|
||||
#define SWAP(a, b) \
|
||||
do { \
|
||||
__typeof(a) __tmp = (a); \
|
||||
(a) = (b); \
|
||||
(b) = __tmp; \
|
||||
} while (0)
|
||||
|
||||
enum etna_job_type {
|
||||
ETNA_JOB_TYPE_NN,
|
||||
ETNA_JOB_TYPE_TP,
|
||||
|
|
|
|||
|
|
@ -344,49 +344,72 @@ create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_o
|
|||
return bo;
|
||||
}
|
||||
|
||||
static void
|
||||
set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used)
|
||||
static unsigned
|
||||
split_reshuffle(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims, unsigned *out_dims, unsigned *pad_x_out, unsigned *pad_y_out)
|
||||
{
|
||||
map->in_image_x_size = operation->input_width;
|
||||
unsigned remaining_out_size, remaining_in_size;
|
||||
unsigned dim_to_split = 0;
|
||||
|
||||
if (operation->padding_same && operation->input_channels > 1) {
|
||||
map->in_image_y_size = operation->input_height;
|
||||
map->in_image_z_size = operation->input_channels / tp_cores_used;
|
||||
} else if (operation->padding_same && operation->input_channels == 1) {
|
||||
switch(operation->input_width) {
|
||||
case 3:
|
||||
case 5:
|
||||
map->in_image_y_size = operation->input_height;
|
||||
break;
|
||||
case 8:
|
||||
switch(operation->weight_width) {
|
||||
case 3:
|
||||
map->in_image_y_size = operation->input_height;
|
||||
break;
|
||||
case 5:
|
||||
map->in_image_y_size = 5;
|
||||
break;
|
||||
if (out_dims[1] >= out_dims[dim_to_split])
|
||||
dim_to_split = 1;
|
||||
|
||||
if (out_dims[2] >= out_dims[dim_to_split])
|
||||
dim_to_split = 2;
|
||||
|
||||
remaining_in_size = in_dims[dim_to_split];
|
||||
remaining_out_size = out_dims[dim_to_split];
|
||||
|
||||
for (unsigned i = 0; i <= tp_core; i++) {
|
||||
unsigned size = DIV_ROUND_UP(remaining_out_size, (tp_cores_used - i));
|
||||
unsigned pad_x = 0;
|
||||
unsigned pad_y = 0;
|
||||
|
||||
if (operation->padding_same) {
|
||||
if (operation->weight_width == 5) {
|
||||
if (i == 0 || dim_to_split != 0)
|
||||
pad_x++;
|
||||
|
||||
if (i == 0 || dim_to_split != 1)
|
||||
pad_y++;
|
||||
}
|
||||
break;
|
||||
case 80:
|
||||
case 112:
|
||||
switch(operation->weight_width) {
|
||||
case 3:
|
||||
map->in_image_y_size = operation->input_height / tp_cores_used + 2;
|
||||
break;
|
||||
case 5:
|
||||
map->in_image_y_size = operation->input_height / tp_cores_used + 1;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
unreachable("Unsupported input width");
|
||||
|
||||
if (operation->input_width % 2)
|
||||
if (i == 0 || dim_to_split != 0)
|
||||
pad_x++;
|
||||
|
||||
if (operation->input_height % 2)
|
||||
if (i == 0 || dim_to_split != 1)
|
||||
pad_y++;
|
||||
}
|
||||
map->in_image_z_size = operation->input_channels;
|
||||
} else {
|
||||
map->in_image_y_size = operation->input_height / tp_cores_used;
|
||||
map->in_image_z_size = operation->input_channels;
|
||||
|
||||
if (i < tp_cores_used - 1) {
|
||||
in_dims[dim_to_split] = size;
|
||||
|
||||
if (dim_to_split != 2)
|
||||
in_dims[dim_to_split] *= operation->stride;
|
||||
|
||||
if (dim_to_split == 0)
|
||||
in_dims[dim_to_split] -= pad_x;
|
||||
else if (dim_to_split == 1)
|
||||
in_dims[dim_to_split] -= pad_y;
|
||||
|
||||
remaining_in_size -= in_dims[dim_to_split];
|
||||
} else
|
||||
in_dims[dim_to_split] = remaining_in_size;
|
||||
|
||||
if (i == tp_core) {
|
||||
if (pad_x_out)
|
||||
*pad_x_out = pad_x;
|
||||
if (pad_y_out)
|
||||
*pad_y_out = pad_y;
|
||||
}
|
||||
|
||||
out_dims[dim_to_split] = size;
|
||||
|
||||
remaining_out_size -= size;
|
||||
}
|
||||
|
||||
return dim_to_split;
|
||||
}
|
||||
|
||||
static struct etna_bo *
|
||||
|
|
@ -394,10 +417,15 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
|
|||
unsigned tp_core, unsigned tp_cores_used)
|
||||
{
|
||||
struct etna_context *ctx = etna_context(subgraph->base.context);
|
||||
unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
|
||||
struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
|
||||
sizeof(struct etna_tp_params),
|
||||
DRM_ETNA_GEM_CACHE_WC);
|
||||
sizeof(struct etna_tp_params),
|
||||
DRM_ETNA_GEM_CACHE_WC);
|
||||
unsigned input_width = operation->input_width;
|
||||
unsigned input_height = operation->input_height;
|
||||
unsigned output_width = operation->output_width;
|
||||
unsigned output_height = operation->output_height;
|
||||
unsigned in_dims[3];
|
||||
unsigned out_dims[3];
|
||||
|
||||
etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
|
||||
|
||||
|
|
@ -405,178 +433,101 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
|
|||
|
||||
set_default_tp_config(map);
|
||||
|
||||
set_input_size(operation, map, tp_cores_used);
|
||||
|
||||
map->in_image_stride = operation->input_width;
|
||||
map->in_image_slice = operation->input_width * operation->input_height;
|
||||
|
||||
if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) {
|
||||
if (operation->weight_width == 5 && operation->input_width < 8) {
|
||||
map->in_window_x_start = 0xfffe;
|
||||
map->in_window_y_start = 0xfffe;
|
||||
} else {
|
||||
map->in_window_x_start = 0xffff;
|
||||
map->in_window_y_start = 0xffff;
|
||||
}
|
||||
} else {
|
||||
map->in_window_x_start = 0x0;
|
||||
map->in_window_y_start = 0x0;
|
||||
if (input_height > input_width) {
|
||||
SWAP(input_width, input_height);
|
||||
SWAP(output_width, output_height);
|
||||
}
|
||||
|
||||
map->in_window_x_end = operation->input_width - 1;
|
||||
map->in_window_y_end = (operation->input_height / tp_cores_used) - 1;
|
||||
map->in_tile_x_size = operation->input_width;
|
||||
map->in_tile_x_inc = operation->input_width;
|
||||
in_dims[0] = input_width;
|
||||
in_dims[1] = input_height;
|
||||
in_dims[2] = operation->input_channels;
|
||||
|
||||
if (operation->input_width <= 8 && operation->input_channels == 1) {
|
||||
map->in_tile_y_size = operation->input_height;
|
||||
map->in_tile_y_inc = operation->input_height;
|
||||
} else {
|
||||
map->in_tile_y_size = operation->input_height / tp_cores_used;
|
||||
map->in_tile_y_inc = operation->input_height / tp_cores_used;
|
||||
}
|
||||
out_dims[0] = output_width;
|
||||
out_dims[1] = output_height;
|
||||
out_dims[2] = operation->input_channels;
|
||||
|
||||
if (operation->padding_same) {
|
||||
switch(operation->weight_width) {
|
||||
case 3:
|
||||
map->in_window_x_end += 2;
|
||||
if (operation->input_width < 8) {
|
||||
map->in_tile_x_size += 3;
|
||||
map->in_tile_y_size += 1;
|
||||
map->in_tile_y_inc += 1;
|
||||
} else {
|
||||
map->in_tile_x_size += 2;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
map->in_window_x_end += 3;
|
||||
if (operation->input_width < 8) {
|
||||
map->in_tile_x_size += 5;
|
||||
} else {
|
||||
map->in_tile_x_size += 4;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
unreachable("Unsupported weight size");
|
||||
}
|
||||
unsigned pad_x = 0;
|
||||
unsigned pad_y = 0;
|
||||
unsigned split_dim = split_reshuffle(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims, &pad_x, &pad_y);
|
||||
|
||||
if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5)
|
||||
map->in_tile_x_size = operation->input_width / tp_cores_used + 2;
|
||||
map->in_image_x_size = in_dims[0];
|
||||
map->in_image_y_size = in_dims[1];
|
||||
map->in_image_z_size = in_dims[2];
|
||||
|
||||
if (operation->input_width > 8 && operation->input_channels == 1) {
|
||||
switch(operation->weight_width) {
|
||||
case 3:
|
||||
map->in_window_y_end = (operation->input_height / tp_cores_used) + 1;
|
||||
break;
|
||||
case 5:
|
||||
map->in_window_y_end = (operation->input_height / tp_cores_used);
|
||||
break;
|
||||
default:
|
||||
unreachable("Unsupported weight size");
|
||||
}
|
||||
} else
|
||||
map->in_window_y_end = map->in_window_x_end;
|
||||
ML_DBG("map->in_image_z_size %d in_dims[2] %d split_dim %d\n", map->in_image_z_size, in_dims[2], split_dim);
|
||||
|
||||
map->in_tile_x_inc = map->in_tile_x_size;
|
||||
map->in_image_stride = operation->input_height;
|
||||
map->in_image_slice = input_width * input_height;
|
||||
|
||||
if (operation->input_channels > 1) {
|
||||
map->in_tile_y_size = map->in_tile_x_size;
|
||||
map->in_tile_y_inc = map->in_tile_x_size;
|
||||
} else {
|
||||
map->in_tile_y_size += 2;
|
||||
map->in_tile_y_inc += 2;
|
||||
}
|
||||
} else {
|
||||
if (operation->input_width < 8) {
|
||||
map->in_window_x_end += 1;
|
||||
map->in_window_y_end += 1;
|
||||
map->in_tile_x_size += 1;
|
||||
map->in_tile_y_size += 1;
|
||||
map->in_tile_x_inc += 1;
|
||||
map->in_tile_y_inc += 1;
|
||||
}
|
||||
}
|
||||
map->in_window_x_start = 0x0 - pad_x;
|
||||
map->in_window_y_start = 0x0 - pad_y;
|
||||
|
||||
unsigned out_loop_0_count = 0x2;
|
||||
map->in_window_x_end = out_dims[0] * out_loop_0_count - 1 - pad_x;
|
||||
map->in_window_y_end = out_dims[1] * 2 - 1 - pad_y;
|
||||
map->in_tile_x_size = out_dims[0] * out_loop_0_count;
|
||||
map->in_tile_x_inc = map->in_tile_x_size;
|
||||
map->in_tile_y_size = out_dims[1] * 2;
|
||||
map->in_tile_y_inc = out_dims[1] * 2;
|
||||
|
||||
struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
|
||||
map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
|
||||
|
||||
if (operation->padding_same)
|
||||
map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
|
||||
else
|
||||
map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core;
|
||||
unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor);
|
||||
map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
|
||||
|
||||
struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
|
||||
map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);
|
||||
offset = etna_ml_get_offset(subgraph, operation->output_tensor);
|
||||
map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
|
||||
|
||||
if (operation->padding_same)
|
||||
map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
|
||||
else
|
||||
map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core;
|
||||
for (unsigned i = 0; i < tp_core; i++) {
|
||||
unsigned in_dims[3];
|
||||
unsigned out_dims[3];
|
||||
unsigned in_offset = 0;
|
||||
unsigned out_offset = 0;
|
||||
|
||||
in_dims[0] = input_width;
|
||||
in_dims[1] = input_height;
|
||||
in_dims[2] = operation->input_channels;
|
||||
|
||||
out_dims[0] = output_width;
|
||||
out_dims[1] = output_height;
|
||||
out_dims[2] = operation->input_channels;
|
||||
|
||||
unsigned split_dim = split_reshuffle(subgraph, operation, i, tp_cores_used, in_dims, out_dims, NULL, NULL);
|
||||
|
||||
switch(split_dim) {
|
||||
case 0:
|
||||
in_offset = in_dims[0];
|
||||
out_offset = out_dims[0];
|
||||
break;
|
||||
case 1:
|
||||
in_offset = map->in_image_stride * in_dims[1];
|
||||
out_offset = output_height * out_dims[1];
|
||||
break;
|
||||
case 2:
|
||||
in_offset = map->in_image_slice * in_dims[2];
|
||||
out_offset = out_dims[2] * map->in_tile_x_size * map->in_tile_y_size;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
map->in_image_base_address += in_offset;
|
||||
map->out_image_base_address += out_offset;
|
||||
}
|
||||
|
||||
map->out_loop_1_reset = 0x1;
|
||||
map->out_loop_2_reset = 0x0;
|
||||
map->out_loop_3_reset = 0x1;
|
||||
map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2);
|
||||
map->out_loop_0_inc = output_width * output_height;
|
||||
map->out_loop_1_inc = 0x1;
|
||||
map->out_loop_0_count = 0x2;
|
||||
map->out_loop_1_count = round(operation->input_width / 2.0);
|
||||
map->out_loop_2_count = 0x2;
|
||||
map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used);
|
||||
|
||||
if (operation->padding_same) {
|
||||
switch(operation->weight_width) {
|
||||
case 3:
|
||||
map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2);
|
||||
map->out_loop_1_count += 1;
|
||||
break;
|
||||
case 5:
|
||||
map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2);
|
||||
map->out_loop_1_count += 2;
|
||||
break;
|
||||
default:
|
||||
unreachable("Unsupported weight size");
|
||||
}
|
||||
|
||||
if (operation->input_channels == 1)
|
||||
map->out_loop_3_count += 1;
|
||||
else
|
||||
map->out_loop_3_count = map->out_loop_1_count;
|
||||
}
|
||||
|
||||
map->out_loop_0_count = out_loop_0_count;
|
||||
map->out_loop_1_count = out_dims[0];
|
||||
map->out_loop_2_count = out_loop_0_count;
|
||||
map->out_loop_3_count = out_dims[1];
|
||||
map->out_loop_2_inc = map->out_loop_0_inc * 2;
|
||||
map->out_loop_3_inc = map->out_loop_1_count;
|
||||
map->out_loop_3_inc = output_width;
|
||||
map->out_loop_6_inc = map->out_loop_0_inc * 4;
|
||||
|
||||
if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) {
|
||||
if (tp_core > 0) {
|
||||
map->in_image_y_size -= 2;
|
||||
map->in_window_y_end -= 2;
|
||||
map->in_tile_y_size -= 2;
|
||||
map->in_tile_y_inc -= 2;
|
||||
map->out_loop_3_count -= 1;
|
||||
}
|
||||
|
||||
if (tp_core == tp_core_count - 1) {
|
||||
map->in_image_y_size -= 2;
|
||||
}
|
||||
|
||||
if (tp_core > 0) {
|
||||
map->in_image_base_address += operation->input_width * 2;
|
||||
map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned alu_size = operation->input_width;
|
||||
if (operation->padding_same) {
|
||||
alu_size += 1;
|
||||
if (operation->weight_width == 5)
|
||||
alu_size += 1;
|
||||
if (operation->input_width == 5)
|
||||
alu_size += 1;
|
||||
}
|
||||
|
||||
map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size);
|
||||
|
||||
map->in_zp = operation->input_zero_point;
|
||||
map->out_zp = operation->input_zero_point;
|
||||
|
||||
|
|
@ -588,21 +539,6 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
|
|||
map->out_image_circular_buf_size = 0x0;
|
||||
map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
|
||||
|
||||
if (map->in_image_y_size < 2) {
|
||||
map->in_image_y_size = operation->input_width;
|
||||
map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used;
|
||||
map->in_window_y_end = operation->input_width;
|
||||
map->in_tile_y_size = operation->input_width + 1;
|
||||
map->in_tile_y_inc = operation->input_width + 1;
|
||||
map->out_loop_3_count += 1;
|
||||
|
||||
map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
|
||||
map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
|
||||
|
||||
map->out_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
|
||||
map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
|
||||
}
|
||||
|
||||
etna_bo_cpu_fini(bo);
|
||||
|
||||
return bo;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue