diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.h b/src/gallium/drivers/etnaviv/etnaviv_ml.h index 18a147e1d54..477d7633893 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml.h +++ b/src/gallium/drivers/etnaviv/etnaviv_ml.h @@ -12,6 +12,16 @@ #define MAX_CONFIG_BOS 4 +/* + * SWAP - swap value of @a and @b + */ +#define SWAP(a, b) \ + do { \ + __typeof(a) __tmp = (a); \ + (a) = (b); \ + (b) = __tmp; \ + } while (0) + enum etna_job_type { ETNA_JOB_TYPE_NN, ETNA_JOB_TYPE_TP, diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c index b68dc554919..bb324e412c7 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c @@ -344,49 +344,72 @@ create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_o return bo; } -static void -set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used) +static unsigned +split_reshuffle(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims, unsigned *out_dims, unsigned *pad_x_out, unsigned *pad_y_out) { - map->in_image_x_size = operation->input_width; + unsigned remaining_out_size, remaining_in_size; + unsigned dim_to_split = 0; - if (operation->padding_same && operation->input_channels > 1) { - map->in_image_y_size = operation->input_height; - map->in_image_z_size = operation->input_channels / tp_cores_used; - } else if (operation->padding_same && operation->input_channels == 1) { - switch(operation->input_width) { - case 3: - case 5: - map->in_image_y_size = operation->input_height; - break; - case 8: - switch(operation->weight_width) { - case 3: - map->in_image_y_size = operation->input_height; - break; - case 5: - map->in_image_y_size = 5; - break; + if (out_dims[1] >= out_dims[dim_to_split]) + dim_to_split = 1; + + if (out_dims[2] >= out_dims[dim_to_split]) + dim_to_split = 2; + + remaining_in_size = in_dims[dim_to_split]; + remaining_out_size = out_dims[dim_to_split]; + + for (unsigned i = 0; i <= tp_core; i++) { + unsigned size = DIV_ROUND_UP(remaining_out_size, (tp_cores_used - i)); + unsigned pad_x = 0; + unsigned pad_y = 0; + + if (operation->padding_same) { + if (operation->weight_width == 5) { + if (i == 0 || dim_to_split != 0) + pad_x++; + + if (i == 0 || dim_to_split != 1) + pad_y++; } - break; - case 80: - case 112: - switch(operation->weight_width) { - case 3: - map->in_image_y_size = operation->input_height / tp_cores_used + 2; - break; - case 5: - map->in_image_y_size = operation->input_height / tp_cores_used + 1; - break; - } - break; - default: - unreachable("Unsupported input width"); + + if (operation->input_width % 2) + if (i == 0 || dim_to_split != 0) + pad_x++; + + if (operation->input_height % 2) + if (i == 0 || dim_to_split != 1) + pad_y++; } - map->in_image_z_size = operation->input_channels; - } else { - map->in_image_y_size = operation->input_height / tp_cores_used; - map->in_image_z_size = operation->input_channels; + + if (i < tp_cores_used - 1) { + in_dims[dim_to_split] = size; + + if (dim_to_split != 2) + in_dims[dim_to_split] *= operation->stride; + + if (dim_to_split == 0) + in_dims[dim_to_split] -= pad_x; + else if (dim_to_split == 1) + in_dims[dim_to_split] -= pad_y; + + remaining_in_size -= in_dims[dim_to_split]; + } else + in_dims[dim_to_split] = remaining_in_size; + + if (i == tp_core) { + if (pad_x_out) + *pad_x_out = pad_x; + if (pad_y_out) + *pad_y_out = pad_y; + } + + out_dims[dim_to_split] = size; + + remaining_out_size -= size; } + + return dim_to_split; } static struct etna_bo * @@ -394,10 +417,15 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope unsigned tp_core, unsigned tp_cores_used) { struct etna_context *ctx = etna_context(subgraph->base.context); - unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count; struct etna_bo *bo = etna_bo_new(ctx->screen->dev, - sizeof(struct etna_tp_params), - DRM_ETNA_GEM_CACHE_WC); + sizeof(struct etna_tp_params), + DRM_ETNA_GEM_CACHE_WC); + unsigned input_width = operation->input_width; + unsigned input_height = operation->input_height; + unsigned output_width = operation->output_width; + unsigned output_height = operation->output_height; + unsigned in_dims[3]; + unsigned out_dims[3]; etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE); @@ -405,178 +433,101 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope set_default_tp_config(map); - set_input_size(operation, map, tp_cores_used); - - map->in_image_stride = operation->input_width; - map->in_image_slice = operation->input_width * operation->input_height; - - if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) { - if (operation->weight_width == 5 && operation->input_width < 8) { - map->in_window_x_start = 0xfffe; - map->in_window_y_start = 0xfffe; - } else { - map->in_window_x_start = 0xffff; - map->in_window_y_start = 0xffff; - } - } else { - map->in_window_x_start = 0x0; - map->in_window_y_start = 0x0; + if (input_height > input_width) { + SWAP(input_width, input_height); + SWAP(output_width, output_height); } - map->in_window_x_end = operation->input_width - 1; - map->in_window_y_end = (operation->input_height / tp_cores_used) - 1; - map->in_tile_x_size = operation->input_width; - map->in_tile_x_inc = operation->input_width; + in_dims[0] = input_width; + in_dims[1] = input_height; + in_dims[2] = operation->input_channels; - if (operation->input_width <= 8 && operation->input_channels == 1) { - map->in_tile_y_size = operation->input_height; - map->in_tile_y_inc = operation->input_height; - } else { - map->in_tile_y_size = operation->input_height / tp_cores_used; - map->in_tile_y_inc = operation->input_height / tp_cores_used; - } + out_dims[0] = output_width; + out_dims[1] = output_height; + out_dims[2] = operation->input_channels; - if (operation->padding_same) { - switch(operation->weight_width) { - case 3: - map->in_window_x_end += 2; - if (operation->input_width < 8) { - map->in_tile_x_size += 3; - map->in_tile_y_size += 1; - map->in_tile_y_inc += 1; - } else { - map->in_tile_x_size += 2; - } - break; - case 5: - map->in_window_x_end += 3; - if (operation->input_width < 8) { - map->in_tile_x_size += 5; - } else { - map->in_tile_x_size += 4; - } - break; - default: - unreachable("Unsupported weight size"); - } + unsigned pad_x = 0; + unsigned pad_y = 0; + unsigned split_dim = split_reshuffle(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims, &pad_x, &pad_y); - if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5) - map->in_tile_x_size = operation->input_width / tp_cores_used + 2; + map->in_image_x_size = in_dims[0]; + map->in_image_y_size = in_dims[1]; + map->in_image_z_size = in_dims[2]; - if (operation->input_width > 8 && operation->input_channels == 1) { - switch(operation->weight_width) { - case 3: - map->in_window_y_end = (operation->input_height / tp_cores_used) + 1; - break; - case 5: - map->in_window_y_end = (operation->input_height / tp_cores_used); - break; - default: - unreachable("Unsupported weight size"); - } - } else - map->in_window_y_end = map->in_window_x_end; + ML_DBG("map->in_image_z_size %d in_dims[2] %d split_dim %d\n", map->in_image_z_size, in_dims[2], split_dim); - map->in_tile_x_inc = map->in_tile_x_size; + map->in_image_stride = operation->input_height; + map->in_image_slice = input_width * input_height; - if (operation->input_channels > 1) { - map->in_tile_y_size = map->in_tile_x_size; - map->in_tile_y_inc = map->in_tile_x_size; - } else { - map->in_tile_y_size += 2; - map->in_tile_y_inc += 2; - } - } else { - if (operation->input_width < 8) { - map->in_window_x_end += 1; - map->in_window_y_end += 1; - map->in_tile_x_size += 1; - map->in_tile_y_size += 1; - map->in_tile_x_inc += 1; - map->in_tile_y_inc += 1; - } - } + map->in_window_x_start = 0x0 - pad_x; + map->in_window_y_start = 0x0 - pad_y; + + unsigned out_loop_0_count = 0x2; + map->in_window_x_end = out_dims[0] * out_loop_0_count - 1 - pad_x; + map->in_window_y_end = out_dims[1] * 2 - 1 - pad_y; + map->in_tile_x_size = out_dims[0] * out_loop_0_count; + map->in_tile_x_inc = map->in_tile_x_size; + map->in_tile_y_size = out_dims[1] * 2; + map->in_tile_y_inc = out_dims[1] * 2; struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor); - map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo); - - if (operation->padding_same) - map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core; - else - map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core; + unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor); + map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset; struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor); - map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo); + offset = etna_ml_get_offset(subgraph, operation->output_tensor); + map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset; - if (operation->padding_same) - map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core; - else - map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core; + for (unsigned i = 0; i < tp_core; i++) { + unsigned in_dims[3]; + unsigned out_dims[3]; + unsigned in_offset = 0; + unsigned out_offset = 0; + + in_dims[0] = input_width; + in_dims[1] = input_height; + in_dims[2] = operation->input_channels; + + out_dims[0] = output_width; + out_dims[1] = output_height; + out_dims[2] = operation->input_channels; + + unsigned split_dim = split_reshuffle(subgraph, operation, i, tp_cores_used, in_dims, out_dims, NULL, NULL); + + switch(split_dim) { + case 0: + in_offset = in_dims[0]; + out_offset = out_dims[0]; + break; + case 1: + in_offset = map->in_image_stride * in_dims[1]; + out_offset = output_height * out_dims[1]; + break; + case 2: + in_offset = map->in_image_slice * in_dims[2]; + out_offset = out_dims[2] * map->in_tile_x_size * map->in_tile_y_size; + break; + default: + break; + } + + map->in_image_base_address += in_offset; + map->out_image_base_address += out_offset; + } map->out_loop_1_reset = 0x1; map->out_loop_2_reset = 0x0; map->out_loop_3_reset = 0x1; - map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2); + map->out_loop_0_inc = output_width * output_height; map->out_loop_1_inc = 0x1; - map->out_loop_0_count = 0x2; - map->out_loop_1_count = round(operation->input_width / 2.0); - map->out_loop_2_count = 0x2; - map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used); - - if (operation->padding_same) { - switch(operation->weight_width) { - case 3: - map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2); - map->out_loop_1_count += 1; - break; - case 5: - map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2); - map->out_loop_1_count += 2; - break; - default: - unreachable("Unsupported weight size"); - } - - if (operation->input_channels == 1) - map->out_loop_3_count += 1; - else - map->out_loop_3_count = map->out_loop_1_count; - } - + map->out_loop_0_count = out_loop_0_count; + map->out_loop_1_count = out_dims[0]; + map->out_loop_2_count = out_loop_0_count; + map->out_loop_3_count = out_dims[1]; map->out_loop_2_inc = map->out_loop_0_inc * 2; - map->out_loop_3_inc = map->out_loop_1_count; + map->out_loop_3_inc = output_width; map->out_loop_6_inc = map->out_loop_0_inc * 4; - if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) { - if (tp_core > 0) { - map->in_image_y_size -= 2; - map->in_window_y_end -= 2; - map->in_tile_y_size -= 2; - map->in_tile_y_inc -= 2; - map->out_loop_3_count -= 1; - } - - if (tp_core == tp_core_count - 1) { - map->in_image_y_size -= 2; - } - - if (tp_core > 0) { - map->in_image_base_address += operation->input_width * 2; - map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1); - } - } - - unsigned alu_size = operation->input_width; - if (operation->padding_same) { - alu_size += 1; - if (operation->weight_width == 5) - alu_size += 1; - if (operation->input_width == 5) - alu_size += 1; - } - - map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size); - map->in_zp = operation->input_zero_point; map->out_zp = operation->input_zero_point; @@ -588,21 +539,6 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope map->out_image_circular_buf_size = 0x0; map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6; - if (map->in_image_y_size < 2) { - map->in_image_y_size = operation->input_width; - map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used; - map->in_window_y_end = operation->input_width; - map->in_tile_y_size = operation->input_width + 1; - map->in_tile_y_inc = operation->input_width + 1; - map->out_loop_3_count += 1; - - map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo); - map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core; - - map->out_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo); - map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core; - } - etna_bo_cpu_fini(bo); return bo;