diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c index f0a693c66f1..239f64b0069 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c @@ -513,51 +513,6 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph, operation->weight_scale); } -#define MAX_TILE_WIDTH 64 - -static unsigned -calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode) -{ - unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count; - unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth; - unsigned output_channels = operation->addition ? 1 : operation->output_channels; - unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count); - unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y; - - if (operation->weight_width == 1) - foo = MIN2(foo, nn_accum_buffer_depth / 3); - - foo = MIN2(foo, kernels_per_core); - foo = MIN2(foo, 127); - - kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo); - unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count); - unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels); - - return superblocks; -} - -static unsigned -calc_interleave_mode(unsigned tile_width, unsigned weight_height) -{ - unsigned mode = 8; - - if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2) - return 1; - - if (tile_width > MAX_TILE_WIDTH / 2) - mode = 1; - else if (tile_width > MAX_TILE_WIDTH / 4) - mode = 2; - else if (tile_width > MAX_TILE_WIDTH / 8) - mode = 4; - - if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) - return MIN2(mode, 4); - - return MIN2(mode, 2); -} - void etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels, unsigned *output_width, unsigned *output_height, unsigned *output_channels) @@ -590,51 +545,14 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig *output_channels = 1; } -unsigned +static unsigned etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out) { - unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth; - unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth; - unsigned input_width = operation->input_width; - unsigned input_height = operation->input_height; - unsigned input_channels = operation->input_channels; - unsigned output_width = operation->output_width; - unsigned output_height = operation->output_height; - unsigned output_channels = operation->output_channels; - unsigned tile_width; - unsigned tile_height; - unsigned superblocks; - unsigned interleave_mode; - - if (operation->addition) - etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels, - &output_width, &output_height, &output_channels); - - if (operation->pooling_first_pixel) { - output_width *= 2; - output_height *= 2; - } - - tile_width = MIN2(output_width, 64); - interleave_mode = calc_interleave_mode(tile_width, operation->weight_height); - - tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1; - tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth); - tile_height = MIN2(tile_height, output_height); - - if (operation->stride > 1 && tile_height % 2 > 0) - tile_height -= 1; - - tile_height = MAX2(tile_height, 1); - superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode); - - if (tile_width_out) - *tile_width_out = tile_width; - - if (tile_height_out) - *tile_height_out = tile_height; - - return superblocks; + unsigned nn_core_version = ctx->screen->specs.nn_core_version; + if (nn_core_version == 7) + return etna_ml_calculate_tiling_v7(ctx, operation, tile_width_out, tile_height_out); + else + return etna_ml_calculate_tiling_v8(ctx, operation, tile_width_out, tile_height_out); } static struct etna_bo * diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h index 2376723aa06..6f0c546cf9e 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h @@ -10,15 +10,18 @@ void etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels, unsigned *output_width, unsigned *output_height, unsigned *output_channels); +unsigned +etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out); + struct etna_bo * etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size); +unsigned +etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out); + struct etna_bo * etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size); -unsigned -etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out); - void etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph, const struct pipe_ml_operation *poperation, diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c index beada6a59c8..3dddfc1c9e9 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c @@ -14,6 +14,98 @@ map_resource(struct pipe_resource *resource) return etna_bo_map(etna_resource(resource)->bo); } +#define MAX_TILE_WIDTH 64 + +static unsigned +calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode) +{ + unsigned nn_core_count = ctx->screen->info->npu.nn_core_count; + unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth; + unsigned output_channels = operation->addition ? 1 : operation->output_channels; + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count); + unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y; + + if (operation->weight_width == 1) + foo = MIN2(foo, nn_accum_buffer_depth / 3); + + foo = MIN2(foo, kernels_per_core); + foo = MIN2(foo, 127); + + kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo); + unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count); + unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels); + + return superblocks; +} + +static unsigned +calc_interleave_mode(unsigned tile_width, unsigned weight_height) +{ + unsigned mode = 8; + + if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2) + return 1; + + if (tile_width > MAX_TILE_WIDTH / 2) + mode = 1; + else if (tile_width > MAX_TILE_WIDTH / 4) + mode = 2; + else if (tile_width > MAX_TILE_WIDTH / 8) + mode = 4; + + if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) + return MIN2(mode, 4); + + return MIN2(mode, 2); +} + +unsigned +etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out) +{ + unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth; + unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth; + unsigned input_width = operation->input_width; + unsigned input_height = operation->input_height; + unsigned input_channels = operation->input_channels; + unsigned output_width = operation->output_width; + unsigned output_height = operation->output_height; + unsigned output_channels = operation->output_channels; + unsigned tile_width; + unsigned tile_height; + unsigned superblocks; + unsigned interleave_mode; + + if (operation->addition) + etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels, + &output_width, &output_height, &output_channels); + + if (operation->pooling_first_pixel) { + output_width *= 2; + output_height *= 2; + } + + tile_width = MIN2(output_width, 64); + interleave_mode = calc_interleave_mode(tile_width, operation->weight_height); + + tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1; + tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth); + tile_height = MIN2(tile_height, output_height); + + if (operation->stride > 1 && tile_height % 2 > 0) + tile_height -= 1; + + tile_height = MAX2(tile_height, 1); + superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode); + + if (tile_width_out) + *tile_width_out = tile_width; + + if (tile_height_out) + *tile_height_out = tile_height; + + return superblocks; +} + static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation) { @@ -103,7 +195,7 @@ write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, co uint32_t *biases = map_resource(operation->bias_tensor); unsigned out_values_per_channel = operation->output_width * operation->output_height; unsigned stride = MIN2(input_channels, 6); - unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL); + unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL); uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)]; uint32_t *initial_ptr = map; bool do_write = initial_ptr != NULL; @@ -182,7 +274,7 @@ write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigne uint8_t *input = map_resource(operation->weight_tensor); uint32_t *biases = map_resource(operation->bias_tensor); unsigned out_values_per_channel = operation->output_width * operation->output_height; - unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL); + unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL); uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input; uint32_t *initial_ptr = map; bool do_write = initial_ptr != NULL; @@ -268,7 +360,7 @@ write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned uint8_t *input = map_resource(operation->weight_tensor); uint32_t *biases = map_resource(operation->bias_tensor); unsigned out_values_per_channel = operation->output_width * operation->output_height; - unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL); + unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL); uint32_t *initial_ptr = map; bool do_write = initial_ptr != NULL; uint64_t buffer = 0; diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c index b4a623f2f3d..bb93eadeab3 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c @@ -36,6 +36,143 @@ struct etna_nn_header_v8 { uint32_t stream_size[0]; }; +#define MAX_TILE_WIDTH 64 + +static unsigned +calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_x, unsigned tile_y, unsigned interleave_mode) +{ + unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count; + struct etna_core_info *info = etna_gpu_get_core_info(ctx->screen->npu); + unsigned nn_accum_buffer_depth = info->npu.nn_accum_buffer_depth; + unsigned output_channels = operation->output_channels; + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count); + unsigned tiles_per_core; + + if (operation->weight_width == 1) + tiles_per_core = nn_accum_buffer_depth / DIV_ROUND_UP(tile_y, interleave_mode); + else { + unsigned tile_size = DIV_ROUND_UP(DIV_ROUND_UP(tile_y * tile_x, operation->stride), 64); + tiles_per_core = nn_accum_buffer_depth / (tile_size * operation->stride); + } + + tiles_per_core = MIN2(tiles_per_core, (nn_accum_buffer_depth * 6) / 9); + + tiles_per_core = MIN2(tiles_per_core, kernels_per_core); + tiles_per_core = MIN2(tiles_per_core, 127); + + kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * tiles_per_core); + unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count); + + return DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels); +} + +static unsigned +calc_interleave_mode(struct etna_context *ctx, unsigned tile_width, unsigned weight_height) +{ + unsigned mode; + + if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2) + return 1; + + if (tile_width <= MAX_TILE_WIDTH / 2) { + if (MAX_TILE_WIDTH / 4 < tile_width) + mode = 2; + else + mode = 4; + } else + mode = 1; + + if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) { + if (mode >= 2) { + return 2; + } + } else { + if (mode >= 4) { + return 4; + } + } + + if (tile_width <= MAX_TILE_WIDTH / 2) { + if (MAX_TILE_WIDTH / 4 < tile_width) + return 2; + else + return 4; + } + + return 1; +} + +unsigned +etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out) +{ + unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth; + unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth; + unsigned input_width = operation->input_width; + unsigned input_height = operation->input_height; + unsigned input_channels = operation->input_channels; + unsigned output_width = operation->output_width; + unsigned output_height = operation->output_height; + unsigned output_channels = operation->output_channels; + unsigned tile_width; + unsigned tile_height; + unsigned superblocks; + unsigned interleave_mode; + + if (operation->addition) + etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels, + &output_width, &output_height, &output_channels); + + if (operation->pooling_first_pixel) { + output_width *= 2; + output_height *= 2; + } + + tile_width = MIN2(output_width, 64); + interleave_mode = calc_interleave_mode(ctx, tile_width, operation->weight_height); + + tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1; + tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth); + tile_height = MIN2(tile_height, output_height); + + /* This gets us the best performance on MobileDet */ + /* TODO: Find the optimal value, or at least let the user override it */ + tile_height = MIN2(tile_height, 4); + + if (operation->stride > 1 && tile_height % 2 > 0) + tile_height -= 1; + + tile_height = MAX2(tile_height, 1); + + superblocks = calc_superblocks(ctx, operation, tile_width, tile_height, interleave_mode); + + if (tile_width_out) + *tile_width_out = tile_width; + + if (tile_height_out) + *tile_height_out = tile_height; + + return superblocks; +} + +static void +reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + uint8_t *input = map_resource(operation->weight_tensor); + struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, + pipe_buffer_size(operation->weight_tensor)); + uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res); + + for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) { + unsigned out_channel = i % operation->output_channels; + + output[out_channel][i / operation->output_channels] = input[i]; + } + + pipe_resource_reference(&operation->weight_tensor, NULL); + operation->weight_tensor = output_res; +} + struct bitstream { unsigned bits_in_buffer; uint64_t buffer; @@ -594,7 +731,7 @@ fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *ope unsigned output_channels = operation->output_channels; unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count; unsigned cores_used = MIN2(output_channels, nn_core_count); - unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL); + unsigned superblocks = etna_ml_calculate_tiling_v8(ctx, operation, NULL, NULL); unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks); unsigned channel_per_superblock[superblocks];