etnaviv/ml: Implement tiling for V8

We had to tweak the code to stay safe on the i.MX8MP.

Also, for now we are being very conservative with tiling to prevent
underruns.

In the future, we may want to consider testing different possibilities
during compilation and choosing the optimal one, and perhaps detecting
underruns by checking whether the NPU hung with a given combination.

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
Tomeu Vizoso 2024-10-23 10:02:14 +02:00 committed by Marge Bot
parent 0ef5aa5fb6
commit bb06e082f8
4 changed files with 245 additions and 95 deletions
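
The compile-time search and underrun detection mentioned in the commit message are not part of this change. Purely for illustration, a minimal sketch of what such a search could look like follows; etna_try_tile_height() is a hypothetical stand-in for submitting a small test job and reporting whether the NPU hung and how long it took, not a real driver entry point.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical: would submit a small test job with the given tile height and
 * report whether it completed (no hang) and how long it took. The body here
 * is a fake cost model just so the sketch runs standalone. */
static bool
etna_try_tile_height(unsigned tile_height, uint64_t *ns_out)
{
   *ns_out = 1000000 / tile_height + (tile_height > 4 ? 500000 : 0);
   return tile_height <= 8; /* pretend larger tiles hang the NPU */
}

/* Try every candidate tile height, skip the ones that hang, keep the fastest. */
static unsigned
pick_tile_height(unsigned max_tile_height)
{
   unsigned best = 1;
   uint64_t best_ns = UINT64_MAX;

   for (unsigned h = 1; h <= max_tile_height; h++) {
      uint64_t ns;

      if (!etna_try_tile_height(h, &ns))
         continue; /* NPU hung with this combination, discard it */

      if (ns < best_ns) {
         best_ns = ns;
         best = h;
      }
   }

   return best;
}

int main(void)
{
   printf("chosen tile_height: %u\n", pick_tile_height(16));
   return 0;
}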

@@ -513,51 +513,6 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
                     operation->weight_scale);
}

#define MAX_TILE_WIDTH 64

static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;

   if (operation->weight_width == 1)
      foo = MIN2(foo, nn_accum_buffer_depth / 3);

   foo = MIN2(foo, kernels_per_core);
   foo = MIN2(foo, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);

   return superblocks;
}

static unsigned
calc_interleave_mode(unsigned tile_width, unsigned weight_height)
{
   unsigned mode = 8;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width > MAX_TILE_WIDTH / 2)
      mode = 1;
   else if (tile_width > MAX_TILE_WIDTH / 4)
      mode = 2;
   else if (tile_width > MAX_TILE_WIDTH / 8)
      mode = 4;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
      return MIN2(mode, 4);

   return MIN2(mode, 2);
}
void
etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                            unsigned *output_width, unsigned *output_height, unsigned *output_channels)
@@ -590,51 +545,14 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
   *output_channels = 1;
}
unsigned
static unsigned
etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);

   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;

   if (nn_core_version == 7)
      return etna_ml_calculate_tiling_v7(ctx, operation, tile_width_out, tile_height_out);
   else
      return etna_ml_calculate_tiling_v8(ctx, operation, tile_width_out, tile_height_out);
}
static struct etna_bo *

@@ -10,15 +10,18 @@ void
etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                            unsigned *output_width, unsigned *output_height, unsigned *output_channels);

unsigned
etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);

struct etna_bo *
etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);

unsigned
etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);

struct etna_bo *
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);

unsigned
etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);

void
etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                          const struct pipe_ml_operation *poperation,

@@ -14,6 +14,98 @@ map_resource(struct pipe_resource *resource)
   return etna_bo_map(etna_resource(resource)->bo);
}
#define MAX_TILE_WIDTH 64

static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;

   if (operation->weight_width == 1)
      foo = MIN2(foo, nn_accum_buffer_depth / 3);

   foo = MIN2(foo, kernels_per_core);
   foo = MIN2(foo, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);

   return superblocks;
}

static unsigned
calc_interleave_mode(unsigned tile_width, unsigned weight_height)
{
   unsigned mode = 8;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width > MAX_TILE_WIDTH / 2)
      mode = 1;
   else if (tile_width > MAX_TILE_WIDTH / 4)
      mode = 2;
   else if (tile_width > MAX_TILE_WIDTH / 8)
      mode = 4;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
      return MIN2(mode, 4);

   return MIN2(mode, 2);
}
unsigned
etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);

   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}
static uint32_t
calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
{
@@ -103,7 +195,7 @@ write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, co
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned stride = MIN2(input_channels, 6);
   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
@@ -182,7 +274,7 @@ write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigne
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
@@ -268,7 +360,7 @@ write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;

@@ -36,6 +36,143 @@ struct etna_nn_header_v8 {
   uint32_t stream_size[0];
};

#define MAX_TILE_WIDTH 64

static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_x, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   struct etna_core_info *info = etna_gpu_get_core_info(ctx->screen->npu);
   unsigned nn_accum_buffer_depth = info->npu.nn_accum_buffer_depth;
   unsigned output_channels = operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned tiles_per_core;

   if (operation->weight_width == 1)
      tiles_per_core = nn_accum_buffer_depth / DIV_ROUND_UP(tile_y, interleave_mode);
   else {
      unsigned tile_size = DIV_ROUND_UP(DIV_ROUND_UP(tile_y * tile_x, operation->stride), 64);
      tiles_per_core = nn_accum_buffer_depth / (tile_size * operation->stride);
   }

   tiles_per_core = MIN2(tiles_per_core, (nn_accum_buffer_depth * 6) / 9);
   tiles_per_core = MIN2(tiles_per_core, kernels_per_core);
   tiles_per_core = MIN2(tiles_per_core, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * tiles_per_core);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);

   return DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
}
static unsigned
calc_interleave_mode(struct etna_context *ctx, unsigned tile_width, unsigned weight_height)
{
   unsigned mode;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width <= MAX_TILE_WIDTH / 2) {
      if (MAX_TILE_WIDTH / 4 < tile_width)
         mode = 2;
      else
         mode = 4;
   } else
      mode = 1;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) {
      if (mode >= 2) {
         return 2;
      }
   } else {
      if (mode >= 4) {
         return 4;
      }
   }

   if (tile_width <= MAX_TILE_WIDTH / 2) {
      if (MAX_TILE_WIDTH / 4 < tile_width)
         return 2;
      else
         return 4;
   }

   return 1;
}
unsigned
etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(ctx, tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   /* This gets us the best performance on MobileDet */
   /* TODO: Find the optimal value, or at least let the user override it */
   tile_height = MIN2(tile_height, 4);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);

   superblocks = calc_superblocks(ctx, operation, tile_width, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}
static void
reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
                                                         pipe_buffer_size(operation->weight_tensor));
   uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res);

   for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) {
      unsigned out_channel = i % operation->output_channels;
      output[out_channel][i / operation->output_channels] = input[i];
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}
struct bitstream {
   unsigned bits_in_buffer;
   uint64_t buffer;
@@ -594,7 +731,7 @@ fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *ope
   unsigned output_channels = operation->output_channels;
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
   unsigned superblocks = etna_ml_calculate_tiling_v8(ctx, operation, NULL, NULL);
   unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
   unsigned channel_per_superblock[superblocks];
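
As an editor's aside, the following standalone snippet mirrors the tile-geometry arithmetic of etna_ml_calculate_tiling_v8() above for one hypothetical 3x3, stride-2 convolution layer. The buffer depths are assumed values (the real ones come from etna_ml_get_core_info()), and the interleave-mode choice is a simplified stand-in for calc_interleave_mode(); it is a sketch, not part of the commit.

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MAX_TILE_WIDTH 64

int main(void)
{
   /* Assumed NPU parameters; real values come from the core info. */
   unsigned nn_input_buffer_depth = 12;
   unsigned nn_accum_buffer_depth = 64;

   /* Hypothetical 3x3, stride-2 convolution layer. */
   unsigned output_width = 112, output_height = 112;
   unsigned weight_height = 3;
   unsigned stride = 2;

   unsigned tile_width = MIN2(output_width, 64);

   /* Simplified interleave-mode choice; the full calc_interleave_mode() has
    * additional weight_height-dependent clamping. */
   unsigned interleave_mode = 1;
   if (weight_height - 1 + tile_width <= (MAX_TILE_WIDTH + 8) / 2) {
      if (tile_width <= MAX_TILE_WIDTH / 4)
         interleave_mode = 4;
      else if (tile_width <= MAX_TILE_WIDTH / 2)
         interleave_mode = 2;
   }

   unsigned tile_height = nn_input_buffer_depth * interleave_mode - weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);
   tile_height = MIN2(tile_height, 4); /* the conservative cap added by this commit */
   if (stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;
   tile_height = MAX2(tile_height, 1);

   printf("tile_width=%u tile_height=%u interleave_mode=%u\n",
          tile_width, tile_height, interleave_mode);
   return 0;
}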