From bb06e082f8d8faba32fccc17bf8d76938c6a3e3f Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <tomeu.vizoso@ideasonboard.com>
Date: Wed, 23 Oct 2024 10:02:14 +0200
Subject: [PATCH] etnaviv/ml: Implement tiling for V8

The code had to be tweaked to stay safe on the i.MX8MP.

Also, we are for now being very conservative with tiling to prevent
underruns.

In the future, we may want to consider testing different possibilities
during compilation and choosing the optimal one. We could also detect
underruns by checking whether the NPU hangs with a given combination.

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
---
 src/gallium/drivers/etnaviv/etnaviv_ml_nn.c   |  94 +-----------
 src/gallium/drivers/etnaviv/etnaviv_ml_nn.h   |   9 +-
 .../drivers/etnaviv/etnaviv_ml_nn_v7.c        |  98 +++++++++++-
 .../drivers/etnaviv/etnaviv_ml_nn_v8.c        | 139 +++++++++++++++++-
 4 files changed, 245 insertions(+), 95 deletions(-)

diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
index f0a693c66f1..239f64b0069 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
@@ -513,51 +513,6 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
                                   operation->weight_scale);
 }
 
-#define MAX_TILE_WIDTH 64
-
-static unsigned
-calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
-{
-   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
-   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
-   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
-   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
-   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;
-
-   if (operation->weight_width == 1)
-      foo = MIN2(foo, nn_accum_buffer_depth / 3);
-
-   foo = MIN2(foo, kernels_per_core);
-   foo = MIN2(foo, 127);
-
-   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
-   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
-   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
-
-   return superblocks;
-}
-
-static unsigned
-calc_interleave_mode(unsigned tile_width, unsigned weight_height)
-{
-   unsigned mode = 8;
-
-   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
-      return 1;
-
-   if (tile_width > MAX_TILE_WIDTH / 2)
-      mode = 1;
-   else if (tile_width > MAX_TILE_WIDTH / 4)
-      mode = 2;
-   else if (tile_width > MAX_TILE_WIDTH / 8)
-      mode = 4;
-
-   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
-      return MIN2(mode, 4);
-
-   return MIN2(mode, 2);
-}
-
 void
 etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                             unsigned *output_width, unsigned *output_height, unsigned *output_channels)
@@ -590,51 +545,14 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
    *output_channels = 1;
 }
 
-unsigned
+static unsigned /* file-local dispatcher: selects the tiling routine for the NN core version */
 etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
 {
-   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
-   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
-   unsigned input_width = operation->input_width;
-   unsigned input_height = operation->input_height;
-   unsigned input_channels = operation->input_channels;
-   unsigned output_width = operation->output_width;
-   unsigned output_height = operation->output_height;
-   unsigned output_channels = operation->output_channels;
-   unsigned tile_width;
-   unsigned tile_height;
-   unsigned superblocks;
-   unsigned interleave_mode;
-
-   if (operation->addition)
-      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
-                                  &output_width, &output_height, &output_channels);
-
-   if (operation->pooling_first_pixel) {
-      output_width *= 2;
-      output_height *= 2;
-   }
-
-   tile_width = MIN2(output_width, 64);
-   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
-
-   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
-   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
-   tile_height = MIN2(tile_height, output_height);
-
-   if (operation->stride > 1 && tile_height % 2 > 0)
-      tile_height -= 1;
-
-   tile_height = MAX2(tile_height, 1);
-   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
-
-   if (tile_width_out)
-      *tile_width_out = tile_width;
-
-   if (tile_height_out)
-      *tile_height_out = tile_height;
-
-   return superblocks;
+   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
+   if (nn_core_version == 7)
+      return etna_ml_calculate_tiling_v7(ctx, operation, tile_width_out, tile_height_out);
+   else /* every core version other than 7 takes the V8 path */
+      return etna_ml_calculate_tiling_v8(ctx, operation, tile_width_out, tile_height_out);
 }
 
 static struct etna_bo *
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
index 2376723aa06..6f0c546cf9e 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
@@ -10,15 +10,18 @@ void
 etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                             unsigned *output_width, unsigned *output_height, unsigned *output_channels);
 
+unsigned /* V7 tiling: returns the superblock count; the tile size is stored through the out-pointers when they are non-NULL */
+etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
+
 struct etna_bo *
 etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
 
+unsigned /* V8 tiling: returns the superblock count; the tile size is stored through the out-pointers when they are non-NULL */
+etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
+
 struct etna_bo *
 etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
 
-unsigned
-etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
-
 void
 etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                           const struct pipe_ml_operation *poperation,
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c
index beada6a59c8..3dddfc1c9e9 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c
@@ -14,6 +14,98 @@ map_resource(struct pipe_resource *resource)
    return etna_bo_map(etna_resource(resource)->bo);
 }
 
+#define MAX_TILE_WIDTH 64 /* basis for the tile-width and footprint thresholds in calc_interleave_mode() */
+/* Returns the number of superblocks for a V7 operation, given the tile height (tile_y) and the interleave mode. */
+static unsigned
+calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
+{
+   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels; /* additions count as a single output channel */
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
+   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y; /* tile_y must be non-zero; presumably a per-core kernel budget - TODO confirm and rename */
+
+   if (operation->weight_width == 1)
+      foo = MIN2(foo, nn_accum_buffer_depth / 3); /* weights of width 1 get an extra cap of a third of the depth */
+
+   foo = MIN2(foo, kernels_per_core);
+   foo = MIN2(foo, 127); /* hard upper limit */
+
+   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo); /* NOTE(review): foo == 0 would zero the divisor - confirm it cannot happen */
+   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
+   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
+
+   return superblocks;
+}
+/* Picks the interleave mode (1, 2 or 4) from the tile width and the weight height. */
+static unsigned
+calc_interleave_mode(unsigned tile_width, unsigned weight_height)
+{
+   unsigned mode = 8; /* starting value; the caps at the end keep the result at 4 or below */
+
+   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2) /* threshold is 36 */
+      return 1;
+
+   if (tile_width > MAX_TILE_WIDTH / 2) /* wider than 32 */
+      mode = 1;
+   else if (tile_width > MAX_TILE_WIDTH / 4) /* 17..32 */
+      mode = 2;
+   else if (tile_width > MAX_TILE_WIDTH / 8) /* 9..16 */
+      mode = 4;
+
+   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) /* threshold is 18 */
+      return MIN2(mode, 4);
+
+   return MIN2(mode, 2); /* NOTE(review): sums of 18 or less get the lower cap (2), larger ones the higher cap (4) - confirm intent */
+}
+/* V7 tiling: computes the tile width and height, stores them through the optional out-pointers and returns the superblock count. */
+unsigned
+etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
+{
+   unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
+   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned input_width = operation->input_width; /* the input_* and output_channels locals are only passed to etna_ml_calc_addition_sizes() */
+   unsigned input_height = operation->input_height;
+   unsigned input_channels = operation->input_channels;
+   unsigned output_width = operation->output_width;
+   unsigned output_height = operation->output_height;
+   unsigned output_channels = operation->output_channels;
+   unsigned tile_width;
+   unsigned tile_height;
+   unsigned superblocks;
+   unsigned interleave_mode;
+
+   if (operation->addition) /* additions tile on the sizes rewritten by etna_ml_calc_addition_sizes() */
+      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
+                                  &output_width, &output_height, &output_channels);
+
+   if (operation->pooling_first_pixel) { /* tile on twice the output size in both dimensions */
+      output_width *= 2;
+      output_height *= 2;
+   }
+
+   tile_width = MIN2(output_width, 64); /* literal has the same value as MAX_TILE_WIDTH */
+   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
+
+   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
+   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
+   tile_height = MIN2(tile_height, output_height);
+
+   if (operation->stride > 1 && tile_height % 2 > 0) /* keep the height even when stride > 1 */
+      tile_height -= 1;
+
+   tile_height = MAX2(tile_height, 1); /* never 0: calc_superblocks() divides by it */
+   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
+
+   if (tile_width_out)
+      *tile_width_out = tile_width;
+
+   if (tile_height_out)
+      *tile_height_out = tile_height;
+
+   return superblocks;
+}
+
 static uint32_t
 calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
 {
@@ -103,7 +195,7 @@ write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, co
    uint32_t *biases = map_resource(operation->bias_tensor);
    unsigned out_values_per_channel = operation->output_width * operation->output_height;
    unsigned stride = MIN2(input_channels, 6);
-   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
    uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
    uint32_t *initial_ptr = map;
    bool do_write = initial_ptr != NULL;
@@ -182,7 +274,7 @@ write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigne
    uint8_t *input = map_resource(operation->weight_tensor);
    uint32_t *biases = map_resource(operation->bias_tensor);
    unsigned out_values_per_channel = operation->output_width * operation->output_height;
-   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
    uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
    uint32_t *initial_ptr = map;
    bool do_write = initial_ptr != NULL;
@@ -268,7 +360,7 @@ write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned
    uint8_t *input = map_resource(operation->weight_tensor);
    uint32_t *biases = map_resource(operation->bias_tensor);
    unsigned out_values_per_channel = operation->output_width * operation->output_height;
-   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
    uint32_t *initial_ptr = map;
    bool do_write = initial_ptr != NULL;
    uint64_t buffer = 0;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
index b4a623f2f3d..bb93eadeab3 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
@@ -36,6 +36,143 @@ struct etna_nn_header_v8 {
    uint32_t stream_size[0];
 };
 
+#define MAX_TILE_WIDTH 64 /* basis for the tile-width and footprint thresholds in calc_interleave_mode() */
+/* Returns the number of superblocks for a V8 operation, given the tile size (tile_x by tile_y) and the interleave mode. */
+static unsigned
+calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_x, unsigned tile_y, unsigned interleave_mode)
+{
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   struct etna_core_info *info = etna_gpu_get_core_info(ctx->screen->npu); /* NOTE(review): different accessor than the line above - confirm both describe the same core */
+   unsigned nn_accum_buffer_depth = info->npu.nn_accum_buffer_depth;
+   unsigned output_channels = operation->output_channels;
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
+   unsigned tiles_per_core;
+
+   if (operation->weight_width == 1)
+      tiles_per_core = nn_accum_buffer_depth / DIV_ROUND_UP(tile_y, interleave_mode); /* width-1 weights: depth over tile_y / interleave_mode, rounded up */
+   else {
+      unsigned tile_size = DIV_ROUND_UP(DIV_ROUND_UP(tile_y * tile_x, operation->stride), 64); /* tile area per stride, in units of 64, rounded up */
+      tiles_per_core = nn_accum_buffer_depth / (tile_size * operation->stride);
+   }
+
+   tiles_per_core = MIN2(tiles_per_core, (nn_accum_buffer_depth * 6) / 9); /* cap at two thirds of the accumulation buffer depth */
+
+   tiles_per_core = MIN2(tiles_per_core, kernels_per_core);
+   tiles_per_core = MIN2(tiles_per_core, 127); /* hard upper limit */
+
+   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * tiles_per_core); /* NOTE(review): tiles_per_core == 0 would zero the divisor - confirm it cannot happen */
+   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
+
+   return DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
+}
+/* Picks the interleave mode (1, 2 or 4) from the tile width and the weight height; ctx is not used in the body. */
+static unsigned
+calc_interleave_mode(struct etna_context *ctx, unsigned tile_width, unsigned weight_height)
+{
+   unsigned mode;
+
+   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2) /* threshold is 36 */
+      return 1;
+
+   if (tile_width <= MAX_TILE_WIDTH / 2) { /* width-based candidate: 4 up to 16, 2 for 17..32, otherwise 1 */
+      if (MAX_TILE_WIDTH / 4 < tile_width)
+         mode = 2;
+      else
+         mode = 4;
+   } else
+      mode = 1;
+
+   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) { /* threshold is 18 */
+      if (mode >= 2) { /* above the threshold a candidate of 2 or 4 becomes 2 */
+         return 2;
+      }
+   } else {
+      if (mode >= 4) {
+         return 4;
+      }
+   }
+
+   if (tile_width <= MAX_TILE_WIDTH / 2) { /* remaining cases: repeats the width test above, so this yields the candidate value */
+      if (MAX_TILE_WIDTH / 4 < tile_width)
+         return 2;
+      else
+         return 4;
+   }
+
+   return 1;
+}
+/* V8 tiling: computes the tile width and height, stores them through the optional out-pointers and returns the superblock count. */
+unsigned
+etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
+{
+   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
+   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
+   unsigned input_width = operation->input_width; /* the input_* and output_channels locals are only passed to etna_ml_calc_addition_sizes() */
+   unsigned input_height = operation->input_height;
+   unsigned input_channels = operation->input_channels;
+   unsigned output_width = operation->output_width;
+   unsigned output_height = operation->output_height;
+   unsigned output_channels = operation->output_channels;
+   unsigned tile_width;
+   unsigned tile_height;
+   unsigned superblocks;
+   unsigned interleave_mode;
+
+   if (operation->addition) /* additions tile on the sizes rewritten by etna_ml_calc_addition_sizes() */
+      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
+                                 &output_width, &output_height, &output_channels);
+
+   if (operation->pooling_first_pixel) { /* tile on twice the output size in both dimensions */
+      output_width *= 2;
+      output_height *= 2;
+   }
+
+   tile_width = MIN2(output_width, 64); /* literal has the same value as MAX_TILE_WIDTH */
+   interleave_mode = calc_interleave_mode(ctx, tile_width, operation->weight_height);
+
+   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
+   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
+   tile_height = MIN2(tile_height, output_height);
+
+   /* This gets us the best performance on MobileDet */
+   /* TODO: Find the optimal value, or at least let the user override it */
+   tile_height = MIN2(tile_height, 4);
+
+   if (operation->stride > 1 && tile_height % 2 > 0) /* keep the height even when stride > 1 */
+      tile_height -= 1;
+
+   tile_height = MAX2(tile_height, 1); /* never 0, also when the operation's output height is 0 */
+
+   superblocks = calc_superblocks(ctx, operation, tile_width, tile_height, interleave_mode);
+
+   if (tile_width_out)
+      *tile_width_out = tile_width;
+
+   if (tile_height_out)
+      *tile_height_out = tile_height;
+
+   return superblocks;
+}
+/* Replaces operation->weight_tensor with a new buffer of the same size holding the weights regrouped per output channel. */
+static void
+reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
+{
+   struct pipe_context *context = subgraph->base.context;
+   uint8_t *input = map_resource(operation->weight_tensor);
+   struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
+                                                         pipe_buffer_size(operation->weight_tensor)); /* NOTE(review): the result is used without a failure check */
+   uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res); /* one row of width * height bytes per output channel */
+
+   for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) {
+      unsigned out_channel = i % operation->output_channels; /* the source is read with the channel index varying fastest */
+
+      output[out_channel][i / operation->output_channels] = input[i];
+   }
+
+   pipe_resource_reference(&operation->weight_tensor, NULL); /* presumably drops the reference to the old tensor - confirm */
+   operation->weight_tensor = output_res;
+}
+
 struct bitstream {
    unsigned bits_in_buffer;
    uint64_t buffer;
@@ -594,7 +731,7 @@ fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *ope
    unsigned output_channels = operation->output_channels;
    unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
-   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v8(ctx, operation, NULL, NULL);
    unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
 
    unsigned channel_per_superblock[superblocks];