diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
index f0a693c66f1..239f64b0069 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
@@ -513,51 +513,6 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
                                   operation->weight_scale);
 }
 
-#define MAX_TILE_WIDTH 64
-
-static unsigned
-calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
-{
-   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
-   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
-   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
-   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
-   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;
-
-   if (operation->weight_width == 1)
-      foo = MIN2(foo, nn_accum_buffer_depth / 3);
-
-   foo = MIN2(foo, kernels_per_core);
-   foo = MIN2(foo, 127);
-
-   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
-   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
-   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
-
-   return superblocks;
-}
-
-static unsigned
-calc_interleave_mode(unsigned tile_width, unsigned weight_height)
-{
-   unsigned mode = 8;
-
-   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
-      return 1;
-
-   if (tile_width > MAX_TILE_WIDTH / 2)
-      mode = 1;
-   else if (tile_width > MAX_TILE_WIDTH / 4)
-      mode = 2;
-   else if (tile_width > MAX_TILE_WIDTH / 8)
-      mode = 4;
-
-   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
-      return MIN2(mode, 4);
-
-   return MIN2(mode, 2);
-}
-
 void
 etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                             unsigned *output_width, unsigned *output_height, unsigned *output_channels)
@@ -590,51 +545,14 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
    *output_channels = 1;
 }
 
-unsigned
+static unsigned
 etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
 {
-   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
-   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
-   unsigned input_width = operation->input_width;
-   unsigned input_height = operation->input_height;
-   unsigned input_channels = operation->input_channels;
-   unsigned output_width = operation->output_width;
-   unsigned output_height = operation->output_height;
-   unsigned output_channels = operation->output_channels;
-   unsigned tile_width;
-   unsigned tile_height;
-   unsigned superblocks;
-   unsigned interleave_mode;
-
-   if (operation->addition)
-      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
-                                  &output_width, &output_height, &output_channels);
-
-   if (operation->pooling_first_pixel) {
-      output_width *= 2;
-      output_height *= 2;
-   }
-
-   tile_width = MIN2(output_width, 64);
-   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
-
-   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
-   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
-   tile_height = MIN2(tile_height, output_height);
-
-   if (operation->stride > 1 && tile_height % 2 > 0)
-      tile_height -= 1;
-
-   tile_height = MAX2(tile_height, 1);
-   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
-
-   if (tile_width_out)
-      *tile_width_out = tile_width;
-
-   if (tile_height_out)
-      *tile_height_out = tile_height;
-
-   return superblocks;
+   /* Core version 7 has its own tiling code; all other versions use the v8 one. */
+   if (ctx->screen->specs.nn_core_version == 7)
+      return etna_ml_calculate_tiling_v7(ctx, operation, tile_width_out, tile_height_out);
+
+   return etna_ml_calculate_tiling_v8(ctx, operation, tile_width_out, tile_height_out);
 }
 
 static struct etna_bo *
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
index 2376723aa06..6f0c546cf9e 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
@@ -10,15 +10,18 @@ void
 etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                             unsigned *output_width, unsigned *output_height, unsigned *output_channels);
 
+unsigned
+etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
+
 struct etna_bo *
 etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
 
+unsigned
+etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
+
 struct etna_bo *
 etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
 
-unsigned
-etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
-
 void
 etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                           const struct pipe_ml_operation *poperation,
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c
index beada6a59c8..3dddfc1c9e9 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v7.c
@@ -14,6 +14,98 @@ map_resource(struct pipe_resource *resource)
    return etna_bo_map(etna_resource(resource)->bo);
 }
 
+#define MAX_TILE_WIDTH 64
+
+/* Computes the superblock count for an operation on v7 cores. */
+static unsigned
+calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
+{
+   const unsigned core_count = ctx->screen->info->npu.nn_core_count;
+   const unsigned accum_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   const unsigned out_channels = operation->addition ? 1 : operation->output_channels;
+   const unsigned channels_per_core = DIV_ROUND_UP(out_channels, core_count);
+
+   /* Start from accum depth * interleave mode / tile height, then clamp to
+    * a third of the accum depth (1-wide weights only), the per-core
+    * channel count and 127. */
+   unsigned per_core_limit = (accum_depth * interleave_mode) / tile_y;
+   if (operation->weight_width == 1)
+      per_core_limit = MIN2(per_core_limit, accum_depth / 3);
+   per_core_limit = MIN2(MIN2(per_core_limit, channels_per_core), 127);
+
+   const unsigned kernels_per_core = DIV_ROUND_UP(out_channels, core_count * per_core_limit);
+   const unsigned num_kernels = DIV_ROUND_UP(out_channels, kernels_per_core * core_count);
+   return DIV_ROUND_UP(channels_per_core, num_kernels);
+}
+
+/* Picks the v7 interleave mode (1, 2 or 4) from the tile width and weight height. */
+static unsigned
+calc_interleave_mode(unsigned tile_width, unsigned weight_height)
+{
+   const unsigned footprint = weight_height - 1 + tile_width;
+   unsigned width_mode;
+
+   if (footprint > (MAX_TILE_WIDTH + 8) / 2)
+      return 1;
+
+   /* Mode derived from the width alone; 8 never survives the cap below. */
+   if (tile_width > MAX_TILE_WIDTH / 2)
+      width_mode = 1;
+   else if (tile_width > MAX_TILE_WIDTH / 4)
+      width_mode = 2;
+   else
+      width_mode = tile_width > MAX_TILE_WIDTH / 8 ? 4 : 8;
+
+   return MIN2(width_mode, footprint > (MAX_TILE_WIDTH + 8) / 4 ? 4 : 2);
+}
+
+/*
+ * Computes the tile size and superblock count for an operation on v7 cores.
+ * tile_width_out and tile_height_out are optional (may be NULL) and receive
+ * the chosen tile dimensions. Returns the superblock count.
+ */
+unsigned
+etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
+{
+   const unsigned in_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
+   const unsigned accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned in_w = operation->input_width;
+   unsigned in_h = operation->input_height;
+   unsigned in_c = operation->input_channels;
+   unsigned out_w = operation->output_width;
+   unsigned out_h = operation->output_height;
+   unsigned out_c = operation->output_channels;
+
+   /* For additions, let etna_ml_calc_addition_sizes() rewrite the dimensions. */
+   if (operation->addition)
+      etna_ml_calc_addition_sizes(&in_w, &in_h, &in_c, &out_w, &out_h, &out_c);
+
+   if (operation->pooling_first_pixel) {
+      out_w *= 2;
+      out_h *= 2;
+   }
+
+   const unsigned tile_w = MIN2(out_w, MAX_TILE_WIDTH);
+   const unsigned mode = calc_interleave_mode(tile_w, operation->weight_height);
+
+   /* Tile height is limited by the input buffer depth, the accumulation
+    * buffer depth and the output height. An odd height is decremented for
+    * strided operations; the result is at least 1. */
+   unsigned tile_h = in_buffer_depth * mode - operation->weight_height + 1;
+   tile_h = MIN2(MIN2(tile_h, mode * accum_buffer_depth), out_h);
+   if (operation->stride > 1 && tile_h % 2 > 0)
+      tile_h -= 1;
+   tile_h = MAX2(tile_h, 1);
+
+   const unsigned superblocks = calc_superblocks(ctx, operation, tile_h, mode);
+   if (tile_width_out)
+      *tile_width_out = tile_w;
+   if (tile_height_out)
+      *tile_height_out = tile_h;
+
+   return superblocks;
+}
+
 static uint32_t
 calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
 {
@@ -103,7 +195,7 @@ write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, co
    uint32_t *biases = map_resource(operation->bias_tensor);
    unsigned out_values_per_channel = operation->output_width * operation->output_height;
    unsigned stride = MIN2(input_channels, 6);
-   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
    uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
    uint32_t *initial_ptr = map;
    bool do_write = initial_ptr != NULL;
@@ -182,7 +274,7 @@ write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigne
    uint8_t *input = map_resource(operation->weight_tensor);
    uint32_t *biases = map_resource(operation->bias_tensor);
    unsigned out_values_per_channel = operation->output_width * operation->output_height;
-   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
    uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
    uint32_t *initial_ptr = map;
    bool do_write = initial_ptr != NULL;
@@ -268,7 +360,7 @@ write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned
    uint8_t *input = map_resource(operation->weight_tensor);
    uint32_t *biases = map_resource(operation->bias_tensor);
    unsigned out_values_per_channel = operation->output_width * operation->output_height;
-   unsigned superblocks = etna_ml_calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
    uint32_t *initial_ptr = map;
    bool do_write = initial_ptr != NULL;
    uint64_t buffer = 0;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
index b4a623f2f3d..bb93eadeab3 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
@@ -36,6 +36,143 @@ struct etna_nn_header_v8 {
    uint32_t stream_size[0];
 };
 
+#define MAX_TILE_WIDTH 64
+
+/* Computes the superblock count for an operation on v8 cores. */
+static unsigned
+calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_x, unsigned tile_y, unsigned interleave_mode)
+{
+   const unsigned core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   struct etna_core_info *core_info = etna_gpu_get_core_info(ctx->screen->npu);
+   const unsigned accum_depth = core_info->npu.nn_accum_buffer_depth;
+   const unsigned out_channels = operation->output_channels;
+   const unsigned channels_per_core = DIV_ROUND_UP(out_channels, core_count);
+   unsigned tiles_limit;
+
+   if (operation->weight_width == 1) {
+      tiles_limit = accum_depth / DIV_ROUND_UP(tile_y, interleave_mode);
+   } else {
+      /* Tile area divided by the stride, counted in units of 64 (rounded up). */
+      const unsigned tile_size = DIV_ROUND_UP(DIV_ROUND_UP(tile_y * tile_x, operation->stride), 64);
+      tiles_limit = accum_depth / (tile_size * operation->stride);
+   }
+
+   /* Clamp to 6/9 of the accum depth, the per-core channel count and 127. */
+   tiles_limit = MIN2(tiles_limit, (accum_depth * 6) / 9);
+   tiles_limit = MIN2(MIN2(tiles_limit, channels_per_core), 127);
+
+   const unsigned kernels_per_core = DIV_ROUND_UP(out_channels, core_count * tiles_limit);
+   const unsigned num_kernels = DIV_ROUND_UP(out_channels, kernels_per_core * core_count);
+   return DIV_ROUND_UP(channels_per_core, num_kernels);
+}
+
+/*
+ * Picks the interleave mode (1, 2 or 4) for a tile on v8 cores.
+ *
+ * ctx is not read by this function. The result depends on tile_width and
+ * on the sum weight_height - 1 + tile_width:
+ *  - sum above (MAX_TILE_WIDTH + 8) / 2: always 1
+ *  - sum above (MAX_TILE_WIDTH + 8) / 4: the width-derived mode, at most 2
+ *  - otherwise: the width-derived mode
+ * The width-derived mode is 1, 2 or 4, see below.
+ */
+static unsigned
+calc_interleave_mode(struct etna_context *ctx, unsigned tile_width, unsigned weight_height)
+{
+   const unsigned footprint = weight_height - 1 + tile_width;
+   unsigned mode;
+
+   if (footprint > (MAX_TILE_WIDTH + 8) / 2)
+      return 1;
+
+   /* Width-derived mode: the narrower the tile, the higher the mode. */
+   if (tile_width > MAX_TILE_WIDTH / 2)
+      mode = 1;
+   else if (tile_width > MAX_TILE_WIDTH / 4)
+      mode = 2;
+   else
+      mode = 4;
+
+   /* Only a mode of 4 is affected by the cap; 1 and 2 pass through
+    * unchanged either way.
+    */
+   if (footprint > (MAX_TILE_WIDTH + 8) / 4)
+      return MIN2(mode, 2);
+
+   return mode;
+}
+
+/*
+ * Computes the tile size and superblock count for an operation on v8 cores.
+ * tile_width_out and tile_height_out are optional (may be NULL) and receive
+ * the chosen tile dimensions. Returns the superblock count.
+ */
+unsigned
+etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
+{
+   const unsigned in_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
+   const unsigned accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
+   unsigned in_w = operation->input_width;
+   unsigned in_h = operation->input_height;
+   unsigned in_c = operation->input_channels;
+   unsigned out_w = operation->output_width;
+   unsigned out_h = operation->output_height;
+   unsigned out_c = operation->output_channels;
+
+   /* For additions, let etna_ml_calc_addition_sizes() rewrite the dimensions. */
+   if (operation->addition)
+      etna_ml_calc_addition_sizes(&in_w, &in_h, &in_c, &out_w, &out_h, &out_c);
+
+   if (operation->pooling_first_pixel) {
+      out_w *= 2;
+      out_h *= 2;
+   }
+
+   const unsigned tile_w = MIN2(out_w, MAX_TILE_WIDTH);
+   const unsigned mode = calc_interleave_mode(ctx, tile_w, operation->weight_height);
+
+   /* Tile height is limited by the input buffer depth, the accumulation
+    * buffer depth and the output height.
+    */
+   unsigned tile_h = in_buffer_depth * mode - operation->weight_height + 1;
+   tile_h = MIN2(MIN2(tile_h, mode * accum_buffer_depth), out_h);
+
+   /* This gets us the best performance on MobileDet */
+   /* TODO: Find the optimal value, or at least let the user override it */
+   tile_h = MIN2(tile_h, 4);
+
+   /* An odd height is decremented for strided operations; result is at least 1. */
+   if (operation->stride > 1 && tile_h % 2 > 0)
+      tile_h -= 1;
+   tile_h = MAX2(tile_h, 1);
+
+   const unsigned superblocks = calc_superblocks(ctx, operation, tile_w, tile_h, mode);
+   if (tile_width_out)
+      *tile_width_out = tile_w;
+   if (tile_height_out)
+      *tile_height_out = tile_h;
+   return superblocks;
+}
+
+/* Replaces the weight tensor with a copy where source byte i sits at [i % output_channels][i / output_channels]. */
+static void
+reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
+{
+   struct pipe_context *pctx = subgraph->base.context;
+   const unsigned kernel_size = operation->weight_width * operation->weight_height;
+   uint8_t *src = map_resource(operation->weight_tensor);
+   struct pipe_resource *reordered = pipe_buffer_create(pctx->screen, 0, PIPE_USAGE_DEFAULT,
+                                                        pipe_buffer_size(operation->weight_tensor));
+   uint8_t (*dst)[kernel_size] = (void *)map_resource(reordered);
+
+   for (unsigned pos = 0; pos < kernel_size; pos++)
+      for (unsigned ch = 0; ch < operation->output_channels; ch++)
+         dst[ch][pos] = src[pos * operation->output_channels + ch];
+
+   pipe_resource_reference(&operation->weight_tensor, NULL);
+   operation->weight_tensor = reordered;
+}
+
 struct bitstream {
    unsigned bits_in_buffer;
    uint64_t buffer;
@@ -594,7 +731,7 @@ fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *ope
    unsigned output_channels = operation->output_channels;
    unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
-   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
+   unsigned superblocks = etna_ml_calculate_tiling_v8(ctx, operation, NULL, NULL);
    unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
 
    unsigned channel_per_superblock[superblocks];