etnaviv: Use TP cores to accelerate tensor transformations

Vivante NPUs can contain systolic arrays that can be used to perform
several kinds of tensor transformations.

Use these to offload the CPU.

Acked-by: Christian Gmeiner <cgmeiner@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25714>
This commit is contained in:
Tomeu Vizoso 2023-11-16 14:41:03 +01:00 committed by Marge Bot
parent d6473ce28e
commit 60c2bcb8af
5 changed files with 873 additions and 0 deletions

View file

@ -13,6 +13,7 @@
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_ml_tp.h"
#include "etnaviv_ml.h"
struct pipe_resource *
@ -60,6 +61,16 @@ etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned
ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
}
/* Decide whether a convolution's input must first be reshuffled
 * (destrided) by the TP cores before the NN cores can consume it.
 *
 * Only strided, non-pointwise convolutions qualify; strided depthwise
 * convolutions are handled directly unless the input width is in the
 * narrow 3..5 range this path supports.
 */
static bool
needs_reshuffle(const struct pipe_ml_operation *poperation)
{
   if (poperation->conv.pointwise)
      return false;

   if (poperation->conv.stride_x <= 1 && poperation->conv.stride_y <= 1)
      return false;

   unsigned input_width = poperation->input_tensor->dims[1];
   if (poperation->conv.depthwise && (input_width > 5 || input_width < 3))
      return false;

   return true;
}
static void
reference_tensor_with_offset(struct etna_ml_subgraph *subgraph,
unsigned src_tensor,
@ -84,6 +95,10 @@ dump_graph(struct list_head *etna_operations)
unsigned i = 0;
list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
switch(operation->type) {
case ETNA_JOB_TYPE_TP:
ML_DBG("%3d %-4s %3d %3d",
i, "TP", operation->input_tensor, operation->output_tensor);
break;
case ETNA_JOB_TYPE_NN:
ML_DBG("%3d %-4s %3d %3d in2: %3d",
i, "NN", operation->input_tensor, operation->output_tensor, operation->add_input_tensor);
@ -107,6 +122,12 @@ lower_operations(struct etna_ml_subgraph *subgraph,
switch(poperation->type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
unsigned input_tensor = poperation->input_tensor->index;
if (needs_reshuffle(poperation)) {
struct etna_operation *operation = calloc(1, sizeof(*operation));
etna_ml_lower_reshuffle(subgraph, poperation, operation, &input_tensor);
list_addtail(&operation->link, etna_operations);
}
struct etna_operation *operation = calloc(1, sizeof(*operation));
etna_ml_lower_convolution(subgraph, poperation, operation);
operation->input_tensor = input_tensor;
@ -129,6 +150,7 @@ lower_operations(struct etna_ml_subgraph *subgraph,
struct etna_operation *operation = calloc(1, sizeof(*operation));
unsigned input_tensor = poperations[0].input_tensor->index;
unsigned output_tensor;
etna_ml_lower_transpose(subgraph, &poperations[0], operation, &output_tensor);
list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
if (operation->input_tensor == input_tensor)
operation->input_tensor = output_tensor;
@ -150,6 +172,22 @@ lower_operations(struct etna_ml_subgraph *subgraph,
operation->input_tensor_size / 2);
}
/* Detranspose any output tensors that aren't inputs to other operations
* and have output channels, these are the outputs of the graph.
*/
list_for_each_entry_safe(struct etna_operation, operation, etna_operations, link) {
struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensor);
if (res != NULL)
continue;
if (operation->output_channels > 1) {
struct etna_operation *transpose_operation = calloc(1, sizeof(*operation));
etna_ml_lower_detranspose(subgraph, operation, transpose_operation);
operation->output_tensor = transpose_operation->input_tensor;
list_add(&transpose_operation->link, &operation->link);
}
}
/* Create any output tensors that aren't inputs to other operations, these
* are the outputs of the graph.
*/
@ -236,6 +274,9 @@ etna_ml_subgraph_create(struct pipe_context *pcontext,
case ETNA_JOB_TYPE_NN:
etna_ml_compile_operation_nn(subgraph, operation, &instruction);
break;
case ETNA_JOB_TYPE_TP:
etna_ml_compile_operation_tp(subgraph, operation, &instruction);
break;
}
util_dynarray_append(&subgraph->operations, struct etna_vip_instruction, instruction);
@ -359,6 +400,12 @@ etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psub
if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
switch (operation->type) {
case ETNA_JOB_TYPE_TP:
for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
dump_buffer(operation->configs[j], "tp", dump_id);
dump_id++;
}
break;
case ETNA_JOB_TYPE_NN:
dump_buffer(operation->configs[0], "nn", dump_id);
dump_buffer(operation->coefficients, "compressed", dump_id);
@ -389,6 +436,9 @@ etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psub
etna_cmd_stream_ref_bo(stream, etna_resource(operation->output)->bo, ETNA_RELOC_WRITE);
switch (operation->type) {
case ETNA_JOB_TYPE_TP:
etna_ml_emit_operation_tp(subgraph, operation, i);
break;
case ETNA_JOB_TYPE_NN:
etna_ml_emit_operation_nn(subgraph, operation, i);
break;

View file

@ -13,6 +13,13 @@
/* Which hardware unit a lowered operation runs on. */
enum etna_job_type {
   ETNA_JOB_TYPE_NN,  /* neural-network (convolution) cores */
   ETNA_JOB_TYPE_TP,  /* tensor-processing cores */
};

/* Tensor transformations implemented on the TP cores (see etnaviv_ml_tp.c). */
enum etna_ml_tp_type {
   ETNA_ML_TP_TRANSPOSE,    /* move channels to the innermost dimension */
   ETNA_ML_TP_DETRANSPOSE,  /* inverse of TRANSPOSE, restoring the original layout */
   ETNA_ML_TP_RESHUFFLE,    /* destride a strided convolution's input */
};
struct etna_ml_subgraph {
@ -40,6 +47,7 @@ struct etna_operation {
struct list_head link;
enum etna_job_type type;
enum etna_ml_tp_type tp_type;
bool addition;
bool depthwise;

View file

@ -0,0 +1,780 @@
/*
* Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "util/u_inlines.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml_tp.h"
/* Declares one bitfield of a TP configuration word. */
#define FIELD(field, bits) uint32_t field : bits;

/* In-memory layout of a TP job configuration as consumed by the hardware.
 *
 * Each numbered comment marks the start of a 32-bit word; the bitfields
 * within one word add up to exactly 32 bits, so the struct can be written
 * straight into a BO and pointed to by VIVS_PS_TP_INST_ADDR.
 *
 * NOTE(review): field names/positions presumably mirror what the
 * proprietary driver emits — confirm against hardware documentation
 * where available.
 */
struct etna_tp_params {
   /* 0 */
   FIELD(in_image_x_size, 16)
   FIELD(unused0, 16)

   /* 1 */
   FIELD(in_image_y_size, 16)
   FIELD(in_image_z_size, 16)

   /* 2 */
   FIELD(in_image_stride, 16)
   FIELD(unused1, 16)

   /* 3 */
   FIELD(in_image_slice, 32)

   /* 4 */
   FIELD(in_window_x_start, 16)
   FIELD(in_window_y_start, 16)

   /* 5 */
   FIELD(in_window_x_end, 16)
   FIELD(in_window_y_end, 16)

   /* 6 */
   FIELD(in_tile_sequence, 2)
   FIELD(in_tile_global_mem, 1)
   FIELD(in_image_global_mem, 1)
   FIELD(alu_i2f_enable, 1)
   FIELD(alu_square_enable, 1)
   FIELD(alu_horz_processing, 3) /* Watch out, it is split in two in the blob */
   FIELD(alu_horz_proc_count, 6)
   FIELD(alu_horz_proc_stride, 1)
   FIELD(alu_vert_processing, 2)
   FIELD(unused2, 1)
   FIELD(alu_vert_proc_count, 6)
   FIELD(alu_vert_proc_stride, 1)
   FIELD(alu_nms_enable, 1)
   FIELD(alu_pwl_enable, 1)
   FIELD(alu_mult_enable, 1)
   FIELD(alu_f2i_enable, 1)
   FIELD(alu_load_pwl_lut, 1)
   FIELD(alu_load_pwl_lut_global_mem, 1)

   /* 7 */
   FIELD(in_tile_list_address, 32)

   /* 8 */
   FIELD(in_tile_x_size, 16)
   FIELD(in_tile_y_size, 16)

   /* 9 */
   FIELD(in_tile_x_inc, 16)
   FIELD(in_tile_y_inc, 16)

   /* 10 */
   FIELD(in_image_base_address, 32)

   /* 11 */
   FIELD(alu_load_pwl_lut_address, 32)

   /* 12 */
   FIELD(out_tile_skip_at_border, 1)
   FIELD(out_image_global_mem, 1)
   FIELD(out_loop_1_reset, 1)
   FIELD(out_loop_2_reset, 1)
   FIELD(out_loop_3_reset, 1)
   FIELD(out_brick_mode, 1)
   FIELD(alu_z_filter_mode, 1)
   FIELD(unused3, 1)
   FIELD(in_window_z_start_overfetch, 2)
   FIELD(unused4, 1)
   FIELD(in_window_z_end_overfetch, 2)
   FIELD(unused5, 1)
   FIELD(alu_square_preshift, 4)
   FIELD(in_image_data_type, 3)
   FIELD(out_image_data_type, 3)
   FIELD(unused6, 4)
   FIELD(alu_pwl_sign_support, 1)
   FIELD(alu_relu_enable, 1)
   FIELD(no_flush, 1)
   FIELD(last, 1)

   /* 13 */
   FIELD(out_image_base_address, 32)

   /* 14 */
   FIELD(out_loop_0_inc, 32)

   /* 15 */
   FIELD(out_loop_1_inc, 32)

   /* 16 */
   FIELD(out_loop_0_count, 16)
   FIELD(out_loop_1_count, 16)

   /* 17 */
   FIELD(out_loop_2_inc, 32)

   /* 18 */
   FIELD(out_loop_3_inc, 32)

   /* 19 */
   FIELD(out_loop_2_count, 16)
   FIELD(out_loop_3_count, 16)

   /* 20 */
   FIELD(out_loop_4_inc, 32)

   /* 21 */
   FIELD(out_loop_5_inc, 32)

   /* 22 */
   FIELD(out_loop_4_count, 16)
   FIELD(out_loop_5_count, 16)

   /* 23 */
   FIELD(out_loop_6_inc, 32)

   /* 24 */
   FIELD(alu_filter_pwl_swap, 1)
   FIELD(flat_rounding_mode, 2)
   FIELD(integer_rounding_mode, 2)
   FIELD(alu_input_preshift, 5)
   FIELD(alu_output_postshift, 5)
   FIELD(alu_reorder_bits_used, 4)
   FIELD(alu_reorder_loop_2_mode, 1)
   FIELD(unused7, 4)
   FIELD(in_image_border_mode, 2)
   FIELD(alu_output_postshift_5_6, 2)
   FIELD(unused8, 4)

   /* 25 */
   FIELD(in_image_circular_buf_size, 32) /* >> 6 */

   /* 26 */
   FIELD(in_image_circular_buf_end_address_plus_1, 32) /* >> 6 */

   /* 27 */
   FIELD(out_image_circular_buf_size, 32) /* >> 6 */

   /* 28 */
   FIELD(out_image_circular_buf_end_address_plus_1, 32) /* >> 6 */

   /* 29 */
   FIELD(in_image_border_const, 16)
   FIELD(coef_zp, 8)
   FIELD(in_zp, 8)

   /* 30 */
   FIELD(out_zp, 8)
   FIELD(alu_output_post_multiplier, 15)
   FIELD(unused9, 9)
};
/* Fill a TP configuration with the defaults shared by all job types.
 *
 * Values are the ones common to every TP job this driver emits
 * (presumably taken from proprietary-driver traces — TODO confirm).
 * Per-job code afterwards overrides the image sizes, addresses and
 * output-loop parameters it cares about. Note alu_i2f_enable and
 * alu_f2i_enable are on by default, and "last" marks the config as
 * the final one of its job unless overridden.
 */
static void
set_default_tp_config(struct etna_tp_params *map)
{
   map->unused0 = 0x0;
   map->unused1 = 0x0;
   /* Input window starts at the origin; no padding by default. */
   map->in_window_x_start = 0x0;
   map->in_window_y_start = 0x0;
   map->in_tile_sequence = 0x0;
   map->in_tile_global_mem = 0x0;
   map->in_image_global_mem = 0x1;
   /* Data passes through the ALU as int -> float -> int. */
   map->alu_i2f_enable = 0x1;
   map->alu_square_enable = 0x0;
   map->alu_horz_processing = 0x0;
   map->alu_horz_proc_count = 0x0;
   map->alu_horz_proc_stride = 0x0;
   map->alu_vert_processing = 0x0;
   map->unused2 = 0x0;
   map->alu_vert_proc_count = 0x0;
   map->alu_vert_proc_stride = 0x0;
   map->alu_nms_enable = 0x0;
   map->alu_pwl_enable = 0x0;
   map->alu_mult_enable = 0x0;
   map->alu_f2i_enable = 0x1;
   map->alu_load_pwl_lut = 0x0;
   map->alu_load_pwl_lut_global_mem = 0x0;
   map->in_tile_list_address = 0x0;
   map->in_tile_x_size = 0x1;
   map->in_tile_x_inc = 0x1;
   map->alu_load_pwl_lut_address = 0x0;
   map->out_tile_skip_at_border = 0x0;
   map->out_image_global_mem = 0x1;
   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_brick_mode = 0x0;
   map->alu_z_filter_mode = 0x0;
   map->unused3 = 0x0;
   map->in_window_z_start_overfetch = 0x0;
   map->unused4 = 0x0;
   map->in_window_z_end_overfetch = 0x0;
   map->unused5 = 0x0;
   map->alu_square_preshift = 0x0;
   map->in_image_data_type = 0x0;
   map->out_image_data_type = 0x0;
   map->unused6 = 0x0;
   map->alu_pwl_sign_support = 0x0;
   map->alu_relu_enable = 0x0;
   /* Flush on completion; multi-core jobs clear this on all but the last. */
   map->no_flush = 0x0;
   map->last = 0x1;
   /* Output loop nest: innermost loop advances by one element, outer
    * loops disabled (count 1 / inc 0) until a job type configures them. */
   map->out_loop_0_inc = 0x1;
   map->out_loop_3_inc = 0x0;
   map->out_loop_3_count = 0x1;
   map->out_loop_4_inc = 0x0;
   map->out_loop_5_inc = 0x0;
   map->out_loop_4_count = 0x1;
   map->out_loop_5_count = 0x1;
   map->out_loop_6_inc = 0x0;
   map->alu_filter_pwl_swap = 0x0;
   map->flat_rounding_mode = 0x1;
   map->integer_rounding_mode = 0x1;
   map->alu_input_preshift = 0x0;
   map->alu_output_postshift = 0x0;
   map->alu_reorder_bits_used = 0x0;
   map->alu_reorder_loop_2_mode = 0x0;
   map->unused7 = 0x0;
   map->in_image_border_mode = 0x0;
   map->alu_output_postshift_5_6 = 0x0;
   map->unused8 = 0x0;
   map->in_image_border_const = 0x0;
   map->coef_zp = 0x0;
   map->alu_output_post_multiplier = 0x0;
   map->unused9 = 0x0;
}
/* Build the TP configuration BO for a TRANSPOSE job.
 *
 * The input is read with channels as the innermost (x) dimension and the
 * output loops scatter elements so that width becomes the outer stride —
 * i.e. a pure layout change; quantization parameters are untouched (out_zp
 * is set from the input zero point).
 *
 * Returns a new CPU-written BO the caller owns; it is stored in
 * instruction->configs[0] by etna_ml_compile_operation_tp().
 */
static struct etna_bo *
create_transpose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   /* Input image viewed as (x=channels, y=height, z=width). */
   map->in_image_x_size = operation->input_channels;
   map->in_image_y_size = operation->input_height;
   map->in_image_z_size = operation->input_width;
   map->in_image_stride = operation->input_channels;
   map->in_image_slice = operation->input_width * operation->input_channels;
   map->in_window_x_end = operation->input_channels - 1;
   map->in_window_y_end = operation->input_height - 1;
   map->in_tile_y_size = operation->input_height;
   map->in_tile_y_inc = operation->input_height;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   unsigned offset = etna_ml_get_offset(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   /* Output loop nest writes a width x height plane per channel. */
   map->out_loop_1_inc = operation->input_width * operation->input_height;
   map->out_loop_0_count = operation->input_height;
   map->out_loop_1_count = operation->input_channels;
   map->out_loop_2_inc = operation->input_height;
   map->out_loop_2_count = operation->input_width;
   /* Circular buffering disabled: end address covers the whole aperture. */
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;
   map->no_flush = 0x0; /* redundant with the default, kept explicit */

   etna_bo_cpu_fini(bo);

   return bo;
}
/* Build the TP configuration BO for a DETRANSPOSE job.
 *
 * Inverse of the TRANSPOSE job: the input is read as one flat
 * width x (height * channels) plane and the output loop nest
 * re-interleaves elements back into the original layout. Pure layout
 * change; out_zp is set from the input zero point.
 *
 * Returns a new CPU-written BO the caller owns.
 */
static struct etna_bo *
create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   /* Whole tensor as a single 2D image: y spans height * channels. */
   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height * input_channels;
   map->in_image_z_size = 0x1;
   map->in_image_stride = input_width;
   map->in_image_slice = input_width * input_height * input_channels;
   map->in_window_x_end = input_width - 1;
   map->in_window_y_end = input_height * input_channels - 1;
   map->in_tile_y_size = 0x1;
   map->in_tile_y_inc = 0x1;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);

   /* Output loops step by channel count so elements land interleaved. */
   map->out_loop_0_inc = input_channels;
   map->out_loop_1_inc = 0x0;
   map->out_loop_0_count = input_height;
   map->out_loop_1_count = 0x1;
   map->out_loop_2_inc = input_height * input_channels;
   map->out_loop_2_count = input_width;
   map->out_loop_3_inc = 0x1;
   map->out_loop_3_count = input_channels;
   map->out_loop_4_inc = input_width * input_height * input_channels;
   /* Circular buffering disabled. */
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   etna_bo_cpu_fini(bo);

   return bo;
}
/* Configure the input image dimensions of a RESHUFFLE job, accounting for
 * how the work is split across TP cores.
 *
 * For SAME padding with multiple channels, the split is done along z
 * (channels); otherwise the split is along y (rows). The single-channel
 * SAME-padding path special-cases input widths 3/5/8/80/112 — presumably
 * the sizes of the models this was validated against (TODO confirm);
 * other widths hit unreachable().
 */
static void
set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used)
{
   map->in_image_x_size = operation->input_width;
   if (operation->padding_same && operation->input_channels > 1) {
      /* Split the work along the channel dimension. */
      map->in_image_y_size = operation->input_height;
      map->in_image_z_size = operation->input_channels / tp_cores_used;
   } else if (operation->padding_same && operation->input_channels == 1) {
      switch(operation->input_width) {
      case 3:
      case 5:
         /* Small images: each core sees the full height. */
         map->in_image_y_size = operation->input_height;
         break;
      case 8:
         switch(operation->weight_width) {
         case 3:
            map->in_image_y_size = operation->input_height;
            break;
         case 5:
            map->in_image_y_size = 5;
            break;
         }
         break;
      case 80:
      case 112:
         /* Split along y, with extra rows of overlap for the padding
          * window (2 rows for 3x3 weights, 1 for 5x5). */
         switch(operation->weight_width) {
         case 3:
            map->in_image_y_size = operation->input_height / tp_cores_used + 2;
            break;
         case 5:
            map->in_image_y_size = operation->input_height / tp_cores_used + 1;
            break;
         }
         break;
      default:
         unreachable("Unsupported input width");
      }
      map->in_image_z_size = operation->input_channels;
   } else {
      /* No padding: plain split along y. */
      map->in_image_y_size = operation->input_height / tp_cores_used;
      map->in_image_z_size = operation->input_channels;
   }
}
/* Build the TP configuration BO for one core's slice of a RESHUFFLE job.
 *
 * A reshuffle destrides a strided convolution's input: the image is
 * rewritten so the NN cores can run the convolution as stride 1 (the
 * output has width/stride x height/stride spatial size and
 * channels * stride^2 channels, see etna_ml_lower_reshuffle()).
 *
 * @tp_core        index of the core this config is for
 * @tp_cores_used  total number of cores the job is split across
 *
 * Returns a new CPU-written BO the caller owns.
 */
static struct etna_bo *
create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                        unsigned tp_core, unsigned tp_cores_used)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = ctx->screen->specs.tp_core_count;
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   set_input_size(operation, map, tp_cores_used);

   map->in_image_stride = operation->input_width;
   map->in_image_slice = operation->input_width * operation->input_height;

   /* Negative window starts implement the left/top border of SAME padding;
    * coordinates are 16-bit two's complement (0xffff == -1, 0xfffe == -2). */
   if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) {
      if (operation->weight_width == 5 && operation->input_width < 8) {
         map->in_window_x_start = 0xfffe;
         map->in_window_y_start = 0xfffe;
      } else {
         map->in_window_x_start = 0xffff;
         map->in_window_y_start = 0xffff;
      }
   } else {
      map->in_window_x_start = 0x0;
      map->in_window_y_start = 0x0;
   }

   map->in_window_x_end = operation->input_width - 1;
   map->in_window_y_end = (operation->input_height / tp_cores_used) - 1;
   map->in_tile_x_size = operation->input_width;
   map->in_tile_x_inc = operation->input_width;

   if (operation->input_width <= 8 && operation->input_channels == 1) {
      /* Small single-channel images are processed whole per core. */
      map->in_tile_y_size = operation->input_height;
      map->in_tile_y_inc = operation->input_height;
   } else {
      map->in_tile_y_size = operation->input_height / tp_cores_used;
      map->in_tile_y_inc = operation->input_height / tp_cores_used;
   }

   if (operation->padding_same) {
      /* Grow window and tile to also cover the padded border. */
      switch(operation->weight_width) {
      case 3:
         map->in_window_x_end += 2;
         if (operation->input_width < 8) {
            map->in_tile_x_size += 3;
            map->in_tile_y_size += 1;
            map->in_tile_y_inc += 1;
         } else {
            map->in_tile_x_size += 2;
         }
         break;
      case 5:
         map->in_window_x_end += 3;
         if (operation->input_width < 8) {
            map->in_tile_x_size += 5;
         } else {
            map->in_tile_x_size += 4;
         }
         break;
      default:
         unreachable("Unsupported weight size");
      }

      if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5)
         map->in_tile_x_size = operation->input_width / tp_cores_used + 2;

      if (operation->input_width > 8 && operation->input_channels == 1) {
         switch(operation->weight_width) {
         case 3:
            map->in_window_y_end = (operation->input_height / tp_cores_used) + 1;
            break;
         case 5:
            map->in_window_y_end = (operation->input_height / tp_cores_used);
            break;
         default:
            unreachable("Unsupported weight size");
         }
      } else
         map->in_window_y_end = map->in_window_x_end;

      map->in_tile_x_inc = map->in_tile_x_size;

      if (operation->input_channels > 1) {
         map->in_tile_y_size = map->in_tile_x_size;
         map->in_tile_y_inc = map->in_tile_x_size;
      } else {
         map->in_tile_y_size += 2;
         map->in_tile_y_inc += 2;
      }
   } else {
      if (operation->input_width < 8) {
         map->in_window_x_end += 1;
         map->in_window_y_end += 1;
         map->in_tile_x_size += 1;
         map->in_tile_y_size += 1;
         map->in_tile_x_inc += 1;
         map->in_tile_y_inc += 1;
      }
   }

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   /* Each core starts at its own slice of the input. */
   if (operation->padding_same)
      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
   else
      map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);

   if (operation->padding_same)
      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
   else
      map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core;

   /* Output loop nest scatters each stride x stride phase into its own
    * plane; the per-phase plane is round(width / 2)^2 elements. */
   map->out_loop_1_reset = 0x1;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x1;
   map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2);
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = 0x2;
   map->out_loop_1_count = round(operation->input_width / 2.0);
   map->out_loop_2_count = 0x2;
   map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used);

   if (operation->padding_same) {
      /* Padded output planes are 1 (3x3) or 2 (5x5) elements wider. */
      switch(operation->weight_width) {
      case 3:
         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2);
         map->out_loop_1_count += 1;
         break;
      case 5:
         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2);
         map->out_loop_1_count += 2;
         break;
      default:
         unreachable("Unsupported weight size");
      }

      if (operation->input_channels == 1)
         map->out_loop_3_count += 1;
      else
         map->out_loop_3_count = map->out_loop_1_count;
   }

   map->out_loop_2_inc = map->out_loop_0_inc * 2;
   map->out_loop_3_inc = map->out_loop_1_count;
   map->out_loop_6_inc = map->out_loop_0_inc * 4;

   /* With SAME padding and one channel split among several cores, the row
    * slices overlap; trim the interior cores and shift their addresses. */
   if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) {
      if (tp_core > 0) {
         map->in_image_y_size -= 2;
         map->in_window_y_end -= 2;
         map->in_tile_y_size -= 2;
         map->in_tile_y_inc -= 2;
         map->out_loop_3_count -= 1;
      }

      if (tp_core == tp_core_count - 1) {
         map->in_image_y_size -= 2;
      }

      if (tp_core > 0) {
         map->in_image_base_address += operation->input_width * 2;
         map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1);
      }
   }

   unsigned alu_size = operation->input_width;
   if (operation->padding_same) {
      alu_size += 1;
      if (operation->weight_width == 5)
         alu_size += 1;
      if (operation->input_width == 5)
         alu_size += 1;
   }

   /* Number of significant bits in alu_size. */
   map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size);

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   /* Only the last core's job flushes, batching the whole reshuffle. */
   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   /* Degenerate split (fewer than two rows per core): reinterpret the image
    * with width-sized rows, split along z instead, and recompute the
    * per-core base addresses from scratch. */
   if (map->in_image_y_size < 2) {
      map->in_image_y_size = operation->input_width;
      map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used;
      map->in_window_y_end = operation->input_width;
      map->in_tile_y_size = operation->input_width + 1;
      map->in_tile_y_inc = operation->input_width + 1;
      map->out_loop_3_count += 1;
      map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
      /* Fix: the output base address must come from the output tensor's BO;
       * this previously used the input BO, which would make the TP write its
       * results over the buffer it is still reading from. */
      map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);
      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
   }

   etna_bo_cpu_fini(bo);

   return bo;
}
/* Lower a graph-input transpose to a TP TRANSPOSE operation.
 *
 * Reads dimensions and quantization from the first operation's input
 * tensor, allocates a fresh tensor for the transposed result and returns
 * its index through @output_tensor. Dimensions and quantization are
 * unchanged by the transpose.
 */
void
etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *first_operation,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   unsigned width = first_operation->input_tensor->dims[1];
   unsigned height = first_operation->input_tensor->dims[2];
   unsigned channels = first_operation->input_tensor->dims[3];

   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_TRANSPOSE;

   operation->input_tensor = first_operation->input_tensor->index;
   operation->input_width = width;
   operation->input_height = height;
   operation->input_channels = channels;
   operation->input_zero_point = first_operation->input_tensor->zero_point;
   operation->input_scale = first_operation->input_tensor->scale;
   operation->input_tensor_size = width * height * channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensor = *output_tensor;
   operation->output_width = width;
   operation->output_height = height;
   operation->output_channels = channels;
   operation->output_zero_point = first_operation->input_tensor->zero_point;
   operation->output_scale = first_operation->input_tensor->scale;
}
/* Create a TP DETRANSPOSE operation that undoes the layout change after a
 * convolution.
 *
 * The detranspose writes into the convolution's original output tensor;
 * a fresh tensor is allocated as its input, which the caller redirects
 * the convolution to write into instead.
 */
void
etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
                          struct etna_operation *convolution,
                          struct etna_operation *operation)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_DETRANSPOSE;

   /* Output: the convolution's original destination, unchanged. */
   operation->output_tensor = convolution->output_tensor;
   operation->output_width = convolution->output_width;
   operation->output_height = convolution->output_height;
   operation->output_channels = convolution->output_channels;
   operation->output_zero_point = convolution->output_zero_point;
   operation->output_scale = convolution->output_scale;

   /* Input: a fresh intermediate tensor with the same shape and
    * quantization as the convolution's output. */
   operation->input_tensor = etna_ml_allocate_tensor(subgraph);
   operation->input_width = convolution->output_width;
   operation->input_height = convolution->output_height;
   operation->input_channels = convolution->output_channels;
   operation->input_zero_point = convolution->output_zero_point;
   operation->input_scale = convolution->output_scale;
   operation->input_tensor_size = convolution->output_width *
                                  convolution->output_height *
                                  convolution->output_channels;
}
/* Create a TP RESHUFFLE operation that destrides a strided convolution's
 * input.
 *
 * The output tensor (allocated here, index returned through
 * @output_tensor) has width/stride x height/stride spatial size and
 * channels * stride^2 channels, so the following convolution can run at
 * stride 1. With SAME padding the output is grown to make room for the
 * border, by an amount that depends on the weight size.
 */
void
etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *convolution,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   unsigned stride = convolution->conv.stride_x;

   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_RESHUFFLE;
   operation->stride = stride;
   operation->padding_same = convolution->conv.padding_same;

   operation->input_tensor = convolution->input_tensor->index;
   operation->input_width = convolution->input_tensor->dims[1];
   operation->input_height = convolution->input_tensor->dims[2];
   operation->input_channels = convolution->input_tensor->dims[3];
   operation->input_zero_point = convolution->input_tensor->zero_point;
   operation->input_scale = convolution->input_tensor->scale;
   operation->input_tensor_size = operation->input_width *
                                  operation->input_height *
                                  operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensor = *output_tensor;
   operation->output_width = DIV_ROUND_UP(operation->input_width, stride);
   operation->output_height = DIV_ROUND_UP(operation->input_height, stride);
   operation->output_channels = operation->input_channels * stride * stride;
   operation->output_zero_point = convolution->input_tensor->zero_point;
   operation->output_scale = convolution->input_tensor->scale;

   /* When destriding a convolution, the transformation to be made to the input
    * tensor will depend on the size of the weight tensor.
    */
   operation->weight_width = convolution->conv.weight_tensor->dims[1];
   operation->weight_height = convolution->conv.weight_tensor->dims[2];

   if (operation->padding_same) {
      unsigned grow = operation->weight_width == 5 ? 2 : 1;
      operation->output_width += grow;
      operation->output_height += grow;
   }
}
/* Compile a lowered TP operation into per-core configuration buffers.
 *
 * References the input/output resources in @instruction and fills
 * instruction->configs[]: one BO for transpose/detranspose, and one BO
 * per participating core for reshuffle.
 */
void
etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
                             const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   assert(input);
   pipe_resource_reference(&instruction->input, input);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   assert(output);
   pipe_resource_reference(&instruction->output, output);

   switch (operation->tp_type) {
   case ETNA_ML_TP_TRANSPOSE:
      instruction->configs[0] = create_transpose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_DETRANSPOSE:
      instruction->configs[0] = create_detranspose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_RESHUFFLE: {
      unsigned tp_core_count = ctx->screen->specs.tp_core_count;
      unsigned tp_cores_used;

      /* Small single-channel images don't split profitably. */
      tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;

      /* TODO: Run among the 4 cores for faster performance */
      /* NOTE(review): 320/224-wide 3-channel inputs (presumably common
       * camera-model sizes) are forced to a single core for now. */
      if ((operation->input_width == 320 || operation->input_width == 224) &&
          operation->input_channels == 3)
         tp_cores_used = 1;

      ML_DBG("reshuffle: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);

      for (unsigned i = 0; i < tp_cores_used; i++) {
         instruction->configs[i] = create_reshuffle_config(subgraph, operation, i, tp_cores_used);
      }
      break;
   }
   }
   instruction->type = ETNA_JOB_TYPE_TP;
}
/* Emit the command-stream state that kicks all TP jobs of one instruction.
 *
 * @idx is the instruction's position in the subgraph, used to derive the
 * sync/trigger value when parallel NPU execution is enabled
 * (ETNA_DBG_NPU_NO_PARALLEL disables it).
 */
void
etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = ctx->screen->specs.tp_core_count;
   struct etna_cmd_stream *stream = ctx->stream;
   /* configs[] is filled contiguously, so a non-NULL configs[1] means the
    * operation was split across several cores. */
   bool more_than_one_tp_job = operation->configs[1] != NULL;
   bool parallel = !DBG_ENABLED(ETNA_DBG_NPU_NO_PARALLEL);

   for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
      /* NOTE(review): the reloc "offset" here encodes scheduling info in the
       * low bits of the config address (0x1f/0x1 for all-but-last jobs of a
       * multi-core split) rather than a byte offset — presumably consumed by
       * the hardware's job dispatcher; confirm against blob traces. */
      unsigned offset = parallel ? idx + 1 : 0;

      if (more_than_one_tp_job && (j < tp_core_count - 1))
            offset = parallel ? 0x1f : 0x1;

      etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
      etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
      etna_set_state(stream, VIVS_GL_TP_CONFIG, 0x0);
      etna_set_state_reloc(stream, VIVS_PS_TP_INST_ADDR, &(struct etna_reloc) {
         .bo = operation->configs[j],
         .flags = ETNA_RELOC_READ,
         .offset = offset,
      });
   }
   etna_set_state(stream, VIVS_PS_UNK10A4, parallel ? idx + 1 : 0x0);
}

View file

@ -0,0 +1,33 @@
/*
* Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
/* Include guard added: the header as committed had none, so a double
 * include would redeclare the prototypes. */
#ifndef H_ETNAVIV_ML_TP
#define H_ETNAVIV_ML_TP

#include "etnaviv_ml.h"

/* Lower a graph-input transpose to a TP TRANSPOSE operation; returns the
 * newly allocated output tensor's index through output_tensor. */
void
etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *first_operation,
                        struct etna_operation *operation,
                        unsigned *output_tensor);

/* Create a TP DETRANSPOSE operation restoring a convolution's output to
 * its original layout. */
void
etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
                          struct etna_operation *convolution,
                          struct etna_operation *operation);

/* Create a TP RESHUFFLE operation that destrides a strided convolution's
 * input; returns the new tensor's index through output_tensor. */
void
etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *first_operation,
                        struct etna_operation *operation,
                        unsigned *output_tensor);

/* Build the per-core hardware configuration buffers for a TP operation. */
void
etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
                             const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction);

/* Emit the command-stream state that launches the operation's TP jobs. */
void
etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx);

#endif /* H_ETNAVIV_ML_TP */

View file

@ -61,6 +61,8 @@ files_etnaviv = files(
'etnaviv_ml.h',
'etnaviv_ml_nn.c',
'etnaviv_ml_nn.h',
'etnaviv_ml_tp.c',
'etnaviv_ml_tp.h',
'etnaviv_nir_lower_source_mods.c',
'etnaviv_nir_lower_texture.c',
'etnaviv_nir_lower_ubo_to_uniform.c',