etnaviv/ml: Add encoding of coefficients for V8

In V8 the weights and biases of convolution operations are encoded with a totally different scheme. The initial reverse engineering and implementation was done by Philipp Zabel <p.zabel@pengutronix.de>. Support for zero run length encoding and average bias is not implemented yet. Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
2026-05-07 00:38:48 +02:00 · 2024-10-23 08:56:33 +02:00 · 2024-10-23 08:56:33 +02:00 · b4ba62fcda
commit b4ba62fcda
parent f3d765ed5d
4 changed files with 703 additions and 1 deletions
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
@ -892,10 +892,17 @@ void
 etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
 {
+   struct pipe_context *pctx = subgraph->base.context;
+   struct etna_context *ctx = etna_context(pctx);
+   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   unsigned coef_cache_size;

   instruction->type = ETNA_JOB_TYPE_NN;
-   instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
+
+   if (nn_core_version == 7)
+      instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
+   else
+      instruction->coefficients = etna_ml_create_coeffs_v8(subgraph, operation, &coef_cache_size);

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   assert(input);
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
@ -13,6 +13,9 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
 struct etna_bo *
 etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);

+struct etna_bo *
+etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
+
 unsigned
 etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);

--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * Copyright (c) 2024 Pengutronix, Philipp Zabel
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <time.h>
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "etnaviv_context.h"
+#include "etnaviv_debug.h"
+#include "etnaviv_ml.h"
+#include "etnaviv_ml_nn.h"
+#include "etnaviv_screen.h"
+
+/* Map the buffer object behind a resource and return the pointer to it. */
+static void *
+map_resource(struct pipe_resource *resource)
+{
+   struct etna_bo *bo = etna_resource(resource)->bo;
+
+   return etna_bo_map(bo);
+}
+
+#define FIELD(field, bits) uint32_t field : bits;
+
+/*
+ * Header placed at the very start of the V8 coefficient buffer.
+ *
+ * etna_ml_create_coeffs_v8() zeroes it and then fills in version and
+ * symbol_map; fill_weights() stores one stream_size entry per used core.
+ * All other fields are left at zero by this file.
+ */
+struct etna_nn_header_v8 {
+   FIELD(precode, 1)
+   FIELD(bit16, 1)
+   FIELD(fp16, 1)
+   FIELD(reserved1, 1)
+   FIELD(version, 4)
+
+   uint8_t run_length_size;
+   uint8_t run_length_table[18];
+   /* eight 4-bit entries, entry i in bits [4*i+3 : 4*i] (see pack_symbol_map) */
+   uint32_t symbol_map;
+   uint16_t avg_bias;
+   uint16_t reserved2;
+   /* per-core length of the encoded stream, in bits (flexible array member) */
+   uint32_t stream_size[];
+};
+
+/* Small FIFO that packs variable-width fields into 32-bit output words. */
+struct bitstream {
+   /* number of valid bits currently held in buffer */
+   unsigned bits_in_buffer;
+   /* pending bits; new fields are inserted above the bits already held */
+   uint64_t buffer;
+   /* points at the output cursor, which advances by one word per full word */
+   uint32_t **map;
+   /* when false the cursor still advances but no word is stored */
+   bool do_write;
+};
+
+/*
+ * Sum (weight - weight_zero_point) * (128 - input_zero_point) over all the
+ * weights of a single kernel. `weights` points at the first weight of that
+ * kernel; `subgraph` is not used.
+ */
+static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *weights)
+{
+   const int32_t zp_offset = 128 - operation->input_zero_point;
+   unsigned channels = operation->input_channels;
+   int32_t sum = 0;
+
+   if (operation->depthwise)
+      channels = 1;
+   else if (operation->addition)
+      channels = 2 * operation->output_channels;
+
+   const unsigned count = operation->weight_width * operation->weight_height * channels;
+
+   for (unsigned idx = 0; idx < count; idx++)
+      sum += (weights[idx] - operation->weight_zero_point) * zp_offset;
+
+   return sum;
+}
+
+/*
+ * Append the `size` low bits of `value` to the bitstream.
+ *
+ * Bits are queued above those already pending. As soon as 32 or more bits
+ * are pending, the low 32 are stored at the output cursor (only if do_write
+ * is set) and the cursor is advanced by one word either way, so a dry run
+ * can be used to measure the encoded size.
+ *
+ * `size` must be in the range 0..32 and `value` must fit in `size` bits.
+ */
+static void
+append_bits(uint32_t value, size_t size, struct bitstream *bitstream)
+{
+   /*
+    * Callers pad with size == 32, so the range check has to be done in
+    * 64 bits: shifting a 32-bit int by 31 or more is undefined.
+    */
+   assert(size <= 32);
+   assert((uint64_t)value < ((uint64_t)1 << size));
+   if (!size)
+      return;
+   bitstream->buffer |= (uint64_t)value << bitstream->bits_in_buffer;
+   bitstream->bits_in_buffer += size;
+   if (bitstream->bits_in_buffer >= 32) {
+      if (bitstream->do_write)
+         **bitstream->map = bitstream->buffer & 0xffffffff;
+      *bitstream->map += 1;
+      bitstream->buffer >>= 32;
+      bitstream->bits_in_buffer -= 32;
+   }
+}
+
+/* Zero-pad a partially filled word so that append_bits() completes it. */
+static void
+flush_bits(struct bitstream *bitstream)
+{
+   const unsigned pending = bitstream->bits_in_buffer;
+
+   if (pending == 0)
+      return;
+
+   append_bits(0, 32 - pending, bitstream);
+}
+
+/* Byte stream writer with an optional zero-run-length prefix per value. */
+struct wb_stream {
+   struct bitstream bitstream;
+   /* value that is counted as a "zero" and collapsed into runs */
+   unsigned zero_point;
+   /* width in bits of the run-length field; 0 stores plain 8-bit values */
+   unsigned zrl_bits;
+   /* zero-point values seen since the last value was written out */
+   unsigned accum_zeroes;
+};
+
+/*
+ * Write out a pending run of zero-point values, if there is one: the run
+ * length minus one, followed by the zero point itself as the 8-bit value.
+ */
+static void
+wb_stream_flush_zeroes(struct wb_stream *wb_stream)
+{
+   struct bitstream *out = &wb_stream->bitstream;
+   const unsigned pending = wb_stream->accum_zeroes;
+
+   if (pending > 0) {
+      wb_stream->accum_zeroes = 0;
+      append_bits(pending - 1, wb_stream->zrl_bits, out);
+      append_bits(wb_stream->zero_point, 8, out);
+   }
+}
+
+/*
+ * Write one value. With a run-length field configured, zero-point values
+ * are only counted; the count is emitted in front of the next value that
+ * either differs from the zero point or arrives when the counter is full.
+ */
+static void
+wb_stream_write(struct wb_stream *wb_stream, unsigned value)
+{
+   struct bitstream *out = &wb_stream->bitstream;
+   const unsigned run_bits = wb_stream->zrl_bits;
+
+   /* Without a run-length field every value is stored as a plain byte. */
+   if (run_bits == 0) {
+      append_bits(value, 8, out);
+      return;
+   }
+
+   const unsigned run_limit = (1 << run_bits) - 1;
+   const bool run_full = wb_stream->accum_zeroes == run_limit;
+
+   /* Keep extending the current run while the counter has room. */
+   if (!run_full && value == wb_stream->zero_point) {
+      wb_stream->accum_zeroes++;
+      return;
+   }
+
+   append_bits(wb_stream->accum_zeroes, run_bits, out);
+   wb_stream->accum_zeroes = 0;
+   append_bits(value, 8, out);
+}
+
+/*
+ * The V8 architecture Huffman stream decoder uses a fixed code book with 8
+ * entries to determine bit lengths of variable length values later in the bit
+ * stream. The 2 to 5-bit long codes are stored in fixed length 3-bit (plus
+ * optional 2-bit) fields:
+ *
+ *     code   symbol
+ *    --------------
+ *    00_       0
+ *    10_       1
+ *    111       2
+ *    110       3
+ *    011       4
+ *    010 1_    5
+ *    010 01    6
+ *    010 00    7
+ *
+ * The free bit (_) is used for the sign, if available, otherwise the sign
+ * is stored with the variable length value later in the bitstream. In ZRL
+ * encoding mode, where larger values are stored verbatim, this may also be
+ * the lsb of the value instead. The decoder processes weights in pairs and
+ * is pipelined 3-deep:
+ *
+ * In each step, first two 3-bit codes are read, then up to two 2-bit codes
+ * that belong with (010) 3-bit codes from the previous step. The optional
+ * 2-bit codes from the previous step, together with the 3-bit codes from the
+ * step before that are used to decode two symbols that are mapped to two bit
+ * lengths for the two variable length values that are read next.
+ *
+ * Finally, the bit lengths, signs, and variable length values are used to
+ * calculate two weights.
+ */
+
+/* One encoded weight, split into the pieces emitted at different pipeline steps. */
+struct code {
+   /* fixed 3-bit code */
+   uint8_t part0;
+   /* optional 2-bit code, iff part0 == 0b010 */
+   uint8_t part1;
+   /* variable length value */
+   uint8_t part2;
+   /* bit length determined from part0, part1, and symbol-to-bitlength map */
+   uint8_t part2_len;
+};
+
+/* State of the Huffman encoder for one coefficient stream. */
+struct encoder {
+   /* bit-length-to-huffman-symbol map; 255 marks a bit length without a symbol */
+   uint8_t map[9];
+   /* ring buffer for 3 encoded weight pairs */
+   struct code code[6];
+   /* number of bytes encoded so far; also selects the ring buffer slot */
+   size_t bytes_read;
+   struct bitstream bitstream;
+   /* where the current stream started, used to compute its length */
+   uint32_t *initial_ptr;
+   /* output cursor that bitstream.map points at */
+   uint32_t *dest;
+   /* zero values seen but not yet encoded (ZRL mode only) */
+   uint8_t accum_zeroes;
+   /* subtracted from every value before it is encoded */
+   uint8_t avg_bias;
+   /* zero run length mode */
+   bool zrl;
+};
+
+/*
+ * Calculate a histogram of bit lengths: for each of the `len` bytes, bump the
+ * bucket that matches the number of significant bits of its magnitude.
+ * A zero byte goes into bucket 0.
+ */
+static void histogram_accumulate(size_t histogram[9], uint8_t *bytes, size_t len, bool zrl)
+{
+   for (size_t pos = 0; pos < len; pos++) {
+      uint8_t magnitude = bytes[pos];
+      uint8_t width = 0;
+
+      if (magnitude != 0) {
+         /* Bytes with the top bit set: subtract zrl, then invert all bits. */
+         if (magnitude & 0x80) {
+            magnitude -= zrl;
+            magnitude ^= 0xff;
+         }
+         width = util_logbase2(magnitude) + 1;
+      }
+
+      assert(width <= 8);
+      histogram[width]++;
+   }
+}
+
+/*
+ * Build the code for one value and Huffman symbol.
+ *
+ * value can be 8-bit raw value or variable length value with prepended sign.
+ * num_bits is number of bits in value, including the sign bit.
+ *
+ * Symbols 0, 1 and 5 have a free bit in their fixed code: it takes the lsb
+ * of value, and only the remaining num_bits - 1 bits go into part2. All
+ * other symbols carry the complete value in part2. An unknown symbol yields
+ * an all-zero code.
+ */
+static struct code huffman_code(uint8_t sym, uint8_t value, uint8_t num_bits)
+{
+   const uint8_t low_bit = value & 1;
+   struct code code = { 0 };
+   bool has_free_bit = false;
+
+   switch (sym) {
+   case 0:
+      code.part0 = 0 | (low_bit << 2);
+      has_free_bit = true;
+      break;
+   case 1:
+      code.part0 = 1 | (low_bit << 2);
+      has_free_bit = true;
+      break;
+   case 2:
+      code.part0 = 7;
+      break;
+   case 3:
+      code.part0 = 3;
+      break;
+   case 4:
+      code.part0 = 6;
+      break;
+   case 5:
+      code.part0 = 2;
+      code.part1 = 1 | (low_bit << 1);
+      has_free_bit = true;
+      break;
+   case 6:
+      code.part0 = 2;
+      code.part1 = 2;
+      break;
+   case 7:
+      code.part0 = 2;
+      code.part1 = 0;
+      break;
+   default:
+      return code;
+   }
+
+   if (has_free_bit) {
+      code.part2 = value >> 1;
+      code.part2_len = num_bits - 1;
+   } else {
+      code.part2 = value;
+      code.part2_len = num_bits;
+   }
+
+   return code;
+}
+
+/*
+ * Emit one pipeline step for the pair that was just completed: its two
+ * 3-bit codes, then the optional 2-bit codes of the pair before it, then
+ * the variable length values of the pair before that.
+ */
+static void emit_pair(struct encoder *encoder)
+{
+   struct bitstream *out = &encoder->bitstream;
+   const size_t count = encoder->bytes_read;
+   const struct code *pair;
+
+   /* fixed codes of the newest pair */
+   pair = &encoder->code[(count - 2) % 6];
+   append_bits(pair[0].part0, 3, out);
+   append_bits(pair[1].part0, 3, out);
+
+   /* 2-bit extensions of the previous pair, present only after a 0b010 code */
+   if (count > 2) {
+      pair = &encoder->code[(count - 4) % 6];
+      append_bits(pair[0].part1, pair[0].part0 == 2 ? 2 : 0, out);
+      append_bits(pair[1].part1, pair[1].part0 == 2 ? 2 : 0, out);
+   }
+
+   /* variable length values of the oldest pair in the ring buffer */
+   if (count > 4) {
+      pair = &encoder->code[(count - 6) % 6];
+      append_bits(pair[0].part2, pair[0].part2_len, out);
+      append_bits(pair[1].part2, pair[1].part2_len, out);
+   }
+}
+
+/*
+ * Encode a single byte. Emit into the bitstream when a pair is complete.
+ *
+ * The byte is split into a sign (its top bit) and a magnitude, the bit
+ * length of the magnitude selects a Huffman symbol through encoder->map,
+ * and the resulting code is stored in the ring buffer slot bytes_read % 6.
+ * emit_pair() runs after every second byte.
+ */
+static void encode_byte(struct encoder *encoder, uint8_t byte)
+{
+   bool zrl = encoder->zrl;
+   bool sign = byte >> 7;
+   uint8_t value = byte;
+
+   /* For bytes with the top bit set: subtract zrl, then invert all bits. */
+   if (sign) {
+      value -= zrl;
+      value ^= 0xff;
+   }
+
+   /* NOTE(review): assumes util_logbase2() returns floor(log2(n)) and 0 for n == 0 - confirm */
+   uint8_t msb = util_logbase2(value);
+   uint8_t num_bits = value ? (msb + 1) : 0;
+   /* Drop the leading one bit; its position is implied by the bit length. */
+   value &= ~(1 << msb);
+   /* 255 means that no symbol is assigned to this bit length (see encoder_init) */
+   uint8_t sym = encoder->map[num_bits];
+   if (zrl && byte == 0) {
+      if (encoder->accum_zeroes <= 1) {
+         // this seems to be used for the non-repeated 0 at the beginning and end
+         sym = encoder->map[7];
+         num_bits = 8;
+      } else {
+         // FIXME - how to encode run length into the run length table?
+         num_bits = 1;
+      }
+   }
+   /* Outside of ZRL mode a zero magnitude is still stored with one bit. */
+   if (!zrl && num_bits == 0) {
+      num_bits = 1;
+   }
+   if (sym == 255 || (zrl && byte == 128)) {
+      // if there is no huffman code assigned to this bit length, or when
+      // encoding 0x80 in ZRL mode, dump the value into the bitstream verbatim.
+      sym = encoder->map[7];
+      value = byte;
+      num_bits = 8;
+   } else if (zrl && num_bits == 7) {
+      value = byte;
+      num_bits = 8;
+   } else {
+      /* Prepend the sign as the lsb; huffman_code() may move it into the code. */
+      value = (value << 1) | sign;
+   }
+   unsigned int i = encoder->bytes_read % 6;
+   encoder->code[i] = huffman_code(sym, value, num_bits);
+   encoder->bytes_read++;
+   /* Codes are emitted in pairs, so only every second byte triggers output. */
+   if ((encoder->bytes_read & 1) == 0)
+      emit_pair(encoder);
+}
+
+/*
+ * Encode one weight after subtracting the average bias.
+ *
+ * Devices with customer ID 0x99 get the value as is, all others get it
+ * relative to the weight zero point. In ZRL mode "zero" values are only
+ * counted; when a run ends, a single marker byte is encoded in front of the
+ * next value. Which value counts as zero and which marker is used depends
+ * on the customer ID and on whether an average bias is set.
+ */
+static void
+encode_value(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, uint8_t value)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   const unsigned customer_id = ctx->screen->info->customer_id;
+   const bool raw_values = customer_id == 0x99;
+   const uint8_t zero_point = operation->weight_zero_point;
+
+   value -= encoder->avg_bias;
+
+   if (encoder->zrl) {
+      uint8_t zero_value;
+      uint8_t run_marker;
+
+      if (!raw_values) {
+         zero_value = zero_point;
+         run_marker = 0x00;
+      } else if (encoder->avg_bias > 0) {
+         zero_value = zero_point;
+         run_marker = zero_point;
+      } else {
+         zero_value = 0x0;
+         run_marker = 0x80;
+      }
+
+      if (value == zero_value) {
+         encoder->accum_zeroes++;
+         return;
+      }
+
+      if (encoder->accum_zeroes) {
+         encode_byte(encoder, run_marker);
+         encoder->accum_zeroes = 0;
+      }
+   }
+
+   encode_byte(encoder, raw_values ? value : value - zero_point);
+}
+
+/*
+ * Reset the whole encoder and point it at initial_ptr. With a NULL
+ * initial_ptr nothing is stored, the output is only measured.
+ *
+ * `map` holds the bit length for each of the 8 Huffman symbols; it is
+ * inverted here into a bit-length-to-symbol table in which 255 marks bit
+ * lengths that have no symbol.
+ */
+static void encoder_init(struct encoder *encoder, uint8_t *map, uint32_t *initial_ptr)
+{
+   memset(encoder, 0, sizeof(*encoder));
+
+   encoder->initial_ptr = initial_ptr;
+   encoder->dest = initial_ptr;
+   encoder->bitstream.map = &encoder->dest;
+   encoder->bitstream.do_write = initial_ptr != NULL;
+
+   memset(encoder->map, 255, sizeof(encoder->map));
+
+   for (int sym = 0; sym < 8; sym++) {
+      const uint8_t bit_length = map[sym];
+
+      assert(bit_length < sizeof(encoder->map));
+      encoder->map[bit_length] = sym;
+   }
+}
+
+/* Encode the four bytes of value, lowest byte first, each minus avg_bias. */
+static void encode_uint32(struct encoder *encoder, uint32_t value)
+{
+   for (unsigned shift = 0; shift < 32; shift += 8)
+      encode_byte(encoder, ((value >> shift) & 0xff) - encoder->avg_bias);
+}
+
+/* Encode the two low bytes of value, lowest byte first, each minus avg_bias. */
+static void encode_uint16(struct encoder *encoder, uint32_t value)
+{
+   for (unsigned shift = 0; shift < 16; shift += 8)
+      encode_byte(encoder, ((value >> shift) & 0xff) - encoder->avg_bias);
+}
+
+/*
+ * Flush remaining weights stuck in the encoder ring buffer and all bits
+ * in the bitstream FIFO. Return the number of bits written before the
+ * stream is zero-padded to a multiple of 64 bytes; the padding itself is
+ * not included in the returned count.
+ */
+static size_t encoder_flush(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
+{
+   struct bitstream *bitstream = &encoder->bitstream;
+   size_t total_bits;
+   /* Sampled before the padding byte below: bit 0 records an odd byte count. */
+   uint8_t flush_val = (encoder->bytes_read & 1) + 4;
+
+   struct code code;
+   /* Complete a half-filled pair with a zero byte. */
+   if (encoder->bytes_read & 1)
+      encode_byte(encoder, 0x0);
+
+   /* Dummy code: part0 is 4 after an odd number of bytes, 0 otherwise. */
+   code.part0 = (flush_val & 1) << 2;
+   code.part1 = 0x0;
+   code.part2 = 0x0;
+   code.part2_len = 0x0;
+   /*
+    * Two dummy pairs push the part1/part2 fields of the last two real
+    * pairs through the 3-deep pipeline in emit_pair().
+    */
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   emit_pair(encoder);
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   emit_pair(encoder);
+
+   /* Whole words already passed to the output plus the bits still pending. */
+   total_bits = (*bitstream->map - encoder->initial_ptr) * 32 +
+                bitstream->bits_in_buffer;
+
+   /* Zero-pad up to the next multiple of 64 bytes (512 bits). */
+   int padding_bits = 0;
+   if (total_bits % (64 * 8) > 0)
+      padding_bits = (64 * 8) - total_bits % (64 * 8);
+
+   while (padding_bits > 0) {
+      unsigned bits = padding_bits >= 32 ? 32 : padding_bits;
+      append_bits(0, bits, bitstream);
+      padding_bits -= bits;
+   }
+
+   return total_bits;
+}
+
+/* Exchange the entries at indices a and b of map. */
+static void map_swap(uint8_t *map, int a, int b)
+{
+   const uint8_t first = map[a];
+   const uint8_t second = map[b];
+
+   map[a] = second;
+   map[b] = first;
+}
+
+/*
+ * Sort the Huffman symbol to bit length map according to the histogram of bit
+ * lengths, so that more common bit lengths are represented by shorter codes.
+ * The 8 entries are ordered by descending histogram count with a fixed
+ * sequence of 19 compare-and-swap steps.
+ * FIXME - doesn't take into account zrl mode properly.
+ */
+static void sort_map(uint8_t *map, size_t *histogram)
+{
+   static const uint8_t comparators[][2] = {
+      {0, 2}, {1, 3}, {4, 6}, {5, 7},
+      {0, 4}, {1, 5}, {2, 6}, {3, 7},
+      {0, 1}, {2, 3}, {4, 5}, {6, 7},
+      {2, 4}, {3, 5},
+      {1, 4}, {3, 6},
+      {1, 2}, {3, 4}, {5, 6},
+   };
+   const size_t steps = sizeof(comparators) / sizeof(comparators[0]);
+
+   for (size_t step = 0; step < steps; step++) {
+      const uint8_t lo = comparators[step][0];
+      const uint8_t hi = comparators[step][1];
+
+      /* The more frequent bit length moves to the lower index. */
+      if (histogram[map[hi]] > histogram[map[lo]])
+         map_swap(map, lo, hi);
+   }
+}
+
+/*
+ * Start a new stream at the current output position: the bit FIFO, the byte
+ * counter and the code ring buffer are cleared, while the symbol map and the
+ * zero run / average bias settings are kept.
+ */
+static void encoder_reset(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
+{
+   uint32_t *start = *encoder->bitstream.map;
+
+   encoder->initial_ptr = start;
+   encoder->dest = start;
+   encoder->bitstream.map = &encoder->dest;
+
+   memset(encoder->code, 0, sizeof(encoder->code));
+   encoder->bytes_read = 0;
+   encoder->bitstream.bits_in_buffer = 0;
+   encoder->bitstream.buffer = 0;
+}
+
+/*
+ * Encode the weights of `kernels_in_superblock` kernels, starting with output
+ * channel `first_channel`.
+ *
+ * The weights of each kernel are cut into blocks of block_size values. The
+ * outer loop runs over the blocks and the inner one over the kernels, so the
+ * output holds block 0 of every kernel, then block 1 of every kernel, etc.
+ * Positions past the end of a kernel are filled with the weight zero point.
+ */
+static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, unsigned kernels_in_superblock, unsigned first_channel)
+{
+   struct pipe_context *pctx = subgraph->base.context;
+   struct etna_context *ctx = etna_context(pctx);
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned input_channels = operation->input_channels;
+   unsigned output_channels = operation->output_channels;
+   unsigned kernel_size;
+   uint8_t *weights = map_resource(operation->weight_tensor);
+   unsigned block_size;
+   unsigned blocks;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+
+   kernel_size = input_channels * operation->weight_height * operation->weight_width;
+
+   /* View the weight tensor as one row of kernel_size bytes per output channel. */
+   uint8_t (*weights_map)[kernel_size] = (void *)weights;
+
+   /* Blocks hold 9 values, or a whole kernel plane for larger depthwise kernels. */
+   if (operation->depthwise)
+      block_size = MAX2(operation->weight_height * operation->weight_width, 9);
+   else
+      block_size = 9;
+
+   blocks = DIV_ROUND_UP(kernel_size, block_size);
+
+   for (unsigned block = 0; block < blocks; block++) {
+      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
+         unsigned oc;
+
+         if (operation->depthwise) {
+            /* Depthwise kernels of one stream are nn_core_count channels apart. */
+            oc = first_channel + kernel * nn_core_count;
+
+            /* Channels in the last, partial group of cores are shifted down. */
+            if (output_channels > 1 && oc >= (output_channels - output_channels % nn_core_count))
+               oc -= nn_core_count - output_channels % nn_core_count;
+         } else
+            oc = first_channel + kernel;
+
+         for (unsigned kernel_idx = 0; kernel_idx < block_size; kernel_idx++) {
+            uint8_t weight;
+
+            /* Pad the last block of a kernel with the zero point. */
+            if (kernel_idx + block * block_size >= kernel_size)
+               weight = operation->weight_zero_point;
+            else
+               weight = weights_map[oc][kernel_idx + block * block_size];
+
+            encode_value(subgraph, operation, encoder, weight);
+         }
+
+         /* Depthwise blocks are additionally padded to a multiple of 9 values. */
+         if (operation->depthwise && block_size % 9)
+            for (unsigned i = 0; i < 9 - block_size % 9; i++)
+               encode_value(subgraph, operation, encoder, operation->weight_zero_point);
+      }
+   }
+}
+
+/*
+ * Pack the 8 entries of the symbol map into one word, 4 bits per entry,
+ * with entry 0 in the lowest nibble.
+ *
+ * Entries are expected to fit in 4 bits. The shift is done on an unsigned
+ * 32-bit value so that a set top bit in the last nibble cannot overflow a
+ * signed int.
+ */
+static uint32_t pack_symbol_map(const uint8_t map[8])
+{
+   uint32_t ret = 0;
+
+   for (int i = 0; i < 8; i++)
+      ret |= (uint32_t)map[i] << (4 * i);
+
+   return ret;
+}
+
+/*
+ * Allocate the buffer object for the coefficients. The size is an upper
+ * bound made of a 64-byte header, one body per used core and a 64-byte tail.
+ * NOTE(review): the body estimate is doubled, presumably as headroom for
+ * the encoded size - confirm.
+ */
+static struct etna_bo *
+create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   const unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   const unsigned output_channels = operation->output_channels;
+   const unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned input_channels;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+   else
+      input_channels = operation->input_channels;
+
+   /* the weights of one kernel plus 4 + 4 extra bytes */
+   const unsigned kernel_bytes = input_channels * operation->weight_height * operation->weight_width + 4 + 4;
+   const unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
+
+   const unsigned header_size = 64;
+   const unsigned body_size = ALIGN(kernels_per_core * kernel_bytes, 64) * 2;
+   const unsigned tail_size = 64;
+   const size_t max_size = header_size + cores_used * body_size + tail_size;
+
+   return etna_bo_new(ctx->screen->dev, max_size, DRM_ETNA_GEM_CACHE_WC);
+}
+
+/*
+ * Fill symbol_map (8 entries) with the bit lengths 0..7, ordered so that
+ * the lengths that occur most often in the weight tensor come first.
+ */
+static void
+calculate_symbol_map(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *symbol_map)
+{
+   const unsigned output_channels = operation->output_channels;
+   uint8_t *weights = map_resource(operation->weight_tensor);
+   size_t histogram[9] = { 0 };
+   unsigned input_channels;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+   else
+      input_channels = operation->input_channels;
+
+   const unsigned kernel_size = operation->weight_width * operation->weight_height * input_channels;
+
+   /* The kernels are stored back to back, one per output channel. */
+   for (unsigned oc = 0; oc < output_channels; oc++)
+      histogram_accumulate(histogram, weights + (size_t)oc * kernel_size, kernel_size, false);
+
+   for (int sym = 0; sym < 8; sym++)
+      symbol_map[sym] = sym;
+
+   sort_map(symbol_map, histogram);
+}
+
+/*
+ * Encode one coefficient stream per used core and record the length of each
+ * stream, in bits, in header->stream_size[core].
+ *
+ * Every stream starts with the number of kernels handled by that core as a
+ * 16-bit value, followed by the weights of each superblock in turn.
+ */
+static void
+fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, struct etna_nn_header_v8 *header)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned output_channels = operation->output_channels;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   /* NOTE(review): the return value of etna_ml_calculate_tiling() is used as the superblock count - confirm */
+   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
+   /* kernels per core in every superblock but the last */
+   unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
+
+   /* Next output channel to hand out in each superblock (non-depthwise only). */
+   unsigned channel_per_superblock[superblocks];
+   for (unsigned superblock = 0; superblock < superblocks; superblock++)
+      channel_per_superblock[superblock] = superblock * full_superblock * cores_used;
+
+   for (unsigned core = 0; core < cores_used; core++) {
+      /* Spread the kernels evenly; the first cores take the remainder. */
+      unsigned kernels_per_core = output_channels / cores_used;
+      if (core < output_channels % cores_used)
+         kernels_per_core++;
+
+      encoder_reset(subgraph, operation, encoder);
+      encode_uint16(encoder, kernels_per_core);
+
+      for (unsigned superblock = 0; superblock < superblocks; superblock++) {
+
+         /* The last superblock takes whatever channels are left over. */
+         unsigned kernels_in_superblock = full_superblock;
+         if (superblock == superblocks - 1) {
+            unsigned remaining_channels = output_channels - cores_used * (superblocks - 1) * full_superblock;
+            kernels_in_superblock = remaining_channels / cores_used;
+            if (core < remaining_channels % cores_used)
+               kernels_in_superblock += 1;
+         }
+
+         /* Depthwise streams start from the cores in reverse order. */
+         unsigned first_channel;
+         if (operation->depthwise)
+            first_channel = cores_used - core - 1 + cores_used * full_superblock * superblock;
+         else
+            first_channel = channel_per_superblock[superblock];
+
+         encode_superblock(subgraph, operation, encoder, kernels_in_superblock, first_channel);
+
+         channel_per_superblock[superblock] += kernels_in_superblock;
+      }
+
+      /* Bit count of this stream, without the padding added by the flush. */
+      unsigned actual_bits = encoder_flush(subgraph, operation, encoder);
+      header->stream_size[core] = actual_bits;
+   }
+}
+
+/*
+ * Store one 32-bit bias per output channel at map: the bias from the bias
+ * tensor plus the correction computed from that channel's weights.
+ * Returns the position right after the last bias written.
+ */
+static uint32_t *
+fill_biases(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint32_t *map)
+{
+   uint8_t *weights = map_resource(operation->weight_tensor);
+   uint32_t *biases = map_resource(operation->bias_tensor);
+   const unsigned output_channels = operation->output_channels;
+   unsigned input_channels;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+   else
+      input_channels = operation->input_channels;
+
+   /* The kernels are stored back to back, one per output channel. */
+   const size_t kernel_size = (size_t)input_channels * operation->weight_height * operation->weight_width;
+
+   for (unsigned oc = 0; oc < output_channels; oc++) {
+      uint8_t *kernel = weights + oc * kernel_size;
+      uint32_t corr = calculate_bias_correction(subgraph, operation, kernel);
+
+      map[oc] = biases[oc] + corr;
+   }
+
+   return map + output_channels;
+}
+
+/*
+ * Create the buffer object with the coefficients of an operation in the V8
+ * encoding: the header, then the encoded weight streams starting at the
+ * next 64-byte boundary, then the biases.
+ *
+ * The number of bytes from the start of the buffer to the end of the biases
+ * is stored in *cache_size.
+ */
+struct etna_bo *
+etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
+{
+   uint8_t symbol_map[8];
+   struct encoder encoder;
+   struct etna_bo *bo = create_bo(subgraph, operation);
+   uint32_t *cursor = etna_bo_map(bo);
+   struct etna_nn_header_v8 *header = (struct etna_nn_header_v8 *)cursor;
+
+   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
+
+   memset(header, 0, sizeof(*header));
+   calculate_symbol_map(subgraph, operation, symbol_map);
+   header->symbol_map = pack_symbol_map(symbol_map);
+   header->version = 1;
+
+   /* The streams start at the first 64-byte boundary after the header. */
+   cursor += ALIGN(sizeof(*header), 64) / 4;
+   encoder_init(&encoder, symbol_map, cursor);
+   fill_weights(subgraph, operation, &encoder, header);
+
+   /* The biases follow directly after the last stream. */
+   uint32_t *end = fill_biases(subgraph, operation, encoder.dest);
+
+   /* Size of the data that will go into the SRAM cache, header included */
+   *cache_size = (uint8_t *)end - (uint8_t *)etna_bo_map(bo);
+
+   etna_bo_cpu_fini(bo);
+
+   return bo;
+}
--- a/src/gallium/drivers/etnaviv/meson.build
+++ b/src/gallium/drivers/etnaviv/meson.build
@ -35,6 +35,7 @@ files_etnaviv = files(
  'etnaviv_ml.c',
  'etnaviv_ml.h',
  'etnaviv_ml_nn_v7.c',
+  'etnaviv_ml_nn_v8.c',
  'etnaviv_ml_nn.c',
  'etnaviv_ml_nn.h',
  'etnaviv_ml_tp.c',