diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
index 0276a93048e..749bce62aad 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
@@ -892,10 +892,17 @@ void
 etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                              struct etna_vip_instruction *instruction)
 {
+   struct pipe_context *pctx = subgraph->base.context;
+   struct etna_context *ctx = etna_context(pctx);
+   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
    unsigned coef_cache_size;
 
    instruction->type = ETNA_JOB_TYPE_NN;
-   instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
+
+   if (nn_core_version == 7)
+      instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
+   else
+      instruction->coefficients = etna_ml_create_coeffs_v8(subgraph, operation, &coef_cache_size);
 
    struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
    assert(input);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
index 202bcc81332..2376723aa06 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
@@ -13,6 +13,9 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
 struct etna_bo *
 etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
 
+struct etna_bo *
+etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
+
 unsigned
 etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
new file mode 100644
index 00000000000..b4a623f2f3d
--- /dev/null
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * Copyright (c) 2024 Pengutronix, Philipp Zabel
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "etnaviv_context.h"
+#include "etnaviv_debug.h"
+#include "etnaviv_ml.h"
+#include "etnaviv_ml_nn.h"
+#include "etnaviv_screen.h"
+
+static void *
+map_resource(struct pipe_resource *resource)
+{
+   return etna_bo_map(etna_resource(resource)->bo);
+}
+
+#define FIELD(field, bits) uint32_t field : bits;
+
+struct etna_nn_header_v8 {
+   FIELD(precode, 1)
+   FIELD(bit16, 1)
+   FIELD(fp16, 1)
+   FIELD(reserved1, 1)
+   FIELD(version, 4)
+
+   uint8_t run_length_size;
+   uint8_t run_length_table[18];
+   uint32_t symbol_map;
+   uint16_t avg_bias;
+   uint16_t reserved2;
+   uint32_t stream_size[0];
+};
+
+struct bitstream {
+   unsigned bits_in_buffer;
+   uint64_t buffer;
+   uint32_t **map;
+   bool do_write;
+};
+
+static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *weights)
+{
+   unsigned input_channels;
+   int32_t input_zero_point = 128 - operation->input_zero_point;
+   int32_t correction = 0;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * operation->output_channels;
+   else
+      input_channels = operation->input_channels;
+
+   for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
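+      /* Each weight contributes (weight - weight_zero_point) *
+       * (128 - input_zero_point); fill_biases() folds the accumulated
+       * sum into the channel's bias. */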
+      correction += (weights[i] - operation->weight_zero_point) * input_zero_point;
+   }
+
+   return correction;
+}
+
+static void
+append_bits(uint32_t value, size_t size, struct bitstream *bitstream)
+{
+   assert(size >= 32 || value < 1u << size);
+   if (!size)
+      return;
+   bitstream->buffer |= (uint64_t)value << bitstream->bits_in_buffer;
+   bitstream->bits_in_buffer += size;
+   if (bitstream->bits_in_buffer >= 32) {
+      if (bitstream->do_write)
+         **bitstream->map = bitstream->buffer & 0xffffffff;
+      *bitstream->map += 1;
+      bitstream->buffer >>= 32;
+      bitstream->bits_in_buffer -= 32;
+   }
+}
+
+static void
+flush_bits(struct bitstream *bitstream)
+{
+   if (bitstream->bits_in_buffer > 0)
+      append_bits(0, 32 - bitstream->bits_in_buffer, bitstream);
+}
+
+struct wb_stream {
+   struct bitstream bitstream;
+   unsigned zero_point;
+   unsigned zrl_bits;
+   unsigned accum_zeroes;
+};
+
+static void
+wb_stream_flush_zeroes(struct wb_stream *wb_stream)
+{
+   struct bitstream *bitstream = &wb_stream->bitstream;
+
+   if (wb_stream->accum_zeroes == 0)
+      return;
+
+   append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, bitstream);
+   wb_stream->accum_zeroes = 0;
+   append_bits(wb_stream->zero_point, 8, bitstream);
+}
+
+static void
+wb_stream_write(struct wb_stream *wb_stream, unsigned value)
+{
+   struct bitstream *bitstream = &wb_stream->bitstream;
+   unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;
+
+   if (wb_stream->zrl_bits == 0) {
+      append_bits(value, 8, bitstream);
+      return;
+   }
+
+   if (wb_stream->accum_zeroes == max_zeroes) {
+      append_bits(max_zeroes, wb_stream->zrl_bits, bitstream);
+      wb_stream->accum_zeroes = 0;
+      append_bits(value, 8, bitstream);
+      return;
+   }
+
+   if (value == wb_stream->zero_point) {
+      wb_stream->accum_zeroes++;
+      return;
+   }
+
+   append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, bitstream);
+   wb_stream->accum_zeroes = 0;
+   append_bits(value, 8, bitstream);
+}
+
+/*
+ * The V8 architecture Huffman stream decoder uses a fixed code book with 8
+ * entries to determine bit lengths of variable length values later in the bit
+ * stream. The 2 to 5-bit long codes are stored in fixed length 3-bit (plus
+ * optional 2-bit) fields:
+ *
+ *  code    symbol
+ *  --------------
+ *  00_     0
+ *  10_     1
+ *  111     2
+ *  110     3
+ *  011     4
+ *  010 1_  5
+ *  010 01  6
+ *  010 00  7
+ *
+ * The free bit (_) is used for the sign, if available; otherwise the sign
+ * is stored with the variable length value later in the bitstream. In ZRL
+ * encoding mode, where larger values are stored verbatim, this may also be
+ * the lsb of the value instead. The decoder processes weights in pairs and
+ * is pipelined 3-deep:
+ *
+ * In each step, first two 3-bit codes are read, then up to two 2-bit codes
+ * that belong with (010) 3-bit codes from the previous step. The optional
+ * 2-bit codes from the previous step, together with the 3-bit codes from the
+ * step before that are used to decode two symbols that are mapped to two bit
+ * lengths for the two variable length values that are read next.
+ *
+ * Finally, the bit lengths, signs, and variable length values are used to
+ * calculate two weights.
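+ *
+ * Schematically, once weight pair n is complete, emit_pair() outputs the
+ * two 3-bit codes of pair n, then the pending 2-bit codes of pair n-1,
+ * then the variable length values of pair n-2, so the stream interleaves
+ * three consecutive pairs at any point.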
+ */
+
+struct code {
+   /* fixed 3-bit code */
+   uint8_t part0;
+   /* optional 2-bit code, iff part0 == 0b010 */
+   uint8_t part1;
+   /* variable length value */
+   uint8_t part2;
+   /* bit length determined from part0, part1, and symbol-to-bitlength map */
+   uint8_t part2_len;
+};
+
+struct encoder {
+   /* bit-length-to-huffman-symbol map */
+   uint8_t map[9];
+   /* ring buffer for 3 encoded weight pairs */
+   struct code code[6];
+   size_t bytes_read;
+   struct bitstream bitstream;
+   uint32_t *initial_ptr;
+   uint32_t *dest;
+   uint8_t accum_zeroes;
+   uint8_t avg_bias;
+   bool zrl;
+};
+
+/* Calculate a histogram of bit lengths. */
+static void histogram_accumulate(size_t histogram[9], uint8_t *bytes, size_t len, bool zrl)
+{
+   for (size_t i = 0; i < len; i++) {
+      uint8_t num_bits = 0;
+      if (bytes[i]) {
+         bool sign = bytes[i] >> 7;
+         uint8_t value = bytes[i];
+         if (sign) {
+            value -= zrl;
+            value ^= 0xff;
+         }
+         num_bits = util_logbase2(value) + 1;
+      }
+      assert(num_bits <= 8);
+      histogram[num_bits]++;
+   }
+}
+
+/*
+ * value can be an 8-bit raw value or a variable length value with prepended
+ * sign. num_bits is the number of bits in value, including the sign bit.
+ */
+static struct code huffman_code(uint8_t sym, uint8_t value, uint8_t num_bits)
+{
+   switch (sym) {
+   case 0:
+      return (struct code){ 0 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
+   case 1:
+      return (struct code){ 1 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
+   case 2:
+      return (struct code){ 7, 0, value, num_bits };
+   case 3:
+      return (struct code){ 3, 0, value, num_bits };
+   case 4:
+      return (struct code){ 6, 0, value, num_bits };
+   case 5:
+      return (struct code){ 2, 1 | ((value & 1) << 1), value >> 1, num_bits - 1 };
+   case 6:
+      return (struct code){ 2, 2, value, num_bits };
+   case 7:
+      return (struct code){ 2, 0, value, num_bits };
+   default:
+      return (struct code){};
+   }
+}
+
+static void emit_pair(struct encoder *encoder)
+{
+   struct bitstream *bitstream = &encoder->bitstream;
+   struct code *code = &encoder->code[(encoder->bytes_read - 2) % 6];
+
+   append_bits(code[0].part0, 3, bitstream);
+   append_bits(code[1].part0, 3, bitstream);
+   if (encoder->bytes_read > 2) {
+      code = &encoder->code[(encoder->bytes_read - 4) % 6];
+      append_bits(code[0].part1, code[0].part0 == 2 ? 2 : 0, bitstream);
+      append_bits(code[1].part1, code[1].part0 == 2 ? 2 : 0, bitstream);
+   }
+   if (encoder->bytes_read > 4) {
+      code = &encoder->code[(encoder->bytes_read - 6) % 6];
+      append_bits(code[0].part2, code[0].part2_len, bitstream);
+      append_bits(code[1].part2, code[1].part2_len, bitstream);
+   }
+}
+
+/* Encode a single byte. Emit into the bitstream when a pair is complete. */
+static void encode_byte(struct encoder *encoder, uint8_t byte)
+{
+   bool zrl = encoder->zrl;
+   bool sign = byte >> 7;
+   uint8_t value = byte;
+
+   if (sign) {
+      value -= zrl;
+      value ^= 0xff;
+   }
+
+   uint8_t msb = util_logbase2(value);
+   uint8_t num_bits = value ? (msb + 1) : 0;
+   value &= ~(1 << msb);
+   uint8_t sym = encoder->map[num_bits];
+   if (zrl && byte == 0) {
+      if (encoder->accum_zeroes <= 1) {
+         // this seems to be used for the non-repeated 0 at the beginning and end
+         sym = encoder->map[7];
+         num_bits = 8;
+      } else {
+         // FIXME - how to encode run length into the run length table?
+         num_bits = 1;
+      }
+   }
+   if (!zrl && num_bits == 0) {
+      num_bits = 1;
+   }
+   if (sym == 255 || (zrl && byte == 128)) {
+      // if there is no huffman code assigned to this bit length, or when
+      // encoding 0x80 in ZRL mode, dump the value into the bitstream verbatim.
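+      // (the symbol for 7-bit lengths doubles as this escape and is
+      // emitted with all 8 value bits, as in the zrl && num_bits == 7
+      // case below)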
+      sym = encoder->map[7];
+      value = byte;
+      num_bits = 8;
+   } else if (zrl && num_bits == 7) {
+      value = byte;
+      num_bits = 8;
+   } else {
+      value = (value << 1) | sign;
+   }
+   unsigned int i = encoder->bytes_read % 6;
+   encoder->code[i] = huffman_code(sym, value, num_bits);
+   encoder->bytes_read++;
+   if ((encoder->bytes_read & 1) == 0)
+      emit_pair(encoder);
+}
+
+static void
+encode_value(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, uint8_t value)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned customer_id = ctx->screen->info->customer_id;
+   uint8_t zero_point = operation->weight_zero_point;
+
+   value -= encoder->avg_bias;
+
+   if (customer_id == 0x99) {
+      if (encoder->zrl) {
+         if (encoder->avg_bias > 0) {
+            if (value == zero_point) {
+               encoder->accum_zeroes++;
+               return;
+            } else if (encoder->accum_zeroes) {
+               encode_byte(encoder, zero_point);
+               encoder->accum_zeroes = 0;
+            }
+         } else {
+            if (value == 0x0) {
+               encoder->accum_zeroes++;
+               return;
+            } else if (encoder->accum_zeroes) {
+               encode_byte(encoder, 0x80);
+               encoder->accum_zeroes = 0;
+            }
+         }
+      }
+
+      encode_byte(encoder, value);
+   } else {
+      if (encoder->zrl) {
+         if (value == zero_point) {
+            encoder->accum_zeroes++;
+            return;
+         } else if (encoder->accum_zeroes) {
+            encode_byte(encoder, 0x00);
+            encoder->accum_zeroes = 0;
+         }
+      }
+
+      encode_byte(encoder, value - zero_point);
+   }
+}
+
+static void encoder_init(struct encoder *encoder, uint8_t *map, uint32_t *initial_ptr)
+{
+   memset(encoder, 0, sizeof(*encoder));
+   encoder->initial_ptr = initial_ptr;
+   encoder->dest = initial_ptr;
+   encoder->bitstream.map = &encoder->dest;
+   encoder->bitstream.do_write = initial_ptr != NULL;
+
+   for (int i = 0; i < 9; i++)
+      encoder->map[i] = 255;
+
+   for (int i = 0; i < 8; i++) {
+      assert(map[i] < sizeof(encoder->map));
+      encoder->map[map[i]] = i;
+   }
+}
+
+static void encode_uint32(struct encoder *encoder, uint32_t value)
+{
+   encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
+   encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
+   encode_byte(encoder, ((value >> 16) & 0xff) - encoder->avg_bias);
+   encode_byte(encoder, ((value >> 24) & 0xff) - encoder->avg_bias);
+}
+
+static void encode_uint16(struct encoder *encoder, uint32_t value)
+{
+   encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
+   encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
+}
+
+/*
+ * Flush remaining weights stuck in the encoder ring buffer and all bits
+ * in the bitstream FIFO. Return the total number of bits written.
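+ * The returned count does not include the zero padding that is appended
+ * to align each core's stream to the next 64-byte boundary.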
+ */
+static size_t encoder_flush(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
+{
+   struct bitstream *bitstream = &encoder->bitstream;
+   size_t total_bits;
+   uint8_t flush_val = (encoder->bytes_read & 1) + 4;
+
+   struct code code;
+   if (encoder->bytes_read & 1)
+      encode_byte(encoder, 0x0);
+
+   code.part0 = (flush_val & 1) << 2;
+   code.part1 = 0x0;
+   code.part2 = 0x0;
+   code.part2_len = 0x0;
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   emit_pair(encoder);
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   encoder->code[encoder->bytes_read++ % 6] = code;
+   emit_pair(encoder);
+
+   total_bits = (*bitstream->map - encoder->initial_ptr) * 32 +
+                bitstream->bits_in_buffer;
+
+   int padding_bits = 0;
+   if (total_bits % (64 * 8) > 0)
+      padding_bits = (64 * 8) - total_bits % (64 * 8);
+
+   while (padding_bits > 0) {
+      unsigned bits = padding_bits >= 32 ? 32 : padding_bits;
+      append_bits(0, bits, bitstream);
+      padding_bits -= bits;
+   }
+
+   return total_bits;
+}
+
+static void map_swap(uint8_t *map, int a, int b)
+{
+   uint8_t tmp = map[a];
+
+   map[a] = map[b];
+   map[b] = tmp;
+}
+
+/*
+ * Sort the Huffman symbol to bit length map according to the histogram of bit
+ * lengths, so that more common bit lengths are represented by shorter codes.
+ * FIXME - doesn't take into account zrl mode properly.
+ */
+static void sort_map(uint8_t *map, size_t *histogram)
+{
+   const uint8_t network[19][2] = {
+      {0, 2}, {1, 3}, {4, 6}, {5, 7},
+      {0, 4}, {1, 5}, {2, 6}, {3, 7},
+      {0, 1}, {2, 3}, {4, 5}, {6, 7},
+      {2, 4}, {3, 5},
+      {1, 4}, {3, 6},
+      {1, 2}, {3, 4}, {5, 6},
+   };
+
+   for (int i = 0; i < 19; i++) {
+      int a = network[i][0];
+      int b = network[i][1];
+
+      if (histogram[map[a]] < histogram[map[b]])
+         map_swap(map, a, b);
+   }
+}
+
+static void encoder_reset(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
+{
+   encoder->initial_ptr = *encoder->bitstream.map;
+   encoder->dest = encoder->initial_ptr;
+   encoder->bitstream.map = &encoder->dest;
+
+   encoder->bitstream.buffer = 0;
+   encoder->bitstream.bits_in_buffer = 0;
+   encoder->bytes_read = 0;
+   memset(encoder->code, 0, sizeof(encoder->code));
+}
+
+static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, unsigned kernels_in_superblock, unsigned first_channel)
+{
+   struct pipe_context *pctx = subgraph->base.context;
+   struct etna_context *ctx = etna_context(pctx);
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned input_channels = operation->input_channels;
+   unsigned output_channels = operation->output_channels;
+   unsigned kernel_size;
+   uint8_t *weights = map_resource(operation->weight_tensor);
+   unsigned block_size;
+   unsigned blocks;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+
+   kernel_size = input_channels * operation->weight_height * operation->weight_width;
+
+   uint8_t (*weights_map)[kernel_size] = (void *)weights;
+
+   if (operation->depthwise)
+      block_size = MAX2(operation->weight_height * operation->weight_width, 9);
+   else
+      block_size = 9;
+
+   blocks = DIV_ROUND_UP(kernel_size, block_size);
+
+   for (unsigned block = 0; block < blocks; block++) {
+      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
+         unsigned oc;
+
+         if (operation->depthwise) {
+            oc = first_channel + kernel * nn_core_count;
+
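+            /* Depthwise kernels are interleaved across cores, so fold oc
+             * back into range when it points past the ragged tail that
+             * exists when output_channels is not a multiple of
+             * nn_core_count. */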
+            if (output_channels > 1 && oc >= (output_channels - output_channels % nn_core_count))
+               oc -= nn_core_count - output_channels % nn_core_count;
+         } else
+            oc = first_channel + kernel;
+
+         for (unsigned kernel_idx = 0; kernel_idx < block_size; kernel_idx++) {
+            uint8_t weight;
+
+            if (kernel_idx + block * block_size >= kernel_size)
+               weight = operation->weight_zero_point;
+            else
+               weight = weights_map[oc][kernel_idx + block * block_size];
+
+            encode_value(subgraph, operation, encoder, weight);
+         }
+
+         if (operation->depthwise && block_size % 9)
+            for (unsigned i = 0; i < 9 - block_size % 9; i++)
+               encode_value(subgraph, operation, encoder, operation->weight_zero_point);
+      }
+   }
+}
+
+static uint32_t pack_symbol_map(uint8_t map[8])
+{
+   uint32_t ret = 0;
+
+   for (int i = 0; i < 8; i++)
+      ret |= map[i] << (4 * i);
+
+   return ret;
+}
+
+static struct etna_bo *
+create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned input_channels = operation->input_channels;
+   unsigned output_channels = operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   size_t max_size;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+
+   unsigned header_size = 64;
+   unsigned body_size = ALIGN(DIV_ROUND_UP(output_channels, cores_used) * (input_channels * operation->weight_height * operation->weight_width + 4 + 4), 64) * 2;
+   unsigned tail_size = 64;
+   max_size = header_size + cores_used * body_size + tail_size;
+
+   return etna_bo_new(ctx->screen->dev, max_size, DRM_ETNA_GEM_CACHE_WC);
+}
+
+static void
+calculate_symbol_map(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *symbol_map)
+{
+   unsigned input_channels = operation->input_channels;
+   unsigned output_channels = operation->output_channels;
+   uint8_t *input = map_resource(operation->weight_tensor);
+   size_t histogram[9] = {};
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+
+   uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
+   unsigned kernel_size = operation->weight_width * operation->weight_height * input_channels;
+   for (unsigned oc = 0; oc < output_channels; oc++)
+      histogram_accumulate(histogram, (uint8_t *)weights_map[oc], kernel_size, false);
+
+   for (int i = 0; i < 8; i++)
+      symbol_map[i] = i;
+   sort_map(symbol_map, histogram);
+}
+
+static void
+fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, struct etna_nn_header_v8 *header)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned output_channels = operation->output_channels;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
+   unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
+
+   unsigned channel_per_superblock[superblocks];
+   for (unsigned superblock = 0; superblock < superblocks; superblock++)
+      channel_per_superblock[superblock] = superblock * full_superblock * cores_used;
+
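+   /* Each core decodes its own stream; output channels are distributed as
+    * evenly as possible, with any remainder going to the lower-numbered
+    * cores. */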
+   for (unsigned core = 0; core < cores_used; core++) {
+      unsigned kernels_per_core = output_channels / cores_used;
+      if (core < output_channels % cores_used)
+         kernels_per_core++;
+
+      encoder_reset(subgraph, operation, encoder);
+      encode_uint16(encoder, kernels_per_core);
+
+      for (unsigned superblock = 0; superblock < superblocks; superblock++) {
+         unsigned kernels_in_superblock = full_superblock;
+         if (superblock == superblocks - 1) {
+            unsigned remaining_channels = output_channels - cores_used * (superblocks - 1) * full_superblock;
+            kernels_in_superblock = remaining_channels / cores_used;
+            if (core < remaining_channels % cores_used)
+               kernels_in_superblock += 1;
+         }
+
+         unsigned first_channel;
+         if (operation->depthwise)
+            first_channel = cores_used - core - 1 + cores_used * full_superblock * superblock;
+         else
+            first_channel = channel_per_superblock[superblock];
+
+         encode_superblock(subgraph, operation, encoder, kernels_in_superblock, first_channel);
+
+         channel_per_superblock[superblock] += kernels_in_superblock;
+      }
+
+      unsigned actual_bits = encoder_flush(subgraph, operation, encoder);
+      header->stream_size[core] = actual_bits;
+   }
+}
+
+static uint32_t *
+fill_biases(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint32_t *map)
+{
+   uint8_t *input = map_resource(operation->weight_tensor);
+   uint32_t *biases = map_resource(operation->bias_tensor);
+   unsigned input_channels = operation->input_channels;
+   unsigned output_channels = operation->output_channels;
+
+   if (operation->depthwise)
+      input_channels = 1;
+   else if (operation->addition)
+      input_channels = 2 * output_channels;
+
+   uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
+   for (unsigned oc = 0; oc < output_channels; oc++) {
+      uint32_t corr = calculate_bias_correction(subgraph, operation, (uint8_t *)weights_map[oc]);
+
+      *map = biases[oc] + corr;
+      map++;
+   }
+
+   return map;
+}
+
+struct etna_bo *
+etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
+{
+   struct etna_bo *bo = create_bo(subgraph, operation);
+   uint32_t *map = etna_bo_map(bo);
+   struct etna_nn_header_v8 *header = (struct etna_nn_header_v8 *)map;
+   struct encoder encoder;
+   uint8_t symbol_map[8];
+
+   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
+   memset(header, 0, sizeof(*header));
+
+   calculate_symbol_map(subgraph, operation, symbol_map);
+   header->symbol_map = pack_symbol_map(symbol_map);
+   header->version = 1;
+
+   map += ALIGN(sizeof(*header), 64) / 4;
+
+   encoder_init(&encoder, symbol_map, map);
+
+   fill_weights(subgraph, operation, &encoder, header);
+   map = fill_biases(subgraph, operation, encoder.dest);
+
+   /* Size of the data that will go into the SRAM cache, header included */
+   *cache_size = (uint8_t *)map - (uint8_t *)etna_bo_map(bo);
+
+   etna_bo_cpu_fini(bo);
+
+   return bo;
+}
diff --git a/src/gallium/drivers/etnaviv/meson.build b/src/gallium/drivers/etnaviv/meson.build
index 1ba1ce859a3..e44eaef16ce 100644
--- a/src/gallium/drivers/etnaviv/meson.build
+++ b/src/gallium/drivers/etnaviv/meson.build
@@ -35,6 +35,7 @@ files_etnaviv = files(
   'etnaviv_ml.c',
   'etnaviv_ml.h',
   'etnaviv_ml_nn_v7.c',
+  'etnaviv_ml_nn_v8.c',
   'etnaviv_ml_nn.c',
   'etnaviv_ml_nn.h',
   'etnaviv_ml_tp.c',
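---

Note for reviewers: below is a minimal, self-contained sketch of the
LSB-first word packing that append_bits() above implements. It is
illustrative only and not part of the patch; the struct and function mirror
the driver's names, but main() and the test values are invented for this
example.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct bitstream {
   unsigned bits_in_buffer;
   uint64_t buffer;
   uint32_t *out; /* next 32-bit word to write */
};

/* Append size bits of value, least significant bit first. */
static void append_bits(struct bitstream *bs, uint32_t value, unsigned size)
{
   assert(size >= 32 || value < 1u << size);
   bs->buffer |= (uint64_t)value << bs->bits_in_buffer;
   bs->bits_in_buffer += size;
   if (bs->bits_in_buffer >= 32) {
      *bs->out++ = bs->buffer & 0xffffffff;
      bs->buffer >>= 32;
      bs->bits_in_buffer -= 32;
   }
}

int main(void)
{
   uint32_t words[2] = {0};
   struct bitstream bs = { .bits_in_buffer = 0, .buffer = 0, .out = words };

   append_bits(&bs, 0x2, 3);  /* 3-bit code 0b010 at bits 0-2  */
   append_bits(&bs, 0x2, 2);  /* 2-bit code 0b10  at bits 3-4  */
   append_bits(&bs, 0xab, 8); /* 8-bit value      at bits 5-12 */
   append_bits(&bs, 0, 32 - bs.bits_in_buffer); /* flush with zeros */

   printf("0x%08" PRIx32 "\n", words[0]); /* prints 0x00001572 */
   return 0;
}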