etnaviv/ml: Add encoding of coefficients for V8

In V8 the weights and biases of convolution operations are encoded with
a totally different scheme.

The initial reverse engineering and implementation was done by:

Philipp Zabel <p.zabel@pengutronix.de>

Support for zero run length encoding and average bias is not implemented yet.

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
This commit is contained in:
Tomeu Vizoso 2024-10-23 08:56:33 +02:00 committed by Marge Bot
parent f3d765ed5d
commit b4ba62fcda
4 changed files with 703 additions and 1 deletions

View file

@ -892,10 +892,17 @@ void
etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
struct etna_vip_instruction *instruction)
{
struct pipe_context *pctx = subgraph->base.context;
struct etna_context *ctx = etna_context(pctx);
unsigned nn_core_version = ctx->screen->specs.nn_core_version;
unsigned coef_cache_size;
instruction->type = ETNA_JOB_TYPE_NN;
instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
if (nn_core_version == 7)
instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
else
instruction->coefficients = etna_ml_create_coeffs_v8(subgraph, operation, &coef_cache_size);
struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
assert(input);

View file

@ -13,6 +13,9 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
struct etna_bo *
etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
/*
 * Build the coefficient buffer (header, encoded weights, biases) for one NN
 * operation using the V8 encoding. On return *cache_size holds the number of
 * bytes from the start of the buffer to the end of the biases.
 */
struct etna_bo *
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
unsigned
etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);

View file

@ -0,0 +1,691 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* Copyright (c) 2024 Pengutronix, Philipp Zabel
* SPDX-License-Identifier: MIT
*/
#include <time.h>
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_ml.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_screen.h"
/* Return the CPU mapping of the buffer object that backs a pipe resource. */
static void *
map_resource(struct pipe_resource *resource)
{
   struct etna_bo *bo = etna_resource(resource)->bo;

   return etna_bo_map(bo);
}
/* Declare one bitfield member of the given width inside a 32-bit unit. */
#define FIELD(field, bits) uint32_t field : bits;

/*
 * Header found at the very start of a V8 coefficient buffer.
 *
 * In this file only `version`, `symbol_map` and `stream_size[]` are written
 * with non-zero values; everything else is left zeroed by the initial memset.
 */
struct etna_nn_header_v8 {
   FIELD(precode, 1)
   FIELD(bit16, 1)
   FIELD(fp16, 1)
   FIELD(reserved1, 1)
   FIELD(version, 4)

   /* Run-length configuration; not filled in yet (ZRL is unimplemented). */
   uint8_t run_length_size;
   uint8_t run_length_table[18];

   /* Eight 4-bit entries, entry i in bits [4*i+3 : 4*i] (see pack_symbol_map). */
   uint32_t symbol_map;

   /* Not filled in yet (average bias is unimplemented). */
   uint16_t avg_bias;
   uint16_t reserved2;

   /*
    * One entry per core in use: number of bits in that core's encoded
    * stream. Declared as a C99 flexible array member rather than the GNU
    * zero-length array extension; sizeof() of the struct is unaffected.
    */
   uint32_t stream_size[];
};
/*
 * LSB-first bit writer that emits whole 32-bit words.
 */
struct bitstream {
/* Number of valid bits currently pending in `buffer`. */
unsigned bits_in_buffer;
/* Pending bits; new bits are OR-ed in above the pending ones. */
uint64_t buffer;
/* Points at the output word pointer, which advances by one per emitted word. */
uint32_t **map;
/* When false, the output pointer still advances but nothing is stored. */
bool do_write;
};
/*
 * Compute the bias correction term for one output channel: the sum over all
 * weights of the kernel of (weight - weight_zero_point) multiplied by
 * (128 - input_zero_point).
 *
 * `weights` points at the first weight of the kernel. The number of weights
 * is weight_width * weight_height * channels, where channels is 1 for
 * depthwise operations, 2 * output_channels for additions and
 * input_channels otherwise.
 */
static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *weights)
{
   const int32_t zp_offset = 128 - operation->input_zero_point;
   unsigned channels = operation->input_channels;
   int32_t sum = 0;

   if (operation->depthwise)
      channels = 1;
   else if (operation->addition)
      channels = 2 * operation->output_channels;

   const unsigned count = operation->weight_width * operation->weight_height * channels;

   for (unsigned idx = 0; idx < count; idx++)
      sum += (weights[idx] - operation->weight_zero_point) * zp_offset;

   return sum;
}
/*
 * Append the low `size` bits of `value` to the bitstream, LSB first.
 *
 * `size` may be anything from 0 to 32. Whenever at least 32 bits are pending,
 * one 32-bit word is emitted: it is stored only if do_write is set, but the
 * output pointer advances in either case so that dry runs can measure the
 * encoded size.
 */
static void
append_bits(uint32_t value, size_t size, struct bitstream *bitstream)
{
   /*
    * Range-check in 64 bits: callers pass size == 32 when padding, and
    * `1 << size` on a 32-bit int is undefined for size >= 31.
    */
   assert(size <= 32);
   assert((uint64_t)value < ((uint64_t)1 << size));

   if (!size)
      return;

   bitstream->buffer |= (uint64_t)value << bitstream->bits_in_buffer;
   bitstream->bits_in_buffer += size;

   /* At most 31 bits were pending on entry, so one word is enough. */
   if (bitstream->bits_in_buffer >= 32) {
      if (bitstream->do_write)
         **bitstream->map = bitstream->buffer & 0xffffffff;
      *bitstream->map += 1;
      bitstream->buffer >>= 32;
      bitstream->bits_in_buffer -= 32;
   }
}
/* Zero-pad the pending bits up to a full 32-bit word so that it gets emitted. */
static void
flush_bits(struct bitstream *bitstream)
{
   const unsigned pending = bitstream->bits_in_buffer;

   if (pending == 0)
      return;

   append_bits(0, 32 - pending, bitstream);
}
/*
 * Bitstream wrapper that writes 8-bit values, optionally prefixed by a
 * zero-run-length field.
 */
struct wb_stream {
struct bitstream bitstream;
/* Value that counts as "zero" for run-length purposes. */
unsigned zero_point;
/* Width in bits of the run-length field; 0 disables run-length encoding. */
unsigned zrl_bits;
/* Number of zero-point values seen but not yet written. */
unsigned accum_zeroes;
};
/*
 * Write out any pending run of zero-point values: the run length minus one,
 * followed by one explicit zero-point value. Does nothing if no zeroes are
 * pending.
 */
static void
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
{
   struct bitstream *out = &wb_stream->bitstream;
   const unsigned run = wb_stream->accum_zeroes;

   if (!run)
      return;

   append_bits(run - 1, wb_stream->zrl_bits, out);
   wb_stream->accum_zeroes = 0;
   append_bits(wb_stream->zero_point, 8, out);
}
/*
 * Write one 8-bit value. Without run-length encoding (zrl_bits == 0) the
 * value is written verbatim. Otherwise zero-point values are accumulated
 * until either a different value arrives or the run-length field is
 * saturated; then the run length is written followed by the value.
 */
static void
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
{
   struct bitstream *out = &wb_stream->bitstream;
   const unsigned run_bits = wb_stream->zrl_bits;

   if (run_bits == 0) {
      append_bits(value, 8, out);
      return;
   }

   const unsigned max_run = (1 << run_bits) - 1;
   const bool saturated = wb_stream->accum_zeroes == max_run;

   /* Keep accumulating as long as the run-length field still has room. */
   if (!saturated && value == wb_stream->zero_point) {
      wb_stream->accum_zeroes++;
      return;
   }

   append_bits(wb_stream->accum_zeroes, run_bits, out);
   wb_stream->accum_zeroes = 0;
   append_bits(value, 8, out);
}
/*
* The V8 architecture Huffman stream decoder uses a fixed code book with 8
* entries to determine bit lengths of variable length values later in the bit
* stream. The 2 to 5-bit long codes are stored in fixed length 3-bit (plus
* optional 2-bit) fields:
*
* code symbol
* --------------
* 00_ 0
* 10_ 1
* 111 2
* 110 3
* 011 4
* 010 1_ 5
* 010 01 6
* 010 00 7
*
* The free bit (_) is used for the sign, if available, otherwise the sign
* is stored with the variable length value later in the bitstream. In ZRL
* encoding mode, where larger values are stored verbatim, this may also be
* the lsb of the value instead. The decoder processes weights in pairs and
* is pipelined 3-deep:
*
* In each step, first two 3-bit codes are read, then up to two 2-bit codes
* that belong with (010) 3-bit codes from the previous step. The optional
* 2-bit codes from the previous step, together with the 3-bit codes from the
* step before that are used to decode two symbols that are mapped to two bit
* lengths for the two variable length values that are read next.
*
* Finally, the bit lengths, signs, and variable length values are used to
* calculate two weights.
*/
/*
 * One encoded weight, split into the three parts that emit_pair() writes at
 * different pipeline stages.
 */
struct code {
/* fixed 3-bit code */
uint8_t part0;
/* optional 2-bit code, iff part0 == 0b010 */
uint8_t part1;
/* variable length value */
uint8_t part2;
/* bit length determined from part0, part1, and symbol-to-bitlength map */
uint8_t part2_len;
};
/*
 * State of the Huffman encoder for one coefficient stream.
 */
struct encoder {
/* bit-length-to-huffman-symbol map; 255 marks a bit length without symbol */
uint8_t map[9];
/* ring buffer for 3 encoded weight pairs */
struct code code[6];
/* number of bytes encoded so far; also selects the ring buffer slot */
size_t bytes_read;
struct bitstream bitstream;
/* start of the current stream, used to compute the number of bits written */
uint32_t *initial_ptr;
/* current output position; bitstream.map points at this member */
uint32_t *dest;
/* zero values skipped by encode_value() and not yet emitted (ZRL mode) */
uint8_t accum_zeroes;
/* subtracted from every value before encoding; stays 0 in this file */
uint8_t avg_bias;
/* enables the zero-run-length paths; stays false in this file */
bool zrl;
};
/*
 * Calculate a histogram of bit lengths.
 *
 * For each of the `len` input bytes, the number of bits needed for its
 * magnitude is determined (0 for a zero byte; bytes with the top bit set are
 * complemented first, after subtracting one in ZRL mode) and the matching
 * histogram bin is incremented.
 */
static void histogram_accumulate(size_t histogram[9], uint8_t *bytes, size_t len, bool zrl)
{
   for (const uint8_t *cur = bytes, *end = bytes + len; cur != end; cur++) {
      uint8_t magnitude = *cur;
      uint8_t num_bits = 0;

      if (magnitude != 0) {
         if (magnitude & 0x80) {
            magnitude -= zrl;
            magnitude ^= 0xff;
         }
         num_bits = util_logbase2(magnitude) + 1;
      }

      assert(num_bits <= 8);
      histogram[num_bits]++;
   }
}
/*
* value can be 8-bit raw value or variable length value with prepended sign.
* num_bits is number of bits in value, including the sign bit.
*/
static struct code huffman_code(uint8_t sym, uint8_t value, uint8_t num_bits)
{
switch (sym) {
case 0:
return (struct code){ 0 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
case 1:
return (struct code){ 1 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
case 2:
return (struct code){ 7, 0, value, num_bits};
case 3:
return (struct code){ 3, 0, value, num_bits};
case 4:
return (struct code){ 6, 0, value, num_bits};
case 5:
return (struct code){ 2, 1 | ((value & 1) << 1), value >> 1, num_bits - 1 };
case 6:
return (struct code){ 2, 2, value, num_bits};
case 7:
return (struct code){ 2, 0, value, num_bits};
default:
return (struct code){};
}
}
static void emit_pair(struct encoder *encoder)
{
struct bitstream *bitstream = &encoder->bitstream;
struct code *code = &encoder->code[(encoder->bytes_read - 2) % 6];
append_bits(code[0].part0, 3, bitstream);
append_bits(code[1].part0, 3, bitstream);
if (encoder->bytes_read > 2) {
code = &encoder->code[(encoder->bytes_read - 4) % 6];
append_bits(code[0].part1, code[0].part0 == 2 ? 2 : 0, bitstream);
append_bits(code[1].part1, code[1].part0 == 2 ? 2 : 0, bitstream);
}
if (encoder->bytes_read > 4) {
code = &encoder->code[(encoder->bytes_read - 6) % 6];
append_bits(code[0].part2, code[0].part2_len, bitstream);
append_bits(code[1].part2, code[1].part2_len, bitstream);
}
}
/*
 * Encode a single byte. Emit into the bitstream when a pair is complete.
 *
 * The byte is split into a sign (bit 7) and a magnitude. The bit length of
 * the magnitude selects the Huffman symbol through encoder->map; the
 * remaining magnitude bits, with the sign as lsb, become the variable length
 * value. The resulting code is stored in the ring buffer slot
 * bytes_read % 6, and emit_pair() is called after every second byte.
 */
static void encode_byte(struct encoder *encoder, uint8_t byte)
{
bool zrl = encoder->zrl;
bool sign = byte >> 7;
uint8_t value = byte;
/* For bytes with the top bit set, work on the complement (after
 * subtracting one in ZRL mode). */
if (sign) {
value -= zrl;
value ^= 0xff;
}
/* util_logbase2() is presumed to return the index of the highest set bit
 * (and 0 for an input of 0) - confirm against util/u_math.h. */
uint8_t msb = util_logbase2(value);
uint8_t num_bits = value ? (msb + 1) : 0;
/* Drop the leading one bit; it is implied by the bit length. */
value &= ~(1 << msb);
/* The symbol is looked up before num_bits is adjusted below. */
uint8_t sym = encoder->map[num_bits];
if (zrl && byte == 0) {
if (encoder->accum_zeroes <= 1) {
// this seems to be used for the non-repeated 0 at the beginning and end
sym = encoder->map[7];
num_bits = 8;
} else {
// FIXME - how to encode run length into the run length table?
num_bits = 1;
}
}
/* Outside ZRL mode a zero magnitude still occupies one bit (the sign). */
if (!zrl && num_bits == 0) {
num_bits = 1;
}
if (sym == 255 || (zrl && byte == 128)) {
// if there is no huffman code assigned to this bit length, or when
// encoding 0x80 in ZRL mode, dump the value into the bitstream verbatim.
sym = encoder->map[7];
value = byte;
num_bits = 8;
} else if (zrl && num_bits == 7) {
/* In ZRL mode 7-bit magnitudes are stored as the raw byte as well,
 * but keep the symbol that was looked up for bit length 7. */
value = byte;
num_bits = 8;
} else {
/* Prepend the sign as lsb of the variable length value. */
value = (value << 1) | sign;
}
unsigned int i = encoder->bytes_read % 6;
encoder->code[i] = huffman_code(sym, value, num_bits);
encoder->bytes_read++;
/* A pair is complete after every second byte. */
if ((encoder->bytes_read & 1) == 0)
emit_pair(encoder);
}
/*
 * Encode one weight value, after subtracting the encoder's average bias.
 *
 * How zero runs are detected and what is finally encoded depends on the
 * customer ID of the device:
 *
 *                           run matches   run marker   encoded byte
 *   0x99, avg_bias > 0      zero_point    zero_point   value
 *   0x99, avg_bias == 0     0x00          0x80         value
 *   everything else         zero_point    0x00         value - zero_point
 *
 * In ZRL mode, values that match are only counted. The first value that does
 * not match causes a single run marker to be encoded if any were counted,
 * and is then encoded itself.
 */
static void
encode_value(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, uint8_t value)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   const unsigned customer_id = ctx->screen->info->customer_id;
   const uint8_t zero_point = operation->weight_zero_point;
   uint8_t run_match;
   uint8_t run_marker;
   uint8_t payload;

   value -= encoder->avg_bias;

   if (customer_id != 0x99) {
      run_match = zero_point;
      run_marker = 0x00;
      payload = value - zero_point;
   } else if (encoder->avg_bias > 0) {
      run_match = zero_point;
      run_marker = zero_point;
      payload = value;
   } else {
      run_match = 0x0;
      run_marker = 0x80;
      payload = value;
   }

   if (encoder->zrl) {
      if (value == run_match) {
         encoder->accum_zeroes++;
         return;
      }

      if (encoder->accum_zeroes) {
         encode_byte(encoder, run_marker);
         encoder->accum_zeroes = 0;
      }
   }

   encode_byte(encoder, payload);
}
/*
 * Reset the whole encoder and point it at `initial_ptr`.
 *
 * `map` holds, for each of the 8 Huffman symbols, the bit length it stands
 * for; it is inverted here into the bit-length-to-symbol table, with 255
 * marking bit lengths that have no symbol. With a NULL `initial_ptr` the
 * encoder only counts words and stores nothing.
 */
static void encoder_init(struct encoder *encoder, uint8_t *map, uint32_t *initial_ptr)
{
   memset(encoder, 0, sizeof(*encoder));

   encoder->initial_ptr = initial_ptr;
   encoder->dest = initial_ptr;
   encoder->bitstream.map = &encoder->dest;
   encoder->bitstream.do_write = (initial_ptr != NULL);

   memset(encoder->map, 255, sizeof(encoder->map));

   for (unsigned sym = 0; sym < 8; sym++) {
      const uint8_t bit_length = map[sym];

      assert(bit_length < sizeof(encoder->map));
      encoder->map[bit_length] = sym;
   }
}
/* Encode a 32-bit value as four bytes, least significant byte first, each
 * with the average bias subtracted. */
static void encode_uint32(struct encoder *encoder, uint32_t value)
{
   for (unsigned shift = 0; shift < 32; shift += 8)
      encode_byte(encoder, ((value >> shift) & 0xff) - encoder->avg_bias);
}
/* Encode the low 16 bits of a value as two bytes, least significant byte
 * first, each with the average bias subtracted. */
static void encode_uint16(struct encoder *encoder, uint32_t value)
{
   for (unsigned shift = 0; shift < 16; shift += 8)
      encode_byte(encoder, ((value >> shift) & 0xff) - encoder->avg_bias);
}
/*
* Flush remaining weights stuck in the encoder ring buffer and all bits
* in the bitstream FIFO. Return the total number of bits written.
*/
static size_t encoder_flush(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
{
struct bitstream *bitstream = &encoder->bitstream;
size_t total_bits;
uint8_t flush_val = (encoder->bytes_read & 1) + 4;
struct code code;
if (encoder->bytes_read & 1)
encode_byte(encoder, 0x0);
code.part0 = (flush_val & 1) << 2;
code.part1 = 0x0;
code.part2 = 0x0;
code.part2_len = 0x0;
encoder->code[encoder->bytes_read++ % 6] = code;
encoder->code[encoder->bytes_read++ % 6] = code;
emit_pair(encoder);
encoder->code[encoder->bytes_read++ % 6] = code;
encoder->code[encoder->bytes_read++ % 6] = code;
emit_pair(encoder);
total_bits = (*bitstream->map - encoder->initial_ptr) * 32 +
bitstream->bits_in_buffer;
int padding_bits = 0;
if (total_bits % (64 * 8) > 0)
padding_bits = (64 * 8) - total_bits % (64 * 8);
while (padding_bits > 0) {
unsigned bits = padding_bits >= 32 ? 32 : padding_bits;
append_bits(0, bits, bitstream);
padding_bits -= bits;
}
return total_bits;
}
/* Exchange the entries at indices a and b of map. */
static void map_swap(uint8_t *map, int a, int b)
{
   const uint8_t first = map[a];
   const uint8_t second = map[b];

   map[a] = second;
   map[b] = first;
}
/*
 * Sort the Huffman symbol to bit length map according to the histogram of bit
 * lengths, so that more common bit lengths are represented by shorter codes.
 * FIXME - doesn't take into account zrl mode properly.
 *
 * Implemented as a fixed 19-comparator network over the 8 map entries; each
 * comparator moves the entry with the larger histogram count to the lower
 * index.
 */
static void sort_map(uint8_t *map, size_t *histogram)
{
   static const uint8_t comparators[][2] = {
      {0, 2}, {1, 3}, {4, 6}, {5, 7},
      {0, 4}, {1, 5}, {2, 6}, {3, 7},
      {0, 1}, {2, 3}, {4, 5}, {6, 7},
      {2, 4}, {3, 5},
      {1, 4}, {3, 6},
      {1, 2}, {3, 4}, {5, 6},
   };
   const size_t count = sizeof(comparators) / sizeof(comparators[0]);

   for (size_t step = 0; step < count; step++) {
      const int lo = comparators[step][0];
      const int hi = comparators[step][1];

      if (histogram[map[lo]] < histogram[map[hi]])
         map_swap(map, lo, hi);
   }
}
static void encoder_reset(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
{
encoder->initial_ptr = *encoder->bitstream.map;
encoder->dest = encoder->initial_ptr;
encoder->bitstream.map = &encoder->dest;
encoder->bitstream.buffer = 0;
encoder->bitstream.bits_in_buffer = 0;
encoder->bytes_read = 0;
memset(encoder->code, 0, sizeof(encoder->code));
}
/*
 * Encode the weights of `kernels_in_superblock` kernels, interleaved in
 * blocks: for each block of weights in turn, that block is encoded for every
 * kernel of the superblock before moving on to the next block.
 *
 * `first_channel` is the output channel of the first kernel. Consecutive
 * kernels use consecutive channels, except for depthwise operations, where
 * they are nn_core_count channels apart.
 */
static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, unsigned kernels_in_superblock, unsigned first_channel)
{
struct pipe_context *pctx = subgraph->base.context;
struct etna_context *ctx = etna_context(pctx);
unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
unsigned input_channels = operation->input_channels;
unsigned output_channels = operation->output_channels;
unsigned kernel_size;
uint8_t *weights = map_resource(operation->weight_tensor);
unsigned block_size;
unsigned blocks;
/* Effective number of input channels per kernel. */
if (operation->depthwise)
input_channels = 1;
else if (operation->addition)
input_channels = 2 * output_channels;
kernel_size = input_channels * operation->weight_height * operation->weight_width;
/* View the weight tensor as one row of kernel_size bytes per output channel. */
uint8_t (*weights_map)[kernel_size] = (void *)weights;
/* Blocks hold 9 weights, or a whole kernel plane for larger depthwise kernels. */
if (operation->depthwise)
block_size = MAX2(operation->weight_height * operation->weight_width, 9);
else
block_size = 9;
blocks = DIV_ROUND_UP(kernel_size, block_size);
for (unsigned block = 0; block < blocks; block++) {
for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
unsigned oc;
if (operation->depthwise) {
oc = first_channel + kernel * nn_core_count;
/* Channels past the last multiple of nn_core_count are shifted down. */
if (output_channels > 1 && oc >= (output_channels - output_channels % nn_core_count))
oc -= nn_core_count - output_channels % nn_core_count;
} else
oc = first_channel + kernel;
for (unsigned kernel_idx = 0; kernel_idx < block_size; kernel_idx++) {
uint8_t weight;
/* Positions past the end of the kernel are filled with the zero point. */
if (kernel_idx + block * block_size >= kernel_size)
weight = operation->weight_zero_point;
else
weight = weights_map[oc][kernel_idx + block * block_size];
encode_value(subgraph, operation, encoder, weight);
}
/* Depthwise blocks are padded with the zero point to a multiple of 9 values. */
if (operation->depthwise && block_size % 9)
for (unsigned i = 0; i < 9 - block_size % 9; i++)
encode_value(subgraph, operation, encoder, operation->weight_zero_point);
}
}
}
/*
 * Pack the 8-entry symbol map into a 32-bit word, entry i occupying bits
 * [4*i+3 : 4*i].
 *
 * Each entry is widened to uint32_t before shifting: shifting the promoted
 * (signed) int would be undefined as soon as an entry of 8 or more reaches
 * the top nibble.
 */
static uint32_t pack_symbol_map(uint8_t map[8])
{
   uint32_t packed = 0;

   for (unsigned i = 0; i < 8; i++)
      packed |= (uint32_t)map[i] << (4 * i);

   return packed;
}
/*
 * Allocate the buffer object that will hold the encoded coefficients.
 *
 * The size is an upper bound: a 64-byte header, one body per core in use and
 * a 64-byte tail. Each body is sized for the kernels of one core at
 * (weights per kernel + 8) bytes each, rounded up to 64 bytes and doubled.
 */
static struct etna_bo *
create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   const unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   const unsigned output_channels = operation->output_channels;
   const unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned input_channels = operation->input_channels;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   const unsigned bytes_per_kernel =
      input_channels * operation->weight_height * operation->weight_width + 4 + 4;
   const unsigned header_size = 64;
   const unsigned body_size =
      ALIGN(DIV_ROUND_UP(output_channels, cores_used) * bytes_per_kernel, 64) * 2;
   const unsigned tail_size = 64;
   const size_t max_size = header_size + cores_used * body_size + tail_size;

   return etna_bo_new(ctx->screen->dev, max_size, DRM_ETNA_GEM_CACHE_WC);
}
/*
 * Fill symbol_map[0..7] with bit lengths 0..7, ordered by how often each bit
 * length occurs among the raw weight bytes of all output channels (most
 * frequent first).
 */
static void
calculate_symbol_map(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *symbol_map)
{
   const unsigned output_channels = operation->output_channels;
   unsigned input_channels = operation->input_channels;
   uint8_t *weights = map_resource(operation->weight_tensor);
   size_t histogram[9] = { 0 };

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   const unsigned kernel_size = operation->weight_width * operation->weight_height * input_channels;

   /* Kernels are stored back to back, kernel_size bytes per output channel. */
   for (unsigned oc = 0; oc < output_channels; oc++) {
      uint8_t *kernel = weights + (size_t)oc * kernel_size;

      histogram_accumulate(histogram, kernel, kernel_size, false);
   }

   for (unsigned sym = 0; sym < 8; sym++)
      symbol_map[sym] = sym;

   sort_map(symbol_map, histogram);
}
/*
 * Encode the weights of all output channels, producing one stream per core
 * in use, and record the bit length of each stream in header->stream_size[].
 *
 * Every stream starts with the number of kernels handled by that core
 * (16 bits), followed by the core's kernels for each superblock in turn.
 */
static void
fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, struct etna_nn_header_v8 *header)
{
struct pipe_context *context = subgraph->base.context;
struct etna_context *ctx = etna_context(context);
unsigned output_channels = operation->output_channels;
unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
unsigned cores_used = MIN2(output_channels, nn_core_count);
/* The return value of the tiling calculation is used as the superblock count. */
unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
/* Kernels per core in every superblock but the last. */
unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
/* Next output channel to hand out in each superblock (non-depthwise case). */
unsigned channel_per_superblock[superblocks];
for (unsigned superblock = 0; superblock < superblocks; superblock++)
channel_per_superblock[superblock] = superblock * full_superblock * cores_used;
for (unsigned core = 0; core < cores_used; core++) {
/* Spread the channels evenly; the first cores take the remainder. */
unsigned kernels_per_core = output_channels / cores_used;
if (core < output_channels % cores_used)
kernels_per_core++;
/* Each core gets its own stream, starting at the current output position. */
encoder_reset(subgraph, operation, encoder);
encode_uint16(encoder, kernels_per_core);
for (unsigned superblock = 0; superblock < superblocks; superblock++) {
unsigned kernels_in_superblock = full_superblock;
/* The last superblock takes whatever channels are left over. */
if (superblock == superblocks - 1) {
unsigned remaining_channels = output_channels - cores_used * (superblocks - 1) * full_superblock;
kernels_in_superblock = remaining_channels / cores_used;
if (core < remaining_channels % cores_used)
kernels_in_superblock += 1;
}
unsigned first_channel;
/* Depthwise: cores take channels in reverse order within the superblock. */
if (operation->depthwise)
first_channel = cores_used - core - 1 + cores_used * full_superblock * superblock;
else
first_channel = channel_per_superblock[superblock];
encode_superblock(subgraph, operation, encoder, kernels_in_superblock, first_channel);
channel_per_superblock[superblock] += kernels_in_superblock;
}
/* Bit count of this core's stream, excluding the 64-byte alignment padding. */
unsigned actual_bits = encoder_flush(subgraph, operation, encoder);
header->stream_size[core] = actual_bits;
}
}
/*
 * Write one 32-bit bias per output channel at `map`: the bias from the bias
 * tensor plus the correction computed from that channel's weights. Returns
 * the position just past the last bias written.
 */
static uint32_t *
fill_biases(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint32_t *map)
{
   uint8_t *weights = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   const unsigned output_channels = operation->output_channels;
   unsigned input_channels = operation->input_channels;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   /* Kernels are stored back to back, kernel_size bytes per output channel. */
   const size_t kernel_size = (size_t)input_channels * operation->weight_height * operation->weight_width;

   for (unsigned oc = 0; oc < output_channels; oc++) {
      uint8_t *kernel = weights + oc * kernel_size;
      uint32_t corr = calculate_bias_correction(subgraph, operation, kernel);

      map[oc] = biases[oc] + corr;
   }

   return map + output_channels;
}
/*
 * Create and fill the coefficient buffer for one NN operation using the V8
 * encoding.
 *
 * Layout: the header (padded to 64 bytes), then one encoded weight stream
 * per core in use, then the biases. *cache_size receives the number of bytes
 * from the start of the buffer to the end of the biases.
 */
struct etna_bo *
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
{
   struct etna_bo *bo = create_bo(subgraph, operation);
   uint32_t *cursor = etna_bo_map(bo);
   struct etna_nn_header_v8 *header = (struct etna_nn_header_v8 *)cursor;
   uint8_t symbol_map[8];
   struct encoder encoder;

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   /* Header */
   memset(header, 0, sizeof(*header));
   calculate_symbol_map(subgraph, operation, symbol_map);
   header->symbol_map = pack_symbol_map(symbol_map);
   header->version = 1;

   /* Weights start at the first 64-byte boundary after the header. */
   cursor += ALIGN(sizeof(*header), 64) / 4;
   encoder_init(&encoder, symbol_map, cursor);
   fill_weights(subgraph, operation, &encoder, header);

   /* Biases follow right after the last weight stream. */
   uint32_t *end = fill_biases(subgraph, operation, encoder.dest);

   /* Size of the data that will go into the SRAM cache, header included */
   *cache_size = (uint8_t *)end - (uint8_t *)etna_bo_map(bo);

   etna_bo_cpu_fini(bo);

   return bo;
}

View file

@ -35,6 +35,7 @@ files_etnaviv = files(
'etnaviv_ml.c',
'etnaviv_ml.h',
'etnaviv_ml_nn_v7.c',
'etnaviv_ml_nn_v8.c',
'etnaviv_ml_nn.c',
'etnaviv_ml_nn.h',
'etnaviv_ml_tp.c',