mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-20 18:30:34 +01:00
etnaviv/ml: Add encoding of coefficients for V8
In V8 the weights and biases of convolution operations are encoded with a totally different scheme. The initial reverse engineering and implementation was done by: Philipp Zabel <p.zabel@pengutronix.de> Support for zero run length encoding and average bias is not implemented yet. Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31842>
This commit is contained in:
parent
f3d765ed5d
commit
b4ba62fcda
4 changed files with 703 additions and 1 deletions
|
|
@ -892,10 +892,17 @@ void
|
|||
etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
|
||||
struct etna_vip_instruction *instruction)
|
||||
{
|
||||
struct pipe_context *pctx = subgraph->base.context;
|
||||
struct etna_context *ctx = etna_context(pctx);
|
||||
unsigned nn_core_version = ctx->screen->specs.nn_core_version;
|
||||
unsigned coef_cache_size;
|
||||
|
||||
instruction->type = ETNA_JOB_TYPE_NN;
|
||||
instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
|
||||
|
||||
if (nn_core_version == 7)
|
||||
instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
|
||||
else
|
||||
instruction->coefficients = etna_ml_create_coeffs_v8(subgraph, operation, &coef_cache_size);
|
||||
|
||||
struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
|
||||
assert(input);
|
||||
|
|
|
|||
|
|
@ -13,6 +13,9 @@ etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsig
|
|||
struct etna_bo *
|
||||
etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
|
||||
|
||||
struct etna_bo *
|
||||
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size);
|
||||
|
||||
unsigned
|
||||
etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out);
|
||||
|
||||
|
|
|
|||
691
src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
Normal file
691
src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
Normal file
|
|
@ -0,0 +1,691 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
|
||||
* Copyright (c) 2024 Pengutronix, Philipp Zabel
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include <time.h>
|
||||
#include "util/u_inlines.h"
|
||||
#include "util/u_math.h"
|
||||
#include "etnaviv_context.h"
|
||||
#include "etnaviv_debug.h"
|
||||
#include "etnaviv_ml.h"
|
||||
#include "etnaviv_ml_nn.h"
|
||||
#include "etnaviv_screen.h"
|
||||
|
||||
static void *
|
||||
map_resource(struct pipe_resource *resource)
|
||||
{
|
||||
return etna_bo_map(etna_resource(resource)->bo);
|
||||
}
|
||||
|
||||
#define FIELD(field, bits) uint32_t field : bits;
|
||||
|
||||
struct etna_nn_header_v8 {
|
||||
FIELD(precode, 1)
|
||||
FIELD(bit16, 1)
|
||||
FIELD(fp16, 1)
|
||||
FIELD(reserved1, 1)
|
||||
FIELD(version, 4)
|
||||
|
||||
uint8_t run_length_size;
|
||||
uint8_t run_length_table[18];
|
||||
uint32_t symbol_map;
|
||||
uint16_t avg_bias;
|
||||
uint16_t reserved2;
|
||||
uint32_t stream_size[0];
|
||||
};
|
||||
|
||||
struct bitstream {
|
||||
unsigned bits_in_buffer;
|
||||
uint64_t buffer;
|
||||
uint32_t **map;
|
||||
bool do_write;
|
||||
};
|
||||
|
||||
static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *weights)
|
||||
{
|
||||
unsigned input_channels;
|
||||
int32_t input_zero_point = 128 - operation->input_zero_point;
|
||||
int32_t correction = 0;
|
||||
|
||||
if (operation->depthwise)
|
||||
input_channels = 1;
|
||||
else if (operation->addition)
|
||||
input_channels = 2 * operation->output_channels;
|
||||
else
|
||||
input_channels = operation->input_channels;
|
||||
|
||||
for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
|
||||
correction += (weights[i] - operation->weight_zero_point) * input_zero_point;
|
||||
}
|
||||
|
||||
return correction;
|
||||
}
|
||||
|
||||
static void
|
||||
append_bits(uint32_t value, size_t size, struct bitstream *bitstream)
|
||||
{
|
||||
assert(value < 1 << size);
|
||||
if (!size)
|
||||
return;
|
||||
bitstream->buffer |= (uint64_t)value << bitstream->bits_in_buffer;
|
||||
bitstream->bits_in_buffer += size;
|
||||
if (bitstream->bits_in_buffer >= 32) {
|
||||
if (bitstream->do_write)
|
||||
**bitstream->map = bitstream->buffer & 0xffffffff;
|
||||
*bitstream->map += 1;
|
||||
bitstream->buffer >>= 32;
|
||||
bitstream->bits_in_buffer -= 32;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
flush_bits(struct bitstream *bitstream)
|
||||
{
|
||||
if (bitstream->bits_in_buffer > 0)
|
||||
append_bits(0, 32 - bitstream->bits_in_buffer, bitstream);
|
||||
}
|
||||
|
||||
struct wb_stream {
|
||||
struct bitstream bitstream;
|
||||
unsigned zero_point;
|
||||
unsigned zrl_bits;
|
||||
unsigned accum_zeroes;
|
||||
};
|
||||
|
||||
static void
|
||||
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
|
||||
{
|
||||
struct bitstream *bitstream = &wb_stream->bitstream;
|
||||
|
||||
if (wb_stream->accum_zeroes == 0)
|
||||
return;
|
||||
|
||||
append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, bitstream);
|
||||
wb_stream->accum_zeroes = 0;
|
||||
append_bits(wb_stream->zero_point, 8, bitstream);
|
||||
}
|
||||
|
||||
static void
|
||||
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
|
||||
{
|
||||
struct bitstream *bitstream = &wb_stream->bitstream;
|
||||
unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;
|
||||
|
||||
if (wb_stream->zrl_bits == 0) {
|
||||
append_bits(value, 8, bitstream);
|
||||
return;
|
||||
}
|
||||
|
||||
if (wb_stream->accum_zeroes == max_zeroes) {
|
||||
append_bits(max_zeroes, wb_stream->zrl_bits, bitstream);
|
||||
wb_stream->accum_zeroes = 0;
|
||||
append_bits(value, 8, bitstream);
|
||||
return;
|
||||
}
|
||||
|
||||
if (value == wb_stream->zero_point) {
|
||||
wb_stream->accum_zeroes++;
|
||||
return;
|
||||
}
|
||||
|
||||
append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, bitstream);
|
||||
wb_stream->accum_zeroes = 0;
|
||||
append_bits(value, 8, bitstream);
|
||||
}
|
||||
|
||||
/*
|
||||
* The V8 architecture Huffman stream decoder uses a fixed code book with 8
|
||||
* entries to determine bit lengths of variable length values later in the bit
|
||||
* stream. The 2 to 5-bit long codes are stored in fixed length 3-bit (plus
|
||||
* optional 2-bit) fields:
|
||||
*
|
||||
* code symbol
|
||||
* --------------
|
||||
* 00_ 0
|
||||
* 10_ 1
|
||||
* 111 2
|
||||
* 110 3
|
||||
* 011 4
|
||||
* 010 1_ 5
|
||||
* 010 01 6
|
||||
* 010 00 7
|
||||
*
|
||||
* The free bit (_) is used for the sign, if available, otherwise the sign
|
||||
* is stored with the variable length value later in the bitstream. In ZRL
|
||||
* encoding mode, where larger values are stored verbatim, this may also be
|
||||
* the lsb of the value instead.. The decoder processes weights in pairs and
|
||||
* is pipelined 3-deep:
|
||||
*
|
||||
* In each step, first two 3-bit codes are read, then up to two 2-bit codes
|
||||
* that belong with (010) 3-bit codes from the previous step. The optional
|
||||
* 2-bit codes from the previous step, together with the 3-bit codes from the
|
||||
* step before that are used to decode two symbols that are mapped to two bit
|
||||
* lengths for the two variable length values that are read next.
|
||||
*
|
||||
* Finally, the bit lengths, signs, and variable length values are used to
|
||||
* calculate two weights.
|
||||
*/
|
||||
|
||||
struct code {
|
||||
/* fixed 3-bit code */
|
||||
uint8_t part0;
|
||||
/* optional 2-bit code, iff part0 == 0b010 */
|
||||
uint8_t part1;
|
||||
/* variable length value */
|
||||
uint8_t part2;
|
||||
/* bit length determined from part0, part1, and symbol-to-bitlength map */
|
||||
uint8_t part2_len;
|
||||
};
|
||||
|
||||
struct encoder {
|
||||
/* bit-length-to-huffman-symbol map */
|
||||
uint8_t map[9];
|
||||
/* ring buffer for 3 encoded weight pairs */
|
||||
struct code code[6];
|
||||
size_t bytes_read;
|
||||
struct bitstream bitstream;
|
||||
uint32_t *initial_ptr;
|
||||
uint32_t *dest;
|
||||
uint8_t accum_zeroes;
|
||||
uint8_t avg_bias;
|
||||
bool zrl;
|
||||
};
|
||||
|
||||
/* Calculate a histogram of bit lenghts. */
|
||||
static void histogram_accumulate(size_t histogram[9], uint8_t *bytes, size_t len, bool zrl)
|
||||
{
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
uint8_t num_bits = 0;
|
||||
if (bytes[i]) {
|
||||
bool sign = bytes[i] >> 7;
|
||||
uint8_t value = bytes[i];
|
||||
if (sign) {
|
||||
value -= zrl;
|
||||
value ^= 0xff;
|
||||
}
|
||||
num_bits = util_logbase2(value) + 1;
|
||||
}
|
||||
assert(num_bits <= 8);
|
||||
histogram[num_bits]++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* value can be 8-bit raw value or variable length value with prepended sign.
|
||||
* num_bits is number of bits in value, including the sign bit.
|
||||
*/
|
||||
static struct code huffman_code(uint8_t sym, uint8_t value, uint8_t num_bits)
|
||||
{
|
||||
switch (sym) {
|
||||
case 0:
|
||||
return (struct code){ 0 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
|
||||
case 1:
|
||||
return (struct code){ 1 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
|
||||
case 2:
|
||||
return (struct code){ 7, 0, value, num_bits};
|
||||
case 3:
|
||||
return (struct code){ 3, 0, value, num_bits};
|
||||
case 4:
|
||||
return (struct code){ 6, 0, value, num_bits};
|
||||
case 5:
|
||||
return (struct code){ 2, 1 | ((value & 1) << 1), value >> 1, num_bits - 1 };
|
||||
case 6:
|
||||
return (struct code){ 2, 2, value, num_bits};
|
||||
case 7:
|
||||
return (struct code){ 2, 0, value, num_bits};
|
||||
default:
|
||||
return (struct code){};
|
||||
}
|
||||
}
|
||||
|
||||
static void emit_pair(struct encoder *encoder)
|
||||
{
|
||||
struct bitstream *bitstream = &encoder->bitstream;
|
||||
struct code *code = &encoder->code[(encoder->bytes_read - 2) % 6];
|
||||
|
||||
append_bits(code[0].part0, 3, bitstream);
|
||||
append_bits(code[1].part0, 3, bitstream);
|
||||
if (encoder->bytes_read > 2) {
|
||||
code = &encoder->code[(encoder->bytes_read - 4) % 6];
|
||||
append_bits(code[0].part1, code[0].part0 == 2 ? 2 : 0, bitstream);
|
||||
append_bits(code[1].part1, code[1].part0 == 2 ? 2 : 0, bitstream);
|
||||
}
|
||||
if (encoder->bytes_read > 4) {
|
||||
code = &encoder->code[(encoder->bytes_read - 6) % 6];
|
||||
append_bits(code[0].part2, code[0].part2_len, bitstream);
|
||||
append_bits(code[1].part2, code[1].part2_len, bitstream);
|
||||
}
|
||||
}
|
||||
|
||||
/* Encode a single byte. Emit into the bitstream when a pair is complete. */
|
||||
static void encode_byte(struct encoder *encoder, uint8_t byte)
|
||||
{
|
||||
bool zrl = encoder->zrl;
|
||||
bool sign = byte >> 7;
|
||||
uint8_t value = byte;
|
||||
|
||||
if (sign) {
|
||||
value -= zrl;
|
||||
value ^= 0xff;
|
||||
}
|
||||
|
||||
uint8_t msb = util_logbase2(value);
|
||||
uint8_t num_bits = value ? (msb + 1) : 0;
|
||||
value &= ~(1 << msb);
|
||||
uint8_t sym = encoder->map[num_bits];
|
||||
if (zrl && byte == 0) {
|
||||
if (encoder->accum_zeroes <= 1) {
|
||||
// this seems to be used for the non-repeated 0 at the beginning and end
|
||||
sym = encoder->map[7];
|
||||
num_bits = 8;
|
||||
} else {
|
||||
// FIXME - how to encode run length into the run length table?
|
||||
num_bits = 1;
|
||||
}
|
||||
}
|
||||
if (!zrl && num_bits == 0) {
|
||||
num_bits = 1;
|
||||
}
|
||||
if (sym == 255 || (zrl && byte == 128)) {
|
||||
// if there is no huffman code assigned to this bit length, or when
|
||||
// encoding 0x80 in ZRL mode, dump the value into the bitstream verbatim.
|
||||
sym = encoder->map[7];
|
||||
value = byte;
|
||||
num_bits = 8;
|
||||
} else if (zrl && num_bits == 7) {
|
||||
value = byte;
|
||||
num_bits = 8;
|
||||
} else {
|
||||
value = (value << 1) | sign;
|
||||
}
|
||||
unsigned int i = encoder->bytes_read % 6;
|
||||
encoder->code[i] = huffman_code(sym, value, num_bits);
|
||||
encoder->bytes_read++;
|
||||
if ((encoder->bytes_read & 1) == 0)
|
||||
emit_pair(encoder);
|
||||
}
|
||||
|
||||
static void
|
||||
encode_value(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, uint8_t value)
|
||||
{
|
||||
struct pipe_context *context = subgraph->base.context;
|
||||
struct etna_context *ctx = etna_context(context);
|
||||
unsigned customer_id = ctx->screen->info->customer_id;
|
||||
uint8_t zero_point = operation->weight_zero_point;
|
||||
|
||||
value -= encoder->avg_bias;
|
||||
|
||||
if (customer_id == 0x99) {
|
||||
if (encoder->zrl) {
|
||||
if (encoder->avg_bias > 0) {
|
||||
if (value == zero_point) {
|
||||
encoder->accum_zeroes++;
|
||||
return;
|
||||
} else if (encoder->accum_zeroes) {
|
||||
encode_byte(encoder, zero_point);
|
||||
encoder->accum_zeroes = 0;
|
||||
}
|
||||
} else {
|
||||
if (value == 0x0) {
|
||||
encoder->accum_zeroes++;
|
||||
return;
|
||||
} else if (encoder->accum_zeroes) {
|
||||
encode_byte(encoder, 0x80);
|
||||
encoder->accum_zeroes = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
encode_byte(encoder, value);
|
||||
} else {
|
||||
if (encoder->zrl) {
|
||||
if (value == zero_point) {
|
||||
encoder->accum_zeroes++;
|
||||
return;
|
||||
} else if (encoder->accum_zeroes) {
|
||||
encode_byte(encoder, 0x00);
|
||||
encoder->accum_zeroes = 0;
|
||||
}
|
||||
}
|
||||
|
||||
encode_byte(encoder, value - zero_point);
|
||||
}
|
||||
}
|
||||
|
||||
static void encoder_init(struct encoder *encoder, uint8_t *map, uint32_t *initial_ptr)
|
||||
{
|
||||
memset(encoder, 0, sizeof(*encoder));
|
||||
encoder->initial_ptr = initial_ptr;
|
||||
encoder->dest = initial_ptr;
|
||||
encoder->bitstream.map = &encoder->dest;
|
||||
encoder->bitstream.do_write = initial_ptr != NULL;
|
||||
|
||||
for (int i = 0; i < 9; i++)
|
||||
encoder->map[i] = 255;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
assert(map[i] < sizeof(encoder->map));
|
||||
encoder->map[map[i]] = i;
|
||||
}
|
||||
}
|
||||
|
||||
static void encode_uint32(struct encoder *encoder, uint32_t value)
|
||||
{
|
||||
encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
|
||||
encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
|
||||
encode_byte(encoder, ((value >> 16) & 0xff) - encoder->avg_bias);
|
||||
encode_byte(encoder, ((value >> 24) & 0xff) - encoder->avg_bias);
|
||||
}
|
||||
|
||||
static void encode_uint16(struct encoder *encoder, uint32_t value)
|
||||
{
|
||||
encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
|
||||
encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush remaining weights stuck in the encoder ring buffer and all bits
|
||||
* in the bitstream FIFO. Return the total number of bits written.
|
||||
*/
|
||||
static size_t encoder_flush(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
|
||||
{
|
||||
struct bitstream *bitstream = &encoder->bitstream;
|
||||
size_t total_bits;
|
||||
uint8_t flush_val = (encoder->bytes_read & 1) + 4;
|
||||
|
||||
struct code code;
|
||||
if (encoder->bytes_read & 1)
|
||||
encode_byte(encoder, 0x0);
|
||||
|
||||
code.part0 = (flush_val & 1) << 2;
|
||||
code.part1 = 0x0;
|
||||
code.part2 = 0x0;
|
||||
code.part2_len = 0x0;
|
||||
encoder->code[encoder->bytes_read++ % 6] = code;
|
||||
encoder->code[encoder->bytes_read++ % 6] = code;
|
||||
emit_pair(encoder);
|
||||
encoder->code[encoder->bytes_read++ % 6] = code;
|
||||
encoder->code[encoder->bytes_read++ % 6] = code;
|
||||
emit_pair(encoder);
|
||||
|
||||
total_bits = (*bitstream->map - encoder->initial_ptr) * 32 +
|
||||
bitstream->bits_in_buffer;
|
||||
|
||||
int padding_bits = 0;
|
||||
if (total_bits % (64 * 8) > 0)
|
||||
padding_bits = (64 * 8) - total_bits % (64 * 8);
|
||||
|
||||
while (padding_bits > 0) {
|
||||
unsigned bits = padding_bits >= 32 ? 32 : padding_bits;
|
||||
append_bits(0, bits, bitstream);
|
||||
padding_bits -= bits;
|
||||
}
|
||||
|
||||
return total_bits;
|
||||
}
|
||||
|
||||
static void map_swap(uint8_t *map, int a, int b)
|
||||
{
|
||||
uint8_t tmp = map[a];
|
||||
|
||||
map[a] = map[b];
|
||||
map[b] = tmp;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the Huffman symbol to bit length map according to the histogram of bit
|
||||
* lengths, so that more common bit lengths are represented by shorter codes.
|
||||
* FIXME - doesn't take into account zrl mode properly.
|
||||
*/
|
||||
static void sort_map(uint8_t *map, size_t *histogram)
|
||||
{
|
||||
const uint8_t network[19][2] = {
|
||||
{0, 2}, {1, 3}, {4, 6}, {5, 7},
|
||||
{0, 4}, {1, 5}, {2, 6}, {3, 7},
|
||||
{0, 1}, {2, 3}, {4, 5}, {6, 7},
|
||||
{2, 4}, {3, 5},
|
||||
{1, 4}, {3, 6},
|
||||
{1, 2}, {3 ,4}, {5, 6},
|
||||
};
|
||||
|
||||
for (int i = 0; i < 19; i++) {
|
||||
int a = network[i][0];
|
||||
int b = network[i][1];
|
||||
|
||||
if (histogram[map[a]] < histogram[map[b]])
|
||||
map_swap(map, a, b);
|
||||
}
|
||||
}
|
||||
|
||||
static void encoder_reset(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
|
||||
{
|
||||
encoder->initial_ptr = *encoder->bitstream.map;
|
||||
encoder->dest = encoder->initial_ptr;
|
||||
encoder->bitstream.map = &encoder->dest;
|
||||
|
||||
encoder->bitstream.buffer = 0;
|
||||
encoder->bitstream.bits_in_buffer = 0;
|
||||
encoder->bytes_read = 0;
|
||||
memset(encoder->code, 0, sizeof(encoder->code));
|
||||
}
|
||||
|
||||
static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, unsigned kernels_in_superblock, unsigned first_channel)
|
||||
{
|
||||
struct pipe_context *pctx = subgraph->base.context;
|
||||
struct etna_context *ctx = etna_context(pctx);
|
||||
unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
|
||||
unsigned input_channels = operation->input_channels;
|
||||
unsigned output_channels = operation->output_channels;
|
||||
unsigned kernel_size;
|
||||
uint8_t *weights = map_resource(operation->weight_tensor);
|
||||
unsigned block_size;
|
||||
unsigned blocks;
|
||||
|
||||
if (operation->depthwise)
|
||||
input_channels = 1;
|
||||
else if (operation->addition)
|
||||
input_channels = 2 * output_channels;
|
||||
|
||||
kernel_size = input_channels * operation->weight_height * operation->weight_width;
|
||||
|
||||
uint8_t (*weights_map)[kernel_size] = (void *)weights;
|
||||
|
||||
if (operation->depthwise)
|
||||
block_size = MAX2(operation->weight_height * operation->weight_width, 9);
|
||||
else
|
||||
block_size = 9;
|
||||
|
||||
blocks = DIV_ROUND_UP(kernel_size, block_size);
|
||||
|
||||
for (unsigned block = 0; block < blocks; block++) {
|
||||
for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
|
||||
unsigned oc;
|
||||
|
||||
if (operation->depthwise) {
|
||||
oc = first_channel + kernel * nn_core_count;
|
||||
|
||||
if (output_channels > 1 && oc >= (output_channels - output_channels % nn_core_count))
|
||||
oc -= nn_core_count - output_channels % nn_core_count;
|
||||
} else
|
||||
oc = first_channel + kernel;
|
||||
|
||||
for (unsigned kernel_idx = 0; kernel_idx < block_size; kernel_idx++) {
|
||||
uint8_t weight;
|
||||
|
||||
if (kernel_idx + block * block_size >= kernel_size)
|
||||
weight = operation->weight_zero_point;
|
||||
else
|
||||
weight = weights_map[oc][kernel_idx + block * block_size];
|
||||
|
||||
encode_value(subgraph, operation, encoder, weight);
|
||||
}
|
||||
|
||||
if (operation->depthwise && block_size % 9)
|
||||
for (unsigned i = 0; i < 9 - block_size % 9; i++)
|
||||
encode_value(subgraph, operation, encoder, operation->weight_zero_point);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t pack_symbol_map(uint8_t map[8])
|
||||
{
|
||||
uint32_t ret = 0;
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
ret |= map[i] << (4 * i);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct etna_bo *
|
||||
create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
|
||||
{
|
||||
struct pipe_context *context = subgraph->base.context;
|
||||
struct etna_context *ctx = etna_context(context);
|
||||
unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
|
||||
unsigned input_channels = operation->input_channels;
|
||||
unsigned output_channels = operation->output_channels;
|
||||
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
||||
size_t max_size;
|
||||
|
||||
if (operation->depthwise)
|
||||
input_channels = 1;
|
||||
else if (operation->addition)
|
||||
input_channels = 2 * output_channels;
|
||||
|
||||
unsigned header_size = 64;
|
||||
unsigned body_size = ALIGN(DIV_ROUND_UP(output_channels, cores_used) * (input_channels * operation->weight_height * operation->weight_width + 4 + 4), 64) * 2;
|
||||
unsigned tail_size = 64;
|
||||
max_size = header_size + cores_used * body_size + tail_size;
|
||||
|
||||
return etna_bo_new(ctx->screen->dev, max_size, DRM_ETNA_GEM_CACHE_WC);
|
||||
}
|
||||
|
||||
static void
|
||||
calculate_symbol_map(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *symbol_map)
|
||||
{
|
||||
unsigned input_channels = operation->input_channels;
|
||||
unsigned output_channels = operation->output_channels;
|
||||
uint8_t *input = map_resource(operation->weight_tensor);
|
||||
size_t histogram[9] = {};
|
||||
|
||||
if (operation->depthwise)
|
||||
input_channels = 1;
|
||||
else if (operation->addition)
|
||||
input_channels = 2 * output_channels;
|
||||
|
||||
uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
|
||||
unsigned kernel_size = operation->weight_width * operation->weight_height * input_channels;
|
||||
for (unsigned oc = 0; oc < output_channels; oc++)
|
||||
histogram_accumulate(histogram, (uint8_t *)weights_map[oc], kernel_size, false);
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
symbol_map[i] = i;
|
||||
sort_map(symbol_map, histogram);
|
||||
}
|
||||
|
||||
static void
|
||||
fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, struct etna_nn_header_v8 *header)
|
||||
{
|
||||
struct pipe_context *context = subgraph->base.context;
|
||||
struct etna_context *ctx = etna_context(context);
|
||||
unsigned output_channels = operation->output_channels;
|
||||
unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
|
||||
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
||||
unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, NULL, NULL);
|
||||
unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);
|
||||
|
||||
unsigned channel_per_superblock[superblocks];
|
||||
for (unsigned superblock = 0; superblock < superblocks; superblock++)
|
||||
channel_per_superblock[superblock] = superblock * full_superblock * cores_used;
|
||||
|
||||
for (unsigned core = 0; core < cores_used; core++) {
|
||||
unsigned kernels_per_core = output_channels / cores_used;
|
||||
if (core < output_channels % cores_used)
|
||||
kernels_per_core++;
|
||||
|
||||
encoder_reset(subgraph, operation, encoder);
|
||||
encode_uint16(encoder, kernels_per_core);
|
||||
|
||||
for (unsigned superblock = 0; superblock < superblocks; superblock++) {
|
||||
|
||||
unsigned kernels_in_superblock = full_superblock;
|
||||
if (superblock == superblocks - 1) {
|
||||
unsigned remaining_channels = output_channels - cores_used * (superblocks - 1) * full_superblock;
|
||||
kernels_in_superblock = remaining_channels / cores_used;
|
||||
if (core < remaining_channels % cores_used)
|
||||
kernels_in_superblock += 1;
|
||||
}
|
||||
|
||||
unsigned first_channel;
|
||||
if (operation->depthwise)
|
||||
first_channel = cores_used - core - 1 + cores_used * full_superblock * superblock;
|
||||
else
|
||||
first_channel = channel_per_superblock[superblock];
|
||||
|
||||
encode_superblock(subgraph, operation, encoder, kernels_in_superblock, first_channel);
|
||||
|
||||
channel_per_superblock[superblock] += kernels_in_superblock;
|
||||
}
|
||||
|
||||
unsigned actual_bits = encoder_flush(subgraph, operation, encoder);
|
||||
header->stream_size[core] = actual_bits;
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t *
|
||||
fill_biases(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint32_t *map)
|
||||
{
|
||||
uint8_t *input = map_resource(operation->weight_tensor);
|
||||
uint32_t *biases = map_resource(operation->bias_tensor);
|
||||
unsigned input_channels = operation->input_channels;
|
||||
unsigned output_channels = operation->output_channels;
|
||||
|
||||
if (operation->depthwise)
|
||||
input_channels = 1;
|
||||
else if (operation->addition)
|
||||
input_channels = 2 * output_channels;
|
||||
|
||||
uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
|
||||
for (unsigned oc = 0; oc < output_channels; oc++) {
|
||||
uint32_t corr = calculate_bias_correction(subgraph, operation, (uint8_t *)weights_map[oc]);
|
||||
|
||||
*map = biases[oc] + corr;
|
||||
map++;
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
struct etna_bo *
|
||||
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
|
||||
{
|
||||
struct etna_bo *bo = create_bo(subgraph, operation);
|
||||
uint32_t *map = etna_bo_map(bo);
|
||||
struct etna_nn_header_v8 *header = (struct etna_nn_header_v8 *)map;
|
||||
struct encoder encoder;
|
||||
uint8_t symbol_map[8];
|
||||
|
||||
etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
|
||||
memset(header, 0, sizeof(*header));
|
||||
|
||||
calculate_symbol_map(subgraph, operation, symbol_map);
|
||||
header->symbol_map = pack_symbol_map(symbol_map);
|
||||
header->version = 1;
|
||||
|
||||
map += ALIGN(sizeof(*header), 64) / 4;
|
||||
|
||||
encoder_init(&encoder, symbol_map, map);
|
||||
|
||||
fill_weights(subgraph, operation, &encoder, header);
|
||||
map = fill_biases(subgraph, operation, encoder.dest);
|
||||
|
||||
/* Size of the data that will go into the SRAM cache, header included */
|
||||
*cache_size = (uint8_t*)map - (uint8_t*)etna_bo_map(bo);
|
||||
|
||||
etna_bo_cpu_fini(bo);
|
||||
|
||||
return bo;
|
||||
}
|
||||
|
|
@ -35,6 +35,7 @@ files_etnaviv = files(
|
|||
'etnaviv_ml.c',
|
||||
'etnaviv_ml.h',
|
||||
'etnaviv_ml_nn_v7.c',
|
||||
'etnaviv_ml_nn_v8.c',
|
||||
'etnaviv_ml_nn.c',
|
||||
'etnaviv_ml_nn.h',
|
||||
'etnaviv_ml_tp.c',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue