mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-26 17:10:11 +01:00
etnaviv/ml: Support per-channel quantized weights
Dequantize the weights, calculate new quantization values based on the minimum and maximum values, and requantize again. This causes a loss of accuracy compared to proper per-axis quantization, as we no longer have the original data from training. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34497>
This commit is contained in:
parent
ce9030740a
commit
aa73767f5e
1 changed files with 143 additions and 6 deletions
|
|
@ -4,6 +4,7 @@
|
|||
*/
|
||||
|
||||
#include "pipe/p_state.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/u_inlines.h"
|
||||
|
||||
#include "etnaviv_context.h"
|
||||
|
|
@ -185,6 +186,119 @@ map_resource(struct pipe_resource *resource)
|
|||
return etna_bo_map(etna_buffer_resource(resource)->bo);
|
||||
}
|
||||
|
||||
static void
|
||||
calc_quant_params(float min, float max, float *out_scale, uint8_t *out_zero_point)
|
||||
{
|
||||
if (max < min)
|
||||
SWAP(max, min);
|
||||
|
||||
float diff = max - MIN2(min, 0.0);
|
||||
*out_scale = diff / 255.0f;
|
||||
*out_zero_point = round(-(min / *out_scale));
|
||||
}
|
||||
|
||||
static void
|
||||
calc_quantization(struct etna_ml_subgraph *subgraph, const struct pipe_ml_operation *poperation,
|
||||
struct etna_operation *operation, float *out_scale, uint8_t *out_zero_point)
|
||||
{
|
||||
const struct pipe_tensor *weight_tensor = poperation->conv.weight_tensor;
|
||||
void *map = map_resource(operation->weight_tensor);
|
||||
unsigned input_channels = operation->input_channels;
|
||||
|
||||
if (poperation->conv.depthwise)
|
||||
input_channels = 1;
|
||||
|
||||
uint8_t (*weights)[operation->weight_width][operation->weight_height][input_channels] = map;
|
||||
|
||||
float max = .0, min = FLT_MAX;
|
||||
for (unsigned oc = 0; oc < operation->output_channels; oc++) {
|
||||
for (unsigned w = 0; w < operation->weight_width; w++) {
|
||||
for (unsigned h = 0; h < operation->weight_height; h++) {
|
||||
for (unsigned ic = 0; ic < input_channels; ic++) {
|
||||
float dequant;
|
||||
if (operation->weight_signed) {
|
||||
int8_t val = weights[oc][w][h][ic];
|
||||
dequant = weight_tensor->scales[oc] * (val - weight_tensor->zero_points[oc]);
|
||||
} else {
|
||||
uint8_t val = weights[oc][w][h][ic];
|
||||
dequant = weight_tensor->scales[oc] * (val - weight_tensor->zero_points[oc]);
|
||||
}
|
||||
max = MAX2(max, dequant);
|
||||
min = MIN2(min, dequant);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
calc_quant_params(min, max, out_scale, out_zero_point);
|
||||
}
|
||||
|
||||
static void
|
||||
requantize_weights(struct etna_ml_subgraph *subgraph,
|
||||
const struct pipe_ml_operation *poperation,
|
||||
struct etna_operation *operation)
|
||||
{
|
||||
const struct pipe_tensor *weight_tensor = poperation->conv.weight_tensor;
|
||||
struct pipe_context *context = subgraph->base.context;
|
||||
void *input = map_resource(operation->weight_tensor);
|
||||
unsigned input_channels = operation->input_channels;
|
||||
|
||||
if (poperation->conv.depthwise)
|
||||
input_channels = 1;
|
||||
|
||||
unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * input_channels;
|
||||
struct pipe_resource *output_res = etna_ml_create_resource(context, new_size);
|
||||
void *output = map_resource(output_res);
|
||||
uint8_t (*map_in)[operation->weight_width][operation->weight_height][input_channels] = input;
|
||||
uint8_t (*map_out)[operation->weight_width][operation->weight_height][input_channels] = output;
|
||||
|
||||
for (unsigned oc = 0; oc < operation->output_channels; oc++) {
|
||||
for (unsigned w = 0; w < operation->weight_width; w++) {
|
||||
for (unsigned h = 0; h < operation->weight_height; h++) {
|
||||
for (unsigned ic = 0; ic < input_channels; ic++) {
|
||||
if (operation->weight_signed) {
|
||||
int8_t val = map_in[oc][w][h][ic];
|
||||
double dequantized = weight_tensor->scales[oc] * (val - weight_tensor->zero_points[oc]);
|
||||
int8_t requantized = round(dequantized / operation->weight_scale);
|
||||
map_out[oc][w][h][ic] = requantized + operation->weight_zero_point - 0x80;
|
||||
} else {
|
||||
uint8_t val = map_in[oc][w][h][ic];
|
||||
double dequantized = weight_tensor->scales[oc] * (val - weight_tensor->zero_points[oc]);
|
||||
uint8_t requantized = round(dequantized / operation->weight_scale);
|
||||
map_out[oc][w][h][ic] = requantized + operation->weight_zero_point;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pipe_resource_reference(&operation->weight_tensor, NULL);
|
||||
operation->weight_tensor = output_res;
|
||||
}
|
||||
|
||||
static void
|
||||
requantize_bias(struct etna_ml_subgraph *subgraph,
|
||||
const struct pipe_ml_operation *poperation,
|
||||
struct etna_operation *operation)
|
||||
{
|
||||
const struct pipe_tensor *bias_tensor = poperation->conv.bias_tensor;
|
||||
struct pipe_context *context = subgraph->base.context;
|
||||
uint32_t *input = map_resource(operation->bias_tensor);
|
||||
unsigned new_size = operation->output_channels * sizeof(*input);
|
||||
struct pipe_resource *output_res = etna_ml_create_resource(context, new_size);
|
||||
uint32_t *output = map_resource(output_res);
|
||||
float bias_scale = operation->weight_scale * operation->input_scale;
|
||||
|
||||
for (unsigned oc = 0; oc < operation->output_channels; oc++) {
|
||||
int32_t quantized = input[oc];
|
||||
double dequantized = bias_tensor->scales[oc] * quantized;
|
||||
int32_t requantized = round(dequantized / bias_scale);
|
||||
output[oc] = requantized;
|
||||
}
|
||||
|
||||
pipe_resource_reference(&operation->bias_tensor, NULL);
|
||||
operation->bias_tensor = output_res;
|
||||
}
|
||||
|
||||
static void
|
||||
pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
|
||||
|
|
@ -495,12 +609,15 @@ etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
|
|||
pointwise_to_2x2(subgraph, operation);
|
||||
|
||||
if (operation->depthwise) {
|
||||
if (nn_core_version < 8 && (operation->output_channels > 1 || operation->stride > 1)) {
|
||||
if (operation->input_width < 8 && operation->input_width > 2)
|
||||
operation->pooling_first_pixel = false;
|
||||
expand_depthwise(subgraph, operation);
|
||||
} else if (operation->output_channels > 1)
|
||||
if (nn_core_version < 8) {
|
||||
if (operation->output_channels > 1 || operation->stride > 1) {
|
||||
if (operation->input_width < 8 && operation->input_width > 2)
|
||||
operation->pooling_first_pixel = false;
|
||||
expand_depthwise(subgraph, operation);
|
||||
}
|
||||
} else if (operation->output_channels > 1) {
|
||||
reorder_for_hw_depthwise(subgraph, operation);
|
||||
}
|
||||
}
|
||||
|
||||
if (operation->stride > 1 && !operation->pooling_first_pixel)
|
||||
|
|
@ -511,11 +628,31 @@ etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
|
|||
operation->input_tensor_sizes[0] = operation->input_width *
|
||||
operation->input_height *
|
||||
operation->input_channels;
|
||||
ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);
|
||||
|
||||
operation->output_tensor_sizes[0] = operation->output_width *
|
||||
operation->output_height *
|
||||
operation->output_channels;
|
||||
|
||||
if (poperation->conv.weight_tensor->scales) {
|
||||
float scale;
|
||||
uint8_t zero_point;
|
||||
|
||||
assert(poperation->conv.weight_tensor->zero_points);
|
||||
|
||||
ML_DBG("\nWARNING: The weights in this model are quantized using a per-channel scheme.\n"
|
||||
"The hardware doesn't support it natively, so it will be workarounded by requantizing\n"
|
||||
"on the fly, with a loss of accuracy. It is recommended to quantize the model with the\n"
|
||||
"per-tensor scheme. See https://ai.google.dev/edge/litert/models/quantization_spec.\n\n");
|
||||
|
||||
calc_quantization(subgraph, poperation, operation, &scale, &zero_point);
|
||||
operation->weight_scale = scale;
|
||||
operation->weight_zero_point = zero_point;
|
||||
requantize_weights(subgraph, poperation, operation);
|
||||
requantize_bias(subgraph, poperation, operation);
|
||||
|
||||
if (nn_core_version >= 8 && poperation->conv.depthwise)
|
||||
operation->input_channels = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static float
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue