From fc70406bddfa74b324210256fa0444445641d4ba Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 29 Jan 2026 14:53:02 +0100 Subject: [PATCH] ethosu: Expand pooling to U85 Part-of: --- src/gallium/drivers/ethosu/ethosu_cmd.c | 110 +++++++++++++++++++++- src/gallium/drivers/ethosu/ethosu_lower.c | 25 +++-- src/gallium/drivers/ethosu/ethosu_ml.h | 13 ++- 3 files changed, 138 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.c b/src/gallium/drivers/ethosu/ethosu_cmd.c index 2f76a998d8e..36e1a781aaf 100644 --- a/src/gallium/drivers/ethosu/ethosu_cmd.c +++ b/src/gallium/drivers/ethosu/ethosu_cmd.c @@ -24,6 +24,16 @@ enum ethosu_op_to_scale { OP_B = 2, }; +enum ethosu_microblock { /* codes for the MICROBLOCK field of NPU_SET_ACC_FORMAT; names read U<height>X<width>, matching the height << 4 | width keys in emit_acc_format */ + MICROBLOCK_U1X1 = 0, + MICROBLOCK_U1X2 = 1, + MICROBLOCK_U1X4 = 2, + MICROBLOCK_U2X2 = 3, + MICROBLOCK_U2X4 = 4, + MICROBLOCK_U4X4 = 5, + MICROBLOCK_U2X1 = 6, /* U85 elementwise ublock */ +}; + static void ethosu_ensure_cmdstream(struct ethosu_subgraph *subgraph) { @@ -310,6 +320,49 @@ emit_shram_registers(struct ethosu_subgraph *subgraph, struct ethosu_operation * EMIT0(NPU_SET_ACC_FORMAT, operation->block_config.acc_type); } +static void +emit_acc_format(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) /* emits NPU_SET_ACC_FORMAT: format, input and output fields are fixed at 0 and the microblock comes from block_config.ofm_ublock */ +{ + /* Currently only 8-bit quantized operations are supported, so + * acc_format=INT_32 (0), acc_input=I8 (0), acc_output=I8 (0). + * These would need to vary for 16-bit or mixed-precision ops. 
*/ + unsigned acc_format = 0; + unsigned acc_input = 0; + unsigned acc_output = 0; + enum ethosu_microblock block = MICROBLOCK_U1X1; + + switch (operation->block_config.ofm_ublock.height << 4 | operation->block_config.ofm_ublock.width) { /* key packs ublock height above width, e.g. 0x21 is height 2, width 1 */ + case 0x11: + block = MICROBLOCK_U1X1; + break; + case 0x12: + block = MICROBLOCK_U1X2; + break; + case 0x14: + block = MICROBLOCK_U1X4; + break; + case 0x21: + block = MICROBLOCK_U2X1; + break; + case 0x22: + block = MICROBLOCK_U2X2; + break; + case 0x24: + block = MICROBLOCK_U2X4; + break; + case 0x44: + block = MICROBLOCK_U4X4; + break; + default: + assert(false && "Invalid microblock"); /* when assertions are compiled out, block keeps its MICROBLOCK_U1X1 initializer */ + } + + EMIT0(NPU_SET_ACC_FORMAT, NPU_SET_ACC_FORMAT_ACC_FORMAT(acc_format) | + NPU_SET_ACC_FORMAT_ACC_INPUT(acc_input) | + NPU_SET_ACC_FORMAT_ACC_OUTPUT(acc_output) | + NPU_SET_ACC_FORMAT_MICROBLOCK(block)); +} + static void emit_common(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, enum ethosu_op_to_scale op_to_scale) { @@ -393,6 +446,28 @@ pooling_emit_ofm_scaling( return scale; } +static unsigned +sum_emit_ofm_scaling(double input1_scale, double output_scale, unsigned kernel_height, unsigned kernel_width, uint32_t *out_shift) /* returns the OFM scale multiplier and stores the matching shift through out_shift */ +{ + int kernel_elements = kernel_height * kernel_width; + double rescale = input1_scale / output_scale; + int rescale_bits = 0; /* always 0 here, so n below is N - 1 */ + int N = 31; + int exp; + + frexp((double)(kernel_elements - 1), &exp); /* only the exponent written to exp is used; the return value is discarded */ + + int n = (N - 1) - rescale_bits; + uint64_t numerator = (1ULL << (n + exp)) + (1ULL << exp); + uint32_t scale = (uint32_t)ceil(rescale * (double)numerator / kernel_elements); /* NOTE(review): narrowed to 32 bits without a range check */ + int shift = n + exp; + + assert(shift >= 0 && shift < 64); /* NOTE(review): runs after n + exp was already used as a shift count for numerator */ + + *out_shift = shift; + return scale; +} + static void emit_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) { @@ -401,7 +476,15 @@ emit_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operatio emit_common(subgraph, operation, false); - if (operation->pooling.avg) { + switch (operation->pooling.type) { /* programs NPU_SET_OFM_SCALE according to the pooling type */ + case 
ETHOSU_POOLING_TYPE_MAX: { + if (!ethosu_is_u65(ethosu_screen(subgraph->base.context->screen))) { /* not U65: scale 1 with ROUND_MODE(1), no shift field */ + EMIT1(NPU_SET_OFM_SCALE, NPU_SET_OFM_SCALE_ROUND_MODE(1), 1); + break; + } else + FALLTHROUGH; /* U65: MAX continues into the AVG scaling below; NOTE(review): the removed code emitted no OFM scale unless pooling.avg was set, confirm this is intended */ + } + case ETHOSU_POOLING_TYPE_AVG: { scale = pooling_emit_ofm_scaling( operation->ifm.scale, operation->ofm.scale, @@ -409,8 +492,29 @@ emit_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operatio operation->kernel.width, &scale_shift); - EMIT1(NPU_SET_OFM_SCALE, scale_shift, scale); + EMIT1(NPU_SET_OFM_SCALE, NPU_SET_OFM_SCALE_SHIFT(scale_shift), scale); /* shift now goes through the NPU_SET_OFM_SCALE_SHIFT field macro */ + break; } + case ETHOSU_POOLING_TYPE_SUM: { /* SUM: dedicated scale helper, shift combined with ROUND_MODE(1) */ + scale = sum_emit_ofm_scaling( + operation->ifm.scale, + operation->ofm.scale, + operation->kernel.height, + operation->kernel.width, + &scale_shift); + + EMIT1(NPU_SET_OFM_SCALE, NPU_SET_OFM_SCALE_SHIFT(scale_shift) | NPU_SET_OFM_SCALE_ROUND_MODE(1), scale); + break; + } + default: + UNREACHABLE("Invalid pooling type"); + } + + emit_block_config(subgraph, operation); + if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen))) /* U65 programs the SHRAM registers, other devices the accumulator format */ + emit_shram_registers(subgraph, operation); + else + emit_acc_format(subgraph, operation); } static void @@ -543,7 +647,7 @@ emit_operation_code(struct ethosu_subgraph *subgraph, struct ethosu_operation *o break; case ETHOSU_OPERATION_TYPE_POOLING: - EMIT0(NPU_OP_POOL, operation->pooling.avg); + EMIT0(NPU_OP_POOL, operation->pooling.type); /* the enum value is passed straight through as the NPU_OP_POOL parameter */ break; case ETHOSU_OPERATION_TYPE_ELTWISE: EMIT0(NPU_OP_ELEMENTWISE, 0x1); diff --git a/src/gallium/drivers/ethosu/ethosu_lower.c b/src/gallium/drivers/ethosu/ethosu_lower.c index 8abfde2603c..1006ce09dee 100644 --- a/src/gallium/drivers/ethosu/ethosu_lower.c +++ b/src/gallium/drivers/ethosu/ethosu_lower.c @@ -166,7 +166,17 @@ ethosu_lower_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) { operation->type = ETHOSU_OPERATION_TYPE_POOLING; - operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG; + + switch (poperation->pooling.type) { /* map the PIPE_ML_POOLING_TYPE_* value; only MAX and AVG are handled */ + 
case PIPE_ML_POOLING_TYPE_MAX: + operation->pooling.type = ETHOSU_POOLING_TYPE_MAX; + break; + case PIPE_ML_POOLING_TYPE_AVG: + operation->pooling.type = ETHOSU_POOLING_TYPE_AVG; + break; + default: + assert(0 && "Unsupported pooling type"); /* NOTE(review): when assertions are compiled out this switch leaves pooling.type unassigned */ + } set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); @@ -203,13 +213,16 @@ ethosu_lower_concatenation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) { operation->type = ETHOSU_OPERATION_TYPE_POOLING; - operation->pooling.avg = true; + + if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen))) { /* U65 keeps AVG pooling with natural rounding */ + operation->pooling.type = ETHOSU_POOLING_TYPE_AVG; + operation->round_mode = ETHOSU_ROUNDING_NATURAL; + } else + operation->pooling.type = ETHOSU_POOLING_TYPE_SUM; /* other devices use SUM pooling; round_mode is no longer set on this path */ set_feature_maps(poperation->input_tensors[input_idx], poperation->output_tensors[0], operation); operation->ofm.shape.depth = operation->ifm.shape.depth; - operation->round_mode = ETHOSU_ROUNDING_NATURAL; - operation->kernel.height = 1; operation->kernel.width = 1; operation->kernel.stride_y = 1; @@ -238,7 +251,7 @@ ethosu_lower_resize(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) { operation->type = ETHOSU_OPERATION_TYPE_POOLING; - operation->pooling.avg = true; + operation->pooling.type = ETHOSU_POOLING_TYPE_AVG; /* AVG is 1, the value the removed avg = true passed to NPU_OP_POOL */ set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); @@ -261,7 +274,7 @@ ethosu_lower_strided_slice(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) { operation->type = ETHOSU_OPERATION_TYPE_POOLING; - operation->pooling.avg = true; + operation->pooling.type = ETHOSU_POOLING_TYPE_AVG; /* was avg = true */ set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); operation->ifm.shape = operation->ofm.shape; diff --git a/src/gallium/drivers/ethosu/ethosu_ml.h b/src/gallium/drivers/ethosu/ethosu_ml.h index 0aa1d1c08c4..e582ac08efa 100644 --- a/src/gallium/drivers/ethosu/ethosu_ml.h +++ 
b/src/gallium/drivers/ethosu/ethosu_ml.h @@ -130,6 +130,17 @@ struct ethosu_block_config { bool is_partkernel; }; +enum ethosu_pooling_type { /* values are passed unchanged as the NPU_OP_POOL parameter by emit_operation_code; within this patch only MAX, AVG and SUM are assigned or handled */ + ETHOSU_POOLING_TYPE_MAX = 0, + ETHOSU_POOLING_TYPE_AVG, /* 1 */ + ETHOSU_POOLING_TYPE_REDUCE_SUM, /* 2 */ + ETHOSU_POOLING_TYPE_SUM, /* 3 */ + ETHOSU_POOLING_TYPE_NONE, /* 4 */ + ETHOSU_POOLING_TYPE_MIN, /* 5 */ + ETHOSU_POOLING_TYPE_ARGMAX_X, /* 6 */ + ETHOSU_POOLING_TYPE_ARGMAX_Y, /* 7 */ +}; + #define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT*/ struct ethosu_operation { @@ -147,7 +158,7 @@ struct ethosu_operation { } conv; struct { - bool avg; /* true for avg, false for max */ + enum ethosu_pooling_type type; /* replaces the former bool avg */ } pooling; struct {