From a305dfd54bf4a4b94c8f0afdde185391f16620b3 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)"
Date: Tue, 10 Mar 2026 20:13:24 -0500
Subject: [PATCH] ethosu: Add logistic and TANH operations

Logistic and TANH operations are similar and both lower to a pooling
operation with a LUT.

Signed-off-by: Rob Herring (Arm)
Part-of:
---
 src/gallium/drivers/ethosu/ethosu_cmd.c   |  28 +++++-
 src/gallium/drivers/ethosu/ethosu_coefs.c |  18 ++++
 src/gallium/drivers/ethosu/ethosu_coefs.h |   6 ++
 src/gallium/drivers/ethosu/ethosu_lower.c | 106 ++++++++++++++++++++++
 src/gallium/drivers/ethosu/ethosu_ml.c    |   2 +
 src/gallium/drivers/ethosu/ethosu_ml.h    |   9 ++
 6 files changed, 164 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.c b/src/gallium/drivers/ethosu/ethosu_cmd.c
index 4eeefc557f7..a74df5debe1 100644
--- a/src/gallium/drivers/ethosu/ethosu_cmd.c
+++ b/src/gallium/drivers/ethosu/ethosu_cmd.c
@@ -285,7 +285,10 @@ emit_activation(struct ethosu_subgraph *subgraph, struct ethosu_operation *opera
    if (operation->type == ETHOSU_OPERATION_TYPE_ELTWISE)
       min = operation->eltwise.activation_min;
 
-   EMIT0(NPU_SET_ACTIVATION, 0x0);
+   if (operation->type == ETHOSU_OPERATION_TYPE_POOLING)
+      EMIT0(NPU_SET_ACTIVATION, operation->pooling.activation);
+   else
+      EMIT0(NPU_SET_ACTIVATION, 0x0);
 
    if (operation->ofm.is_signed) {
       EMIT0(NPU_SET_ACTIVATION_MIN, 0xff80);
@@ -840,8 +843,8 @@ emit_dma(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
 {
    EMIT0(NPU_SET_DMA0_SRC_REGION, COEFS_REGION);
    EMIT1(NPU_SET_DMA0_SRC, 0x0, operation->dma.address);
-   EMIT0(NPU_SET_DMA0_DST_REGION, SCRATCH_REGION);
-   EMIT1(NPU_SET_DMA0_DST, 0x0, 0x0);
+   EMIT0(NPU_SET_DMA0_DST_REGION, operation->dma.dst_region);
+   EMIT1(NPU_SET_DMA0_DST, 0x0, operation->dma.dst_address);
    EMIT1(NPU_SET_DMA0_LEN, 0x0, operation->dma.size);
 }
 
@@ -978,10 +981,25 @@ fill_memory_accesses(struct ethosu_subgraph *subgraph)
          operation->read_accesses[0].address = operation->dma.address;
          operation->read_accesses[0].size = operation->dma.size;
 
-         operation->write_accesses[0].region = SCRATCH_REGION;
-         operation->write_accesses[0].address = 0x0;
+         operation->write_accesses[0].region = operation->dma.dst_region;
+         operation->write_accesses[0].address = operation->dma.dst_address;
          operation->write_accesses[0].size = operation->dma.size;
 
+         break;
+      case ETHOSU_OPERATION_TYPE_POOLING:
+         /* LUT activations additionally read the table from SHRAM. */
+         if (operation->pooling.activation >= ETHOSU_POOLING_ACTIVATION_LUT(0)) {
+            operation->read_accesses[1].region = LUT_REGION;
+            operation->read_accesses[1].address = SHRAM_LUT_BASE(operation->pooling.activation & 0xf);
+            operation->read_accesses[1].size = LUT8_SIZE;
+         }
+         operation->read_accesses[0].region = operation->ifm.region;
+         operation->read_accesses[0].address = operation->ifm.tiles.addresses[0];
+         operation->read_accesses[0].size = operation->ifm.shape.height * operation->ifm.shape.width * operation->ifm.shape.depth;
+
+         operation->write_accesses[0].region = operation->ofm.region;
+         operation->write_accesses[0].address = operation->ofm.tiles.addresses[0];
+         operation->write_accesses[0].size = operation->ofm.shape.height * operation->ofm.shape.width * operation->ofm.shape.depth;
          break;
       default:
          operation->read_accesses[0].region = IO_REGION;
diff --git a/src/gallium/drivers/ethosu/ethosu_coefs.c b/src/gallium/drivers/ethosu/ethosu_coefs.c
index 328a711e6da..57fb21cc772 100644
--- a/src/gallium/drivers/ethosu/ethosu_coefs.c
+++ b/src/gallium/drivers/ethosu/ethosu_coefs.c
@@ -160,3 +160,21 @@ fill_coefs(struct ethosu_subgraph *subgraph,
    memcpy(subgraph->coefs + operation->conv.weights.address, weights, operation->conv.weights.size);
    free(weights);
 }
+
+/* Append a LUT8_SIZE-byte lookup table to the subgraph's coefficient buffer
+ * and record its location in operation->pooling.lut.
+ *
+ * NOTE(review): the realloc() result is not checked; this function has no
+ * way to report an allocation failure to its caller.
+ */
+void
+fill_lut(struct ethosu_subgraph *subgraph,
+         struct ethosu_operation *operation,
+         void *lut)
+{
+   operation->pooling.lut.region = COEFS_REGION;
+   operation->pooling.lut.address = subgraph->coefs_used;
+   subgraph->coefs_used += LUT8_SIZE;
+   subgraph->coefs = realloc(subgraph->coefs, subgraph->coefs_used);
+   memcpy(subgraph->coefs + operation->pooling.lut.address, lut, LUT8_SIZE);
+}
diff --git a/src/gallium/drivers/ethosu/ethosu_coefs.h b/src/gallium/drivers/ethosu/ethosu_coefs.h
index 472039ee8d6..3caeb5b6119 100644
--- a/src/gallium/drivers/ethosu/ethosu_coefs.h
+++ b/src/gallium/drivers/ethosu/ethosu_coefs.h
@@ -15,4 +15,10 @@ fill_coefs(struct ethosu_subgraph *subgraph,
            uint8_t *weight_data,
            unsigned weight_size);
 
+/* Append the LUT8_SIZE-byte table at lut to the subgraph's coefficients. */
+void
+fill_lut(struct ethosu_subgraph *subgraph,
+         struct ethosu_operation *operation,
+         void *lut);
+
 #endif /* ETHOSU_COEFS_H */
diff --git a/src/gallium/drivers/ethosu/ethosu_lower.c b/src/gallium/drivers/ethosu/ethosu_lower.c
index 49818a5f0a6..2cd7de5047a 100644
--- a/src/gallium/drivers/ethosu/ethosu_lower.c
+++ b/src/gallium/drivers/ethosu/ethosu_lower.c
@@ -275,6 +275,87 @@ ethosu_lower_pooling(struct ethosu_subgraph *subgraph,
    ethosu_sched_operation(subgraph, operation);
 }
 
+/* Logistic function, saturated to exactly 0 or 1 once |x| reaches 8. */
+static double
+clamp_sigmoid8(double x)
+{
+   if (x <= -8.0)
+      return 0.0;
+   else if (x >= 8.0)
+      return 1.0;
+   else
+      return (1.0 / (1.0 + exp(-x)));
+}
+
+/* Fill lut with one quantized func() result per quantized input value, in
+ * ascending input order. Inputs are dequantized with the IFM scale and zero
+ * point; results are requantized with the OFM scale and zero point and
+ * clamped to the input's quantized range.
+ */
+static void
+ethosu_create_lut(struct ethosu_operation *operation, uint8_t *lut, double (*func)(double))
+{
+   double ifm_scale = operation->ifm.scale;
+   double ofm_scale = operation->ofm.scale;
+   int zpIn = operation->ifm.zero_point;
+   int zpOut = operation->ofm.zero_point;
+
+   int qMin = operation->ifm.is_signed ? -128 : 0;
+   int qMax = operation->ifm.is_signed ? 127 : 255;
+
+   for (int x = qMin; x <= qMax; ++x, lut++) {
+      double xReal = ifm_scale * (double)(x - zpIn);
+      double yReal = func(xReal);
+      int lutVal = (int)round((double)zpOut + yReal / ofm_scale);
+      lutVal = MIN2(qMax, MAX2(qMin, lutVal));
+      *lut = lutVal;
+   }
+}
+
+/* Describe the DMA that copies the pooling operation's LUT from the
+ * coefficient buffer into the SHRAM LUT slot selected by its activation.
+ */
+static void
+ethosu_lower_lut_dma(struct ethosu_subgraph *subgraph,
+                     const struct pipe_ml_operation *poperation,
+                     struct ethosu_operation *pool_operation,
+                     struct ethosu_operation *operation)
+{
+   operation->type = ETHOSU_OPERATION_TYPE_DMA;
+   operation->dma.address = pool_operation->pooling.lut.address;
+   operation->dma.size = LUT8_SIZE;
+   operation->dma.dst_region = LUT_REGION;
+   operation->dma.dst_address = SHRAM_LUT_BASE(pool_operation->pooling.activation & 0xf);
+}
+
+/* Lower a LUT-based activation to an average pooling operation that
+ * applies func through a lookup table.
+ */
+static void
+ethosu_lower_lut(struct ethosu_subgraph *subgraph,
+                 const struct pipe_ml_operation *poperation,
+                 struct ethosu_operation *operation, double (*func)(double))
+{
+   uint8_t lut[LUT8_SIZE];
+
+   operation->type = ETHOSU_OPERATION_TYPE_POOLING;
+   operation->round_mode = ETHOSU_ROUNDING_NATURAL;
+   operation->pooling.type = ETHOSU_POOLING_TYPE_AVG;
+   operation->pooling.activation = ETHOSU_POOLING_ACTIVATION_LUT(0);
+
+   set_feature_maps(subgraph, poperation->input_tensors[0], poperation->output_tensors[0], operation);
+
+   ethosu_create_lut(operation, lut, func);
+   fill_lut(subgraph, operation, lut);
+
+   /* The LUT handles zero point and scale, so make them equal */
+   operation->ofm.zero_point = operation->ifm.zero_point;
+   operation->ofm.scale = operation->ifm.scale;
+
+   allocate_feature_maps(subgraph, operation);
+   ethosu_sched_operation(subgraph, operation);
+}
+
 static void
 ethosu_lower_concatenation(struct ethosu_subgraph *subgraph,
                            const struct pipe_ml_operation *poperation,
@@ -431,6 +512,7 @@ ethosu_lower_dma(struct ethosu_subgraph *subgraph,
 
    operation->dma.address = conv_operation->conv.scales.address;
    operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size;
+   operation->dma.dst_region = SCRATCH_REGION;
 
    conv_operation->conv.scales.region = SCRATCH_REGION;
    conv_operation->conv.scales.address = 0;
@@ -537,6 +619,30 @@ ethosu_lower_graph(struct ethosu_subgraph *subgraph,
          break;
       }
 
+      case PIPE_ML_OPERATION_TYPE_LOGISTIC: {
+         ethosu_lower_lut(subgraph, &poperations[i], &operation, clamp_sigmoid8);
+
+         /* Queue the LUT upload ahead of the pooling op that uses it. */
+         struct ethosu_operation dma_operation = {0};
+         ethosu_lower_lut_dma(subgraph, &poperations[i], &operation, &dma_operation);
+         util_dynarray_append(&subgraph->operations, dma_operation);
+
+         util_dynarray_append(&subgraph->operations, operation);
+         break;
+      }
+
+      case PIPE_ML_OPERATION_TYPE_TANH: {
+         ethosu_lower_lut(subgraph, &poperations[i], &operation, tanh);
+
+         /* Queue the LUT upload ahead of the pooling op that uses it. */
+         struct ethosu_operation dma_operation = {0};
+         ethosu_lower_lut_dma(subgraph, &poperations[i], &operation, &dma_operation);
+         util_dynarray_append(&subgraph->operations, dma_operation);
+
+         util_dynarray_append(&subgraph->operations, operation);
+         break;
+      }
+
       case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: {
          ethosu_lower_strided_slice(subgraph, &poperations[i], &operation);
          util_dynarray_append(&subgraph->operations, operation);
diff --git a/src/gallium/drivers/ethosu/ethosu_ml.c b/src/gallium/drivers/ethosu/ethosu_ml.c
index aa810c12842..212b55664d1 100644
--- a/src/gallium/drivers/ethosu/ethosu_ml.c
+++ b/src/gallium/drivers/ethosu/ethosu_ml.c
@@ -145,6 +145,8 @@ ethosu_ml_operation_supported(struct pipe_ml_device *pdevice,
    case PIPE_ML_OPERATION_TYPE_POOLING:
    case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE:
    case PIPE_ML_OPERATION_TYPE_PAD:
+   case PIPE_ML_OPERATION_TYPE_LOGISTIC:
+   case PIPE_ML_OPERATION_TYPE_TANH:
       supported = true;
       break;
    case PIPE_ML_OPERATION_TYPE_RESIZE: {
diff --git a/src/gallium/drivers/ethosu/ethosu_ml.h b/src/gallium/drivers/ethosu/ethosu_ml.h
index 4530997fa74..fcaf45003dc 100644
--- a/src/gallium/drivers/ethosu/ethosu_ml.h
+++ b/src/gallium/drivers/ethosu/ethosu_ml.h
@@ -16,6 +16,8 @@
 #define SHRAM_RESERVED_END_BANKS 2
 #define SHRAM_TOTAL_BANKS SHRAM_BANKS
 #define SHRAM_BANK_SIZE_BYTES 1024
+#define LUT8_SIZE 256 /* bytes in one LUT (256 one-byte entries) */
+#define SHRAM_LUT_BASE(lut) (46 * SHRAM_BANK_SIZE_BYTES + (lut) * LUT8_SIZE)
 #define ACC_BITS 32 /* Use for now always 32-bit accumulators */
 #define IFM_GRANULE 8
 #define ACC_GRANULE 16
@@ -34,6 +36,7 @@ extern struct ethosu_block SUB_KERNEL_MAX;
 #define COEFS_REGION 0
 #define IO_REGION 1
 #define SCRATCH_REGION 2
+#define LUT_REGION 0x103 /* Internal SHRAM */
 
 enum ethosu_operation_type {
    ETHOSU_OPERATION_TYPE_CONVOLUTION,
@@ -176,6 +179,8 @@ enum ethosu_pooling_type {
    ETHOSU_POOLING_TYPE_ARGMAX_Y,
 };
 
+#define ETHOSU_POOLING_ACTIVATION_LUT(n) (0x10 | (n))
+
 #define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT*/
 
 struct ethosu_operation {
@@ -195,6 +200,8 @@ struct ethosu_operation {
       struct {
          enum ethosu_pooling_type type;
          bool nop;
+         uint8_t activation;
+         struct ethosu_address_range lut;
       } pooling;
 
       struct {
@@ -206,7 +213,9 @@ struct ethosu_operation {
 
       struct {
          unsigned address;
+         unsigned dst_address;
          long size;
+         unsigned dst_region;
       } dma;
    };
 