diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.c b/src/gallium/drivers/etnaviv/etnaviv_ml.c
index ce5482714c9..659c588f38a 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml.c
@@ -13,6 +13,7 @@
 #include "etnaviv_debug.h"
 #include "etnaviv_emit.h"
 #include "etnaviv_ml_nn.h"
+#include "etnaviv_ml_tp.h"
 #include "etnaviv_ml.h"
 
 struct pipe_resource *
@@ -60,6 +61,16 @@ etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned
    ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
 }
 
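+/* Convolutions with a stride bigger than 1 aren't handled directly by the NN
+ * cores, so their input is first reshuffled on the TP cores into a stride-1
+ * equivalent (see etna_ml_lower_reshuffle()). The input-width checks below
+ * are a heuristic for the cases this path is known to handle, not a hardware
+ * limit.
+ */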
+static bool
+needs_reshuffle(const struct pipe_ml_operation *poperation)
+{
+   bool has_stride = poperation->conv.stride_x > 1 || poperation->conv.stride_y > 1;
+   bool pointwise = poperation->conv.pointwise;
+   unsigned input_width = poperation->input_tensor->dims[1];
+
+   return has_stride && !(poperation->conv.depthwise && (input_width > 5 || input_width < 3)) && !pointwise;
+}
+
 static void
 reference_tensor_with_offset(struct etna_ml_subgraph *subgraph,
                              unsigned src_tensor,
@@ -84,6 +95,10 @@ dump_graph(struct list_head *etna_operations)
    unsigned i = 0;
    list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
       switch(operation->type) {
+      case ETNA_JOB_TYPE_TP:
+         ML_DBG("%3d %-4s %3d %3d",
+                i, "TP", operation->input_tensor, operation->output_tensor);
+         break;
       case ETNA_JOB_TYPE_NN:
          ML_DBG("%3d %-4s %3d %3d in2: %3d",
                 i, "NN", operation->input_tensor, operation->output_tensor, operation->add_input_tensor);
@@ -107,6 +122,12 @@ lower_operations(struct etna_ml_subgraph *subgraph,
       switch(poperation->type) {
       case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
          unsigned input_tensor = poperation->input_tensor->index;
+         if (needs_reshuffle(poperation)) {
+            struct etna_operation *operation = calloc(1, sizeof(*operation));
+            etna_ml_lower_reshuffle(subgraph, poperation, operation, &input_tensor);
+            list_addtail(&operation->link, etna_operations);
+         }
+
          struct etna_operation *operation = calloc(1, sizeof(*operation));
          etna_ml_lower_convolution(subgraph, poperation, operation);
          operation->input_tensor = input_tensor;
@@ -129,6 +150,7 @@ lower_operations(struct etna_ml_subgraph *subgraph,
          struct etna_operation *operation = calloc(1, sizeof(*operation));
          unsigned input_tensor = poperations[0].input_tensor->index;
          unsigned output_tensor;
+         etna_ml_lower_transpose(subgraph, &poperations[0], operation, &output_tensor);
          list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
            if (operation->input_tensor == input_tensor)
               operation->input_tensor = output_tensor;
@@ -150,6 +172,22 @@ lower_operations(struct etna_ml_subgraph *subgraph,
                                      operation->input_tensor_size / 2);
    }
 
+   /* Detranspose any output tensors that aren't inputs to other operations
+    * and have output channels; these are the outputs of the graph.
+    */
+   list_for_each_entry_safe(struct etna_operation, operation, etna_operations, link) {
+      struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensor);
+      if (res != NULL)
+         continue;
+
+      if (operation->output_channels > 1) {
+         struct etna_operation *transpose_operation = calloc(1, sizeof(*operation));
+         etna_ml_lower_detranspose(subgraph, operation, transpose_operation);
+         operation->output_tensor = transpose_operation->input_tensor;
+         list_add(&transpose_operation->link, &operation->link);
+      }
+   }
+
    /* Create any output tensors that aren't inputs to other operations, these
     * are the outputs of the graph.
     */
@@ -236,6 +274,9 @@ etna_ml_subgraph_create(struct pipe_context *pcontext,
       case ETNA_JOB_TYPE_NN:
          etna_ml_compile_operation_nn(subgraph, operation, &instruction);
         break;
+      case ETNA_JOB_TYPE_TP:
+         etna_ml_compile_operation_tp(subgraph, operation, &instruction);
+         break;
       }
 
       util_dynarray_append(&subgraph->operations, struct etna_vip_instruction, instruction);
@@ -359,6 +400,12 @@ etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psub
 
      if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
         switch (operation->type) {
+        case ETNA_JOB_TYPE_TP:
+           for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
+              dump_buffer(operation->configs[j], "tp", dump_id);
+              dump_id++;
+           }
+           break;
        case ETNA_JOB_TYPE_NN:
           dump_buffer(operation->configs[0], "nn", dump_id);
           dump_buffer(operation->coefficients, "compressed", dump_id);
@@ -389,6 +436,9 @@ etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psub
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->output)->bo, ETNA_RELOC_WRITE);
 
      switch (operation->type) {
+     case ETNA_JOB_TYPE_TP:
+        etna_ml_emit_operation_tp(subgraph, operation, i);
+        break;
      case ETNA_JOB_TYPE_NN:
         etna_ml_emit_operation_nn(subgraph, operation, i);
         break;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.h b/src/gallium/drivers/etnaviv/etnaviv_ml.h
index 33b757903ae..4b77dc99253 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml.h
@@ -13,6 +13,13 @@
 
 enum etna_job_type {
    ETNA_JOB_TYPE_NN,
+   ETNA_JOB_TYPE_TP,
+};
+
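+/* Jobs that run on the TP (tensor processing) cores. TRANSPOSE and
+ * DETRANSPOSE convert tensors between the NHWC layout they have in memory
+ * and the layout the NN cores expect; RESHUFFLE rewrites the input of a
+ * strided convolution so it can run with stride 1.
+ */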
+enum etna_ml_tp_type {
+   ETNA_ML_TP_TRANSPOSE,
+   ETNA_ML_TP_DETRANSPOSE,
+   ETNA_ML_TP_RESHUFFLE,
 };
 
 struct etna_ml_subgraph {
@@ -40,6 +47,7 @@ struct etna_operation {
    struct list_head link;
 
    enum etna_job_type type;
+   enum etna_ml_tp_type tp_type;
 
    bool addition;
    bool depthwise;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c
new file mode 100644
index 00000000000..1433f9595fd
--- /dev/null
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c
@@ -0,0 +1,780 @@
+/*
+ * Copyright (c) 2023-2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/u_inlines.h"
+
+#include "etnaviv_context.h"
+#include "etnaviv_debug.h"
+#include "etnaviv_emit.h"
+#include "etnaviv_ml_tp.h"
+
+#define FIELD(field, bits) uint32_t field : bits;
+
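+/* A TP job is described by a 31-word configuration buffer that the core
+ * fetches from memory. The field names and widths below come from observing
+ * the blob driver; the unusedN fields are bits with no known function.
+ */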
+struct etna_tp_params {
+   /* 0 */
+   FIELD(in_image_x_size, 16)
+   FIELD(unused0, 16)
+
+   /* 1 */
+   FIELD(in_image_y_size, 16)
+   FIELD(in_image_z_size, 16)
+
+   /* 2 */
+   FIELD(in_image_stride, 16)
+   FIELD(unused1, 16)
+
+   /* 3 */
+   FIELD(in_image_slice, 32)
+
+   /* 4 */
+   FIELD(in_window_x_start, 16)
+   FIELD(in_window_y_start, 16)
+
+   /* 5 */
+   FIELD(in_window_x_end, 16)
+   FIELD(in_window_y_end, 16)
+
+   /* 6 */
+   FIELD(in_tile_sequence, 2)
+   FIELD(in_tile_global_mem, 1)
+   FIELD(in_image_global_mem, 1)
+   FIELD(alu_i2f_enable, 1)
+   FIELD(alu_square_enable, 1)
+   FIELD(alu_horz_processing, 3) /* Watch out, it is split in two in the blob */
+   FIELD(alu_horz_proc_count, 6)
+   FIELD(alu_horz_proc_stride, 1)
+   FIELD(alu_vert_processing, 2)
+   FIELD(unused2, 1)
+   FIELD(alu_vert_proc_count, 6)
+   FIELD(alu_vert_proc_stride, 1)
+   FIELD(alu_nms_enable, 1)
+   FIELD(alu_pwl_enable, 1)
+   FIELD(alu_mult_enable, 1)
+   FIELD(alu_f2i_enable, 1)
+   FIELD(alu_load_pwl_lut, 1)
+   FIELD(alu_load_pwl_lut_global_mem, 1)
+
+   /* 7 */
+   FIELD(in_tile_list_address, 32)
+
+   /* 8 */
+   FIELD(in_tile_x_size, 16)
+   FIELD(in_tile_y_size, 16)
+
+   /* 9 */
+   FIELD(in_tile_x_inc, 16)
+   FIELD(in_tile_y_inc, 16)
+
+   /* 10 */
+   FIELD(in_image_base_address, 32)
+
+   /* 11 */
+   FIELD(alu_load_pwl_lut_address, 32)
+
+   /* 12 */
+   FIELD(out_tile_skip_at_border, 1)
+   FIELD(out_image_global_mem, 1)
+   FIELD(out_loop_1_reset, 1)
+   FIELD(out_loop_2_reset, 1)
+   FIELD(out_loop_3_reset, 1)
+   FIELD(out_brick_mode, 1)
+   FIELD(alu_z_filter_mode, 1)
+   FIELD(unused3, 1)
+   FIELD(in_window_z_start_overfetch, 2)
+   FIELD(unused4, 1)
+   FIELD(in_window_z_end_overfetch, 2)
+   FIELD(unused5, 1)
+   FIELD(alu_square_preshift, 4)
+   FIELD(in_image_data_type, 3)
+   FIELD(out_image_data_type, 3)
+   FIELD(unused6, 4)
+   FIELD(alu_pwl_sign_support, 1)
+   FIELD(alu_relu_enable, 1)
+   FIELD(no_flush, 1)
+   FIELD(last, 1)
+
+   /* 13 */
+   FIELD(out_image_base_address, 32)
+
+   /* 14 */
+   FIELD(out_loop_0_inc, 32)
+
+   /* 15 */
+   FIELD(out_loop_1_inc, 32)
+
+   /* 16 */
+   FIELD(out_loop_0_count, 16)
+   FIELD(out_loop_1_count, 16)
+
+   /* 17 */
+   FIELD(out_loop_2_inc, 32)
+
+   /* 18 */
+   FIELD(out_loop_3_inc, 32)
+
+   /* 19 */
+   FIELD(out_loop_2_count, 16)
+   FIELD(out_loop_3_count, 16)
+
+   /* 20 */
+   FIELD(out_loop_4_inc, 32)
+
+   /* 21 */
+   FIELD(out_loop_5_inc, 32)
+
+   /* 22 */
+   FIELD(out_loop_4_count, 16)
+   FIELD(out_loop_5_count, 16)
+
+   /* 23 */
+   FIELD(out_loop_6_inc, 32)
+
+   /* 24 */
+   FIELD(alu_filter_pwl_swap, 1)
+   FIELD(flat_rounding_mode, 2)
+   FIELD(integer_rounding_mode, 2)
+   FIELD(alu_input_preshift, 5)
+   FIELD(alu_output_postshift, 5)
+   FIELD(alu_reorder_bits_used, 4)
+   FIELD(alu_reorder_loop_2_mode, 1)
+   FIELD(unused7, 4)
+   FIELD(in_image_border_mode, 2)
+   FIELD(alu_output_postshift_5_6, 2)
+   FIELD(unused8, 4)
+
+   /* 25 */
+   FIELD(in_image_circular_buf_size, 32) /* >> 6 */
+
+   /* 26 */
+   FIELD(in_image_circular_buf_end_address_plus_1, 32) /* >> 6 */
+
+   /* 27 */
+   FIELD(out_image_circular_buf_size, 32) /* >> 6 */
+
+   /* 28 */
+   FIELD(out_image_circular_buf_end_address_plus_1, 32) /* >> 6 */
+
+   /* 29 */
+   FIELD(in_image_border_const, 16)
+   FIELD(coef_zp, 8)
+   FIELD(in_zp, 8)
+
+   /* 30 */
+   FIELD(out_zp, 8)
+   FIELD(alu_output_post_multiplier, 15)
+   FIELD(unused9, 9)
+};
+
+static void
+set_default_tp_config(struct etna_tp_params *map)
+{
+   map->unused0 = 0x0;
+   map->unused1 = 0x0;
+   map->in_window_x_start = 0x0;
+   map->in_window_y_start = 0x0;
+   map->in_tile_sequence = 0x0;
+   map->in_tile_global_mem = 0x0;
+   map->in_image_global_mem = 0x1;
+   map->alu_i2f_enable = 0x1;
+   map->alu_square_enable = 0x0;
+   map->alu_horz_processing = 0x0;
+   map->alu_horz_proc_count = 0x0;
+   map->alu_horz_proc_stride = 0x0;
+   map->alu_vert_processing = 0x0;
+   map->unused2 = 0x0;
+   map->alu_vert_proc_count = 0x0;
+   map->alu_vert_proc_stride = 0x0;
+   map->alu_nms_enable = 0x0;
+   map->alu_pwl_enable = 0x0;
+   map->alu_mult_enable = 0x0;
+   map->alu_f2i_enable = 0x1;
+   map->alu_load_pwl_lut = 0x0;
+   map->alu_load_pwl_lut_global_mem = 0x0;
+   map->in_tile_list_address = 0x0;
+   map->in_tile_x_size = 0x1;
+   map->in_tile_x_inc = 0x1;
+   map->alu_load_pwl_lut_address = 0x0;
+   map->out_tile_skip_at_border = 0x0;
+   map->out_image_global_mem = 0x1;
+   map->out_loop_1_reset = 0x0;
+   map->out_loop_2_reset = 0x0;
+   map->out_loop_3_reset = 0x0;
+   map->out_brick_mode = 0x0;
+   map->alu_z_filter_mode = 0x0;
+   map->unused3 = 0x0;
+   map->in_window_z_start_overfetch = 0x0;
+   map->unused4 = 0x0;
+   map->in_window_z_end_overfetch = 0x0;
+   map->unused5 = 0x0;
+   map->alu_square_preshift = 0x0;
+   map->in_image_data_type = 0x0;
+   map->out_image_data_type = 0x0;
+   map->unused6 = 0x0;
+   map->alu_pwl_sign_support = 0x0;
+   map->alu_relu_enable = 0x0;
+   map->no_flush = 0x0;
+   map->last = 0x1;
+   map->out_loop_0_inc = 0x1;
+   map->out_loop_3_inc = 0x0;
+   map->out_loop_3_count = 0x1;
+   map->out_loop_4_inc = 0x0;
+   map->out_loop_5_inc = 0x0;
+   map->out_loop_4_count = 0x1;
+   map->out_loop_5_count = 0x1;
+   map->out_loop_6_inc = 0x0;
+   map->alu_filter_pwl_swap = 0x0;
+   map->flat_rounding_mode = 0x1;
+   map->integer_rounding_mode = 0x1;
+   map->alu_input_preshift = 0x0;
+   map->alu_output_postshift = 0x0;
+   map->alu_reorder_bits_used = 0x0;
+   map->alu_reorder_loop_2_mode = 0x0;
+   map->unused7 = 0x0;
+   map->in_image_border_mode = 0x0;
+   map->alu_output_postshift_5_6 = 0x0;
+   map->unused8 = 0x0;
+   map->in_image_border_const = 0x0;
+   map->coef_zp = 0x0;
+   map->alu_output_post_multiplier = 0x0;
+   map->unused9 = 0x0;
+}
+
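+/* Create the configuration for a job that transposes a tensor from the NHWC
+ * layout it has in memory to the layout the NN cores expect. Transpose jobs
+ * always run on a single TP core.
+ */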
+static struct etna_bo *
+create_transpose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
+                                    sizeof(struct etna_tp_params),
+                                    DRM_ETNA_GEM_CACHE_WC);
+
+   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
+
+   struct etna_tp_params *map = etna_bo_map(bo);
+
+   set_default_tp_config(map);
+
+   map->in_image_x_size = operation->input_channels;
+   map->in_image_y_size = operation->input_height;
+   map->in_image_z_size = operation->input_width;
+   map->in_image_stride = operation->input_channels;
+   map->in_image_slice = operation->input_width * operation->input_channels;
+   map->in_window_x_end = operation->input_channels - 1;
+   map->in_window_y_end = operation->input_height - 1;
+   map->in_tile_y_size = operation->input_height;
+   map->in_tile_y_inc = operation->input_height;
+
+   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
+   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
+
+   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
+   unsigned offset = etna_ml_get_offset(subgraph, operation->output_tensor);
+   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
+
+   map->out_loop_1_inc = operation->input_width * operation->input_height;
+   map->out_loop_0_count = operation->input_height;
+   map->out_loop_1_count = operation->input_channels;
+   map->out_loop_2_inc = operation->input_height;
+   map->out_loop_2_count = operation->input_width;
+   map->in_image_circular_buf_size = 0x0;
+   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
+   map->out_image_circular_buf_size = 0x0;
+   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
+   map->in_zp = operation->input_zero_point;
+   map->out_zp = operation->input_zero_point;
+   map->no_flush = 0x0;
+
+   etna_bo_cpu_fini(bo);
+
+   return bo;
+}
+
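+/* Create the configuration for a job that reverts the transposition above,
+ * so the graph's outputs are in NHWC layout again. Also a single-core job.
+ */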
+static struct etna_bo *
+create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   unsigned input_width = operation->input_width;
+   unsigned input_height = operation->input_height;
+   unsigned input_channels = operation->input_channels;
+   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
+                                    sizeof(struct etna_tp_params),
+                                    DRM_ETNA_GEM_CACHE_WC);
+
+   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
+
+   struct etna_tp_params *map = etna_bo_map(bo);
+
+   set_default_tp_config(map);
+
+   map->in_image_x_size = input_width;
+   map->in_image_y_size = input_height * input_channels;
+   map->in_image_z_size = 0x1;
+   map->in_image_stride = input_width;
+   map->in_image_slice = input_width * input_height * input_channels;
+   map->in_window_x_end = input_width - 1;
+   map->in_window_y_end = input_height * input_channels - 1;
+   map->in_tile_y_size = 0x1;
+   map->in_tile_y_inc = 0x1;
+
+   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
+   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
+
+   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
+   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);
+
+   map->out_loop_0_inc = input_channels;
+   map->out_loop_1_inc = 0x0;
+   map->out_loop_0_count = input_height;
+   map->out_loop_1_count = 0x1;
+   map->out_loop_2_inc = input_height * input_channels;
+   map->out_loop_2_count = input_width;
+   map->out_loop_3_inc = 0x1;
+   map->out_loop_3_count = input_channels;
+   map->out_loop_4_inc = input_width * input_height * input_channels;
+   map->in_image_circular_buf_size = 0x0;
+   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
+   map->out_image_circular_buf_size = 0x0;
+   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
+   map->in_zp = operation->input_zero_point;
+   map->out_zp = operation->input_zero_point;
+
+   etna_bo_cpu_fini(bo);
+
+   return bo;
+}
+
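+/* Reshuffle jobs are split among the available TP cores, each one working on
+ * a slice of the input. The special cases below reproduce the blob's
+ * behavior for the tensor sizes seen in the models tested so far and may not
+ * generalize (note the unreachable() defaults).
+ */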
+static void
+set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used)
+{
+   map->in_image_x_size = operation->input_width;
+
+   if (operation->padding_same && operation->input_channels > 1) {
+      map->in_image_y_size = operation->input_height;
+      map->in_image_z_size = operation->input_channels / tp_cores_used;
+   } else if (operation->padding_same && operation->input_channels == 1) {
+      switch(operation->input_width) {
+      case 3:
+      case 5:
+         map->in_image_y_size = operation->input_height;
+         break;
+      case 8:
+         switch(operation->weight_width) {
+         case 3:
+            map->in_image_y_size = operation->input_height;
+            break;
+         case 5:
+            map->in_image_y_size = 5;
+            break;
+         }
+         break;
+      case 80:
+      case 112:
+         switch(operation->weight_width) {
+         case 3:
+            map->in_image_y_size = operation->input_height / tp_cores_used + 2;
+            break;
+         case 5:
+            map->in_image_y_size = operation->input_height / tp_cores_used + 1;
+            break;
+         }
+         break;
+      default:
+         unreachable("Unsupported input width");
+      }
+      map->in_image_z_size = operation->input_channels;
+   } else {
+      map->in_image_y_size = operation->input_height / tp_cores_used;
+      map->in_image_z_size = operation->input_channels;
+   }
+}
+
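+/* Reshuffling is a space-to-depth transformation: a convolution with stride
+ * N becomes a stride-1 convolution on a tensor with N * N times the channels
+ * and roughly 1/N the width and height (see etna_ml_lower_reshuffle() for
+ * the exact output dimensions). tp_core identifies which of the
+ * tp_cores_used cores this configuration is for.
+ */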
+static struct etna_bo *
+create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
+                        unsigned tp_core, unsigned tp_cores_used)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   unsigned tp_core_count = ctx->screen->specs.tp_core_count;
+   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
+                                    sizeof(struct etna_tp_params),
+                                    DRM_ETNA_GEM_CACHE_WC);
+
+   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
+
+   struct etna_tp_params *map = etna_bo_map(bo);
+
+   set_default_tp_config(map);
+
+   set_input_size(operation, map, tp_cores_used);
+
+   map->in_image_stride = operation->input_width;
+   map->in_image_slice = operation->input_width * operation->input_height;
+
+   if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) {
+      if (operation->weight_width == 5 && operation->input_width < 8) {
+         map->in_window_x_start = 0xfffe;
+         map->in_window_y_start = 0xfffe;
+      } else {
+         map->in_window_x_start = 0xffff;
+         map->in_window_y_start = 0xffff;
+      }
+   } else {
+      map->in_window_x_start = 0x0;
+      map->in_window_y_start = 0x0;
+   }
+
+   map->in_window_x_end = operation->input_width - 1;
+   map->in_window_y_end = (operation->input_height / tp_cores_used) - 1;
+   map->in_tile_x_size = operation->input_width;
+   map->in_tile_x_inc = operation->input_width;
+
+   if (operation->input_width <= 8 && operation->input_channels == 1) {
+      map->in_tile_y_size = operation->input_height;
+      map->in_tile_y_inc = operation->input_height;
+   } else {
+      map->in_tile_y_size = operation->input_height / tp_cores_used;
+      map->in_tile_y_inc = operation->input_height / tp_cores_used;
+   }
+
+   if (operation->padding_same) {
+      switch(operation->weight_width) {
+      case 3:
+         map->in_window_x_end += 2;
+         if (operation->input_width < 8) {
+            map->in_tile_x_size += 3;
+            map->in_tile_y_size += 1;
+            map->in_tile_y_inc += 1;
+         } else {
+            map->in_tile_x_size += 2;
+         }
+         break;
+      case 5:
+         map->in_window_x_end += 3;
+         if (operation->input_width < 8) {
+            map->in_tile_x_size += 5;
+         } else {
+            map->in_tile_x_size += 4;
+         }
+         break;
+      default:
+         unreachable("Unsupported weight size");
+      }
+
+      if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5)
+         map->in_tile_x_size = operation->input_width / tp_cores_used + 2;
+
+      if (operation->input_width > 8 && operation->input_channels == 1) {
+         switch(operation->weight_width) {
+         case 3:
+            map->in_window_y_end = (operation->input_height / tp_cores_used) + 1;
+            break;
+         case 5:
+            map->in_window_y_end = (operation->input_height / tp_cores_used);
+            break;
+         default:
+            unreachable("Unsupported weight size");
+         }
+      } else
+         map->in_window_y_end = map->in_window_x_end;
+
+      map->in_tile_x_inc = map->in_tile_x_size;
+
+      if (operation->input_channels > 1) {
+         map->in_tile_y_size = map->in_tile_x_size;
+         map->in_tile_y_inc = map->in_tile_x_size;
+      } else {
+         map->in_tile_y_size += 2;
+         map->in_tile_y_inc += 2;
+      }
+   } else {
+      if (operation->input_width < 8) {
+         map->in_window_x_end += 1;
+         map->in_window_y_end += 1;
+         map->in_tile_x_size += 1;
+         map->in_tile_y_size += 1;
+         map->in_tile_x_inc += 1;
+         map->in_tile_y_inc += 1;
+      }
+   }
+
+   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
+   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
+
+   if (operation->padding_same)
+      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
+   else
+      map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core;
+
+   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
+   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);
+
+   if (operation->padding_same)
+      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
+   else
+      map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core;
+
+   map->out_loop_1_reset = 0x1;
+   map->out_loop_2_reset = 0x0;
+   map->out_loop_3_reset = 0x1;
+   map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2);
+   map->out_loop_1_inc = 0x1;
+   map->out_loop_0_count = 0x2;
+   map->out_loop_1_count = round(operation->input_width / 2.0);
+   map->out_loop_2_count = 0x2;
+   map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used);
+
+   if (operation->padding_same) {
+      switch(operation->weight_width) {
+      case 3:
+         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2);
+         map->out_loop_1_count += 1;
+         break;
+      case 5:
+         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2);
+         map->out_loop_1_count += 2;
+         break;
+      default:
+         unreachable("Unsupported weight size");
+      }
+
+      if (operation->input_channels == 1)
+         map->out_loop_3_count += 1;
+      else
+         map->out_loop_3_count = map->out_loop_1_count;
+   }
+
+   map->out_loop_2_inc = map->out_loop_0_inc * 2;
+   map->out_loop_3_inc = map->out_loop_1_count;
+   map->out_loop_6_inc = map->out_loop_0_inc * 4;
+
+   if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) {
+      if (tp_core > 0) {
+         map->in_image_y_size -= 2;
+         map->in_window_y_end -= 2;
+         map->in_tile_y_size -= 2;
+         map->in_tile_y_inc -= 2;
+         map->out_loop_3_count -= 1;
+      }
+
+      if (tp_core == tp_core_count - 1) {
+         map->in_image_y_size -= 2;
+      }
+
+      if (tp_core > 0) {
+         map->in_image_base_address += operation->input_width * 2;
+         map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1);
+      }
+   }
+
+   unsigned alu_size = operation->input_width;
+   if (operation->padding_same) {
+      alu_size += 1;
+      if (operation->weight_width == 5)
+         alu_size += 1;
+      if (operation->input_width == 5)
+         alu_size += 1;
+   }
+
+   map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size);
+
+   map->in_zp = operation->input_zero_point;
+   map->out_zp = operation->input_zero_point;
+
+   if (tp_cores_used > 1)
+      map->no_flush = tp_core < tp_cores_used - 1;
+
+   map->in_image_circular_buf_size = 0x0;
+   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
+   map->out_image_circular_buf_size = 0x0;
+   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
+
+   if (map->in_image_y_size < 2) {
+      map->in_image_y_size = operation->input_width;
+      map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used;
+      map->in_window_y_end = operation->input_width;
+      map->in_tile_y_size = operation->input_width + 1;
+      map->in_tile_y_inc = operation->input_width + 1;
+      map->out_loop_3_count += 1;
+
+      map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
+      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
+
+      map->out_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
+      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
+   }
+
+   etna_bo_cpu_fini(bo);
+
+   return bo;
+}
+
+void
+etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
+                        const struct pipe_ml_operation *first_operation,
+                        struct etna_operation *operation,
+                        unsigned *output_tensor)
+{
+   operation->type = ETNA_JOB_TYPE_TP;
+   operation->tp_type = ETNA_ML_TP_TRANSPOSE;
+
+   operation->input_tensor = first_operation->input_tensor->index;
+   operation->input_width = first_operation->input_tensor->dims[1];
+   operation->input_height = first_operation->input_tensor->dims[2];
+   operation->input_channels = first_operation->input_tensor->dims[3];
+   operation->input_zero_point = first_operation->input_tensor->zero_point;
+   operation->input_scale = first_operation->input_tensor->scale;
+   operation->input_tensor_size = operation->input_width *
+                                  operation->input_height *
+                                  operation->input_channels;
+
+   *output_tensor = etna_ml_allocate_tensor(subgraph);
+   operation->output_tensor = *output_tensor;
+   operation->output_width = first_operation->input_tensor->dims[1];
+   operation->output_height = first_operation->input_tensor->dims[2];
+   operation->output_channels = first_operation->input_tensor->dims[3];
+   operation->output_zero_point = first_operation->input_tensor->zero_point;
+   operation->output_scale = first_operation->input_tensor->scale;
+}
+
+void
+etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
+                          struct etna_operation *convolution,
+                          struct etna_operation *operation)
+{
+   operation->type = ETNA_JOB_TYPE_TP;
+   operation->tp_type = ETNA_ML_TP_DETRANSPOSE;
+
+   operation->input_tensor = etna_ml_allocate_tensor(subgraph);
+   operation->input_width = convolution->output_width;
+   operation->input_height = convolution->output_height;
+   operation->input_channels = convolution->output_channels;
+   operation->input_zero_point = convolution->output_zero_point;
+   operation->input_scale = convolution->output_scale;
+   operation->input_tensor_size = operation->input_width *
+                                  operation->input_height *
+                                  operation->input_channels;
+
+   operation->output_tensor = convolution->output_tensor;
+   operation->output_width = convolution->output_width;
+   operation->output_height = convolution->output_height;
+   operation->output_channels = convolution->output_channels;
+   operation->output_zero_point = convolution->output_zero_point;
+   operation->output_scale = convolution->output_scale;
+}
+
+void
+etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
+                        const struct pipe_ml_operation *convolution,
+                        struct etna_operation *operation,
+                        unsigned *output_tensor)
+{
+   operation->type = ETNA_JOB_TYPE_TP;
+   operation->tp_type = ETNA_ML_TP_RESHUFFLE;
+   operation->stride = convolution->conv.stride_x;
+   operation->padding_same = convolution->conv.padding_same;
+
+   operation->input_tensor = convolution->input_tensor->index;
+   operation->input_width = convolution->input_tensor->dims[1];
+   operation->input_height = convolution->input_tensor->dims[2];
+   operation->input_channels = convolution->input_tensor->dims[3];
+   operation->input_zero_point = convolution->input_tensor->zero_point;
+   operation->input_scale = convolution->input_tensor->scale;
+   operation->input_tensor_size = operation->input_width *
+                                  operation->input_height *
+                                  operation->input_channels;
+
+   *output_tensor = etna_ml_allocate_tensor(subgraph);
+   operation->output_tensor = *output_tensor;
+   operation->output_width = DIV_ROUND_UP(operation->input_width, operation->stride);
+   operation->output_height = DIV_ROUND_UP(operation->input_height, operation->stride);
+   operation->output_channels = operation->input_channels * operation->stride * operation->stride;
+   operation->output_zero_point = convolution->input_tensor->zero_point;
+   operation->output_scale = convolution->input_tensor->scale;
+
+   /* When destriding a convolution, the transformation to be made to the input
+    * tensor will depend on the size of the weight tensor.
+    */
+   operation->weight_width = convolution->conv.weight_tensor->dims[1];
+   operation->weight_height = convolution->conv.weight_tensor->dims[2];
+
+   if (operation->padding_same) {
+      if (operation->weight_width == 5) {
+         operation->output_width += 2;
+         operation->output_height += 2;
+      } else {
+         operation->output_width += 1;
+         operation->output_height += 1;
+      }
+   }
+}
+
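+/* Fill an etna_vip_instruction with one configuration buffer per TP core
+ * taking part in the job. Transpose and detranspose always use a single
+ * core; reshuffle may be split among all of them.
+ */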
+void
+etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
+                             const struct etna_operation *operation,
+                             struct etna_vip_instruction *instruction)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
+   assert(input);
+   pipe_resource_reference(&instruction->input, input);
+
+   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
+   assert(output);
+   pipe_resource_reference(&instruction->output, output);
+
+   switch (operation->tp_type) {
+   case ETNA_ML_TP_TRANSPOSE:
+      instruction->configs[0] = create_transpose_config(subgraph, operation);
+      break;
+   case ETNA_ML_TP_DETRANSPOSE:
+      instruction->configs[0] = create_detranspose_config(subgraph, operation);
+      break;
+   case ETNA_ML_TP_RESHUFFLE: {
+      unsigned tp_core_count = ctx->screen->specs.tp_core_count;
+      unsigned tp_cores_used;
+
+      tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;
+
+      /* TODO: Split this work among the 4 cores for better performance */
+      if ((operation->input_width == 320 || operation->input_width == 224) &&
+          operation->input_channels == 3)
+         tp_cores_used = 1;
+
+      ML_DBG("reshuffle: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
+      for (unsigned i = 0; i < tp_cores_used; i++) {
+         instruction->configs[i] = create_reshuffle_config(subgraph, operation, i, tp_cores_used);
+      }
+      break;
+   }
+   }
+   instruction->type = ETNA_JOB_TYPE_TP;
+}
+
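+/* Emit the state needed to kick off the TP job(s) in this instruction. The
+ * low bits of the address written to VIVS_PS_TP_INST_ADDR appear to encode
+ * whether more jobs follow and whether they may run in parallel; the values
+ * used here match what was observed from the blob.
+ */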
+void
+etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
+                          struct etna_vip_instruction *operation,
+                          unsigned idx)
+{
+   struct etna_context *ctx = etna_context(subgraph->base.context);
+   unsigned tp_core_count = ctx->screen->specs.tp_core_count;
+   struct etna_cmd_stream *stream = ctx->stream;
+   bool more_than_one_tp_job = operation->configs[1] != NULL;
+   bool parallel = !DBG_ENABLED(ETNA_DBG_NPU_NO_PARALLEL);
+
+   for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
+      unsigned offset = parallel ? idx + 1 : 0;
+
+      if (more_than_one_tp_job && (j < tp_core_count - 1))
+         offset = parallel ? 0x1f : 0x1;
+
+      etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
+      etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
+      etna_set_state(stream, VIVS_GL_TP_CONFIG, 0x0);
+      etna_set_state_reloc(stream, VIVS_PS_TP_INST_ADDR, &(struct etna_reloc) {
+                           .bo = operation->configs[j],
+                           .flags = ETNA_RELOC_READ,
+                           .offset = offset,
+                           });
+   }
+   etna_set_state(stream, VIVS_PS_UNK10A4, parallel ? idx + 1 : 0x0);
+}
\ No newline at end of file
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_tp.h b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.h
new file mode 100644
index 00000000000..5c47938cd41
--- /dev/null
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023-2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "etnaviv_ml.h"
+
+void
+etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
+                        const struct pipe_ml_operation *first_operation,
+                        struct etna_operation *operation,
+                        unsigned *output_tensor);
+
+void
+etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
+                          struct etna_operation *convolution,
+                          struct etna_operation *operation);
+
+void
+etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
+                        const struct pipe_ml_operation *first_operation,
+                        struct etna_operation *operation,
+                        unsigned *output_tensor);
+
+void
+etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
+                             const struct etna_operation *operation,
+                             struct etna_vip_instruction *instruction);
+
+void
+etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
+                          struct etna_vip_instruction *operation,
+                          unsigned idx);
\ No newline at end of file
diff --git a/src/gallium/drivers/etnaviv/meson.build b/src/gallium/drivers/etnaviv/meson.build
index 0d60305e1ce..a25a86a29a1 100644
--- a/src/gallium/drivers/etnaviv/meson.build
+++ b/src/gallium/drivers/etnaviv/meson.build
@@ -61,6 +61,8 @@ files_etnaviv = files(
   'etnaviv_ml.h',
   'etnaviv_ml_nn.c',
   'etnaviv_ml_nn.h',
+  'etnaviv_ml_tp.c',
+  'etnaviv_ml_tp.h',
   'etnaviv_nir_lower_source_mods.c',
   'etnaviv_nir_lower_texture.c',
   'etnaviv_nir_lower_ubo_to_uniform.c',