diff --git a/src/etnaviv/ci/etnaviv-vipnano-fails.txt b/src/etnaviv/ci/etnaviv-vipnano-fails.txt
new file mode 100644
index 00000000000..f266dbf4db8
--- /dev/null
+++ b/src/etnaviv/ci/etnaviv-vipnano-fails.txt
@@ -0,0 +1,476 @@
+Conv2D.Op/input_size_112_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_256_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_5_input_channels_256_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_80_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_80_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_80_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_80_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_80_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_80_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_8_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_1_is_signed_0,Fail
+
+# Same bits, different result
+Conv2D.Op/input_size_5_weight_size_3_input_channels_120_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+
+Conv2D.Op/input_size_5_weight_size_3_input_channels_120_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_120_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_128_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_128_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_128_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_128_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_256_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_256_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_256_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_256_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_32_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_32_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_3_input_channels_32_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_120_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_120_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_120_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_128_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_128_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_128_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_128_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_256_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_256_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_256_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_32_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_32_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_32_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_5_weight_size_5_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_120_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_120_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_120_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_128_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_128_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_128_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_256_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_256_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_256_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_32_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_32_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Conv2D.Op/input_size_3_weight_size_3_input_channels_32_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_120_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_120_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_120_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_128_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_128_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_128_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_128_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_256_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_256_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_256_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_256_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_120_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_120_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_120_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_128_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_128_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_128_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_128_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_256_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_256_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_256_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_256_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_120_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_120_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_120_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_128_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_128_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_128_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_128_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_256_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_256_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_256_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_256_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_5_input_channels_32_output_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_128_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_256_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_120_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_128_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_256_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_160_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_32_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_128_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_128_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_256_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_256_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_120_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_120_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_120_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_120_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_120_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_128_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_128_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_128_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_128_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_128_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_1_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_256_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_256_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_256_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_256_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_256_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_120_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_120_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_120_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_120_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_120_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_120_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_128_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_128_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_128_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_128_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_128_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_128_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_256_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_256_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_256_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_256_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_256_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_256_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_32_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_32_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_32_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_32_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_32_output_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_128_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_160_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_256_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_32_stride_1_padding_same_1_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_3_weight_size_3_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_3_weight_size_3_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_3_weight_size_3_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_3_weight_size_3_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_3_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_3_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_3_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_3_channels_32_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_5_channels_120_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_5_channels_128_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_5_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_5_channels_256_stride_2_padding_same_0_is_signed_0,Fail
+DepthwiseConv2D.Op/input_size_5_weight_size_5_channels_32_stride_2_padding_same_0_is_signed_0,Fail
diff --git a/src/etnaviv/ci/etnaviv-vipnano-flakes.txt b/src/etnaviv/ci/etnaviv-vipnano-flakes.txt
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/etnaviv/ci/etnaviv-vipnano-skips.txt b/src/etnaviv/ci/etnaviv-vipnano-skips.txt
new file mode 100644
index 00000000000..50c1ef7a211
--- /dev/null
+++ b/src/etnaviv/ci/etnaviv-vipnano-skips.txt
@@ -0,0 +1,31 @@
+# The blob produces the same result as Mesa, but different from XNNPACK
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_120_stride_1_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_256_stride_1_padding_same_1_is_signed_0
+
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_1_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_32_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_120_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_128_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_160_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_5_input_channels_1_output_channels_256_stride_2_padding_same_1_is_signed_0
+
+# No idea why this one is failing, needs investigation.
+# It takes a long time, so better skip it for now.
+MobileDet.Whole
+
+# These tests below (adds) aren't well constructed and thus fail in TF
+MobileDetParam.Op/mobiledet8
+MobileDetParam.Op/mobiledet11
+MobileDetParam.Op/mobiledet14
+MobileDetParam.Op/mobiledet19
+MobileDetParam.Op/mobiledet22
+MobileDetParam.Op/mobiledet25
+MobileDetParam.Op/mobiledet32
+MobileDetParam.Op/mobiledet35
+MobileDetParam.Op/mobiledet38
+MobileDetParam.Op/mobiledet45
+MobileDetParam.Op/mobiledet49
+MobileDetParam.Op/mobiledet53
+MobileDetParam.Op/mobiledet60
+MobileDetParam.Op/mobiledet64
+MobileDetParam.Op/mobiledet68
diff --git a/src/gallium/drivers/etnaviv/etnaviv_context.c b/src/gallium/drivers/etnaviv/etnaviv_context.c
index a2d72928192..be34858ebca 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_context.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_context.c
@@ -33,6 +33,7 @@
 #include "etnaviv_debug.h"
 #include "etnaviv_emit.h"
 #include "etnaviv_fence.h"
+#include "etnaviv_ml.h"
 #include "etnaviv_query.h"
 #include "etnaviv_query_acc.h"
 #include "etnaviv_rasterizer.h"
@@ -665,6 +666,10 @@
    pctx->destroy = etna_context_destroy;
    pctx->draw_vbo = etna_draw_vbo;
+   pctx->ml_subgraph_create = etna_ml_subgraph_create;
+   pctx->ml_subgraph_invoke = etna_ml_subgraph_invoke;
+   pctx->ml_subgraph_read_output = etna_ml_subgraph_read_outputs;
+   pctx->ml_subgraph_destroy = etna_ml_subgraph_destroy;
    pctx->flush = etna_context_flush;
    pctx->set_debug_callback = etna_set_debug_callback;
    pctx->create_fence_fd = etna_create_fence_fd;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_debug.h b/src/gallium/drivers/etnaviv/etnaviv_debug.h
index d2be530b966..92ec0af9b35 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_debug.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_debug.h
@@ -43,6 +43,7 @@
    ETNA_DBG_DUMP_SHADERS = BITFIELD_BIT(5),
    ETNA_DRM_MSGS = BITFIELD_BIT(6), /* Debug messages from DRM */
    ETNA_DBG_PERF = BITFIELD_BIT(7),
+   ETNA_DBG_ML_MSGS = BITFIELD_BIT(8),
 
    /* Bypasses */
    ETNA_DBG_NO_TS = BITFIELD_BIT(12), /* Disable TS */
@@ -61,6 +62,8 @@
    ETNA_DBG_LINEAR_PE = BITFIELD_BIT(25), /* Enable linear PE */
    ETNA_DBG_MSAA = BITFIELD_BIT(26), /* Enable MSAA */
    ETNA_DBG_SHARED_TS = BITFIELD_BIT(27), /* Enable TS sharing */
+   ETNA_DBG_NPU_NO_PARALLEL = BITFIELD_BIT(28), /* Disable parallelism inside NPU batches */
+   ETNA_DBG_NPU_NO_BATCHING = BITFIELD_BIT(29), /* Disable batching NPU jobs */
 };
 
 extern int etna_mesa_debug; /* set in etnaviv_screen.c from ETNA_MESA_DEBUG */
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.c b/src/gallium/drivers/etnaviv/etnaviv_ml.c
new file mode 100644
index 00000000000..ce5482714c9
--- /dev/null
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2023-2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "util/u_inlines.h"
+
+#include "etnaviv_context.h"
+#include "etnaviv_debug.h"
+#include "etnaviv_emit.h"
+#include "etnaviv_ml_nn.h"
+#include "etnaviv_ml.h"
+
+struct pipe_resource *
+etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx)
+{
+   return *util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx);
+}
+
+unsigned
+etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx)
+{
+   return *util_dynarray_element(&subgraph->offsets, unsigned, idx);
+}
+
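+/* Each tensor id indexes two parallel dynarrays: subgraph->tensors holds the
+ * pipe_resource backing the tensor and subgraph->offsets a byte offset into
+ * it, so a tensor may alias part of another tensor's buffer (see
+ * reference_tensor_with_offset()).
+ */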
pipe_resource **tensors = util_dynarray_grow(&subgraph->tensors, struct pipe_resource *, 1); + tensors[0] = NULL; + + unsigned *offsets = util_dynarray_grow(&subgraph->offsets, unsigned, 1); + offsets[0] = 0; + + return util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *) - 1; +} + +static void +etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned size) +{ + struct pipe_context *context = subgraph->base.context; + struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors); + + assert(idx < util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *)); + + struct pipe_resource *res = tensors[idx]; + + if (res != NULL) { + assert(size == pipe_buffer_size(res)); + return; + } + + res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, size); + tensors[idx] = res; + + ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size); +} + +static void +reference_tensor_with_offset(struct etna_ml_subgraph *subgraph, + unsigned src_tensor, + unsigned dst_tensor, + unsigned offset) +{ + struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors); + unsigned *offsets = util_dynarray_begin(&subgraph->offsets); + pipe_resource_reference(&tensors[dst_tensor], tensors[src_tensor]); + offsets[dst_tensor] = offset; +} + +static void +dump_graph(struct list_head *etna_operations) +{ + ML_DBG("\n"); + ML_DBG("dumping intermediate graph: %d operations\n", list_length(etna_operations)); + + ML_DBG("\n"); + ML_DBG("%3s %-4s %3s %3s %s\n", "idx", "type", "in", "out", "operation type-specific"); + ML_DBG("================================================================================================\n"); + unsigned i = 0; + list_for_each_entry(struct etna_operation, operation, etna_operations, link) { + switch(operation->type) { + case ETNA_JOB_TYPE_NN: + ML_DBG("%3d %-4s %3d %3d in2: %3d", + i, "NN", operation->input_tensor, operation->output_tensor, operation->add_input_tensor); + break; + } + ML_DBG("\n"); + i++; + } + ML_DBG("\n"); +} + +static void +lower_operations(struct etna_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperations, + unsigned count, + struct list_head *etna_operations) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + switch(poperation->type) { + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + unsigned input_tensor = poperation->input_tensor->index; + struct etna_operation *operation = calloc(1, sizeof(*operation)); + etna_ml_lower_convolution(subgraph, poperation, operation); + operation->input_tensor = input_tensor; + list_addtail(&operation->link, etna_operations); + break; + } + case PIPE_ML_OPERATION_TYPE_ADD: { + struct etna_operation *operation = calloc(1, sizeof(*operation)); + etna_ml_lower_add(subgraph, poperation, operation); + list_addtail(&operation->link, etna_operations); + break; + } + default: + unreachable("Unsupported ML operation type"); + } + } + + /* TODO: Support graphs with more than one input */ + if (poperations[0].input_tensor->dims[3] > 1) { + struct etna_operation *operation = calloc(1, sizeof(*operation)); + unsigned input_tensor = poperations[0].input_tensor->index; + unsigned output_tensor; + list_for_each_entry(struct etna_operation, operation, etna_operations, link) { + if (operation->input_tensor == input_tensor) + operation->input_tensor = output_tensor; + if (operation->type == ETNA_JOB_TYPE_NN && operation->addition) { + if (operation->add_input_tensor == input_tensor) + 
operation->add_input_tensor = output_tensor; + } + } + list_add(&operation->link, etna_operations); + } + + list_for_each_entry(struct etna_operation, operation, etna_operations, link) { + etna_ml_create_tensor(subgraph, operation->input_tensor, operation->input_tensor_size); + + if (operation->type == ETNA_JOB_TYPE_NN && operation->addition) + reference_tensor_with_offset(subgraph, + operation->input_tensor, + operation->add_input_tensor, + operation->input_tensor_size / 2); + } + + /* Create any output tensors that aren't inputs to other operations, these + * are the outputs of the graph. + */ + ML_DBG("Ensuring all output tensors have their memory backing.\n"); + list_for_each_entry(struct etna_operation, operation, etna_operations, link) { + struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensor); + if (res != NULL) + continue; + + unsigned size = operation->output_width * operation->output_height * operation->output_channels; + etna_ml_create_tensor(subgraph, operation->output_tensor, size); + } + + if (DBG_ENABLED(ETNA_DBG_ML_MSGS)) + dump_graph(etna_operations); +} + +static unsigned +count_tensors(const struct pipe_ml_operation *poperations, + unsigned count) +{ + unsigned tensor_count = 0; + + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + tensor_count = MAX2(tensor_count, poperation->input_tensor->index); + tensor_count = MAX2(tensor_count, poperation->output_tensor->index); + switch (poperation->type) { + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: + tensor_count = MAX2(tensor_count, poperation->conv.weight_tensor->index); + tensor_count = MAX2(tensor_count, poperation->conv.bias_tensor->index); + break; + case PIPE_ML_OPERATION_TYPE_ADD: + tensor_count = MAX2(tensor_count, poperation->add.input_tensor->index); + break; + default: + unreachable("Unsupported ML operation type"); + } + } + + return tensor_count + 1; +} + +struct pipe_ml_subgraph * +etna_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + struct etna_context *ctx = etna_context(pcontext); + unsigned nn_core_count = ctx->screen->specs.nn_core_count; + struct etna_ml_subgraph *subgraph; + struct list_head operations; + unsigned tensor_count; + + if (nn_core_count < 1) { + fprintf(stderr, "We need at least 1 NN core to do anything useful.\n"); + abort(); + } + + subgraph = calloc(1, sizeof(*subgraph)); + tensor_count = count_tensors(poperations, count); + + list_inithead(&operations); + + subgraph->base.context = pcontext; + util_dynarray_init(&subgraph->operations, NULL); + + util_dynarray_init(&subgraph->tensors, NULL); + if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *, tensor_count)) + return NULL; + memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size); + + util_dynarray_init(&subgraph->offsets, NULL); + if (!util_dynarray_resize(&subgraph->offsets, unsigned, tensor_count)) + return NULL; + memset(util_dynarray_begin(&subgraph->offsets), 0, subgraph->offsets.size); + + lower_operations(subgraph, poperations, count, &operations); + + list_for_each_entry(struct etna_operation, operation, &operations, link) { + struct etna_vip_instruction instruction = {0}; + + switch(operation->type) { + case ETNA_JOB_TYPE_NN: + etna_ml_compile_operation_nn(subgraph, operation, &instruction); + break; + } + + util_dynarray_append(&subgraph->operations, struct etna_vip_instruction, instruction); + } + + list_for_each_entry_safe(struct 
etna_operation, operation, &operations, link) { + pipe_resource_reference(&operation->weight_tensor, NULL); + pipe_resource_reference(&operation->bias_tensor, NULL); + free(operation); + } + + return &subgraph->base; +} + +static void +dump_buffer(struct etna_bo *bo, char *name, int operation_nr) +{ + char buffer[255]; + + uint32_t *map = etna_bo_map(bo); + snprintf(buffer, sizeof(buffer), "mesa-%s-%08u.bin", name, operation_nr); + ML_DBG("Dumping buffer from 0x%lx (0x%x) to %s\n", map, etna_bo_gpu_va(bo), buffer); + FILE *f = fopen(buffer, "wb"); + assert(f); + fwrite(map, 1, etna_bo_size(bo), f); + if(ferror(f)) { + ML_DBG("Error in writing to file: %s\n", strerror(errno)); + } + fflush(f); + fclose(f); +} + +static void +init_npu(struct pipe_context *pctx) +{ + struct etna_context *ctx = etna_context(pctx); + struct etna_cmd_stream *stream = ctx->stream; + + /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier.*/ + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + + etna_set_state(stream, VIVS_PA_SYSTEM_MODE, VIVS_PA_SYSTEM_MODE_PROVOKING_VERTEX_LAST | + VIVS_PA_SYSTEM_MODE_HALF_PIXEL_CENTER); + etna_set_state(stream, VIVS_GL_API_MODE, VIVS_GL_API_MODE_OPENCL); + + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + + pctx->flush(pctx, NULL, 0); +} + +static void +close_batch(struct pipe_context *pctx) +{ + struct etna_context *ctx = etna_context(pctx); + struct etna_cmd_stream *stream = ctx->stream; + + unsigned cache = VIVS_GL_FLUSH_CACHE_DEPTH | VIVS_GL_FLUSH_CACHE_COLOR | VIVS_GL_FLUSH_CACHE_UNK10; + if (DBG_ENABLED(ETNA_DBG_NPU_NO_PARALLEL)) + cache |= VIVS_GL_FLUSH_CACHE_UNK11 | VIVS_GL_FLUSH_CACHE_SHADER_L1; + + etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache); + etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache); + + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + + ctx->dirty = 0; +} + +void +etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psubgraph, struct pipe_tensor *input) +{ + struct etna_context *ctx = etna_context(pctx); + unsigned tp_core_count = ctx->screen->specs.tp_core_count; + struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph); + struct etna_cmd_stream *stream = ctx->stream; + static bool is_initialized = false; + + if (!is_initialized) { + init_npu(pctx); + is_initialized = true; + } + + if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) { + /* These zeroes match the blob's cmdstream. 
They are here to make diff'ing easier.*/ + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + } + + unsigned i = 0; + unsigned dump_id = 0; + util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) { + #if 0 + if (i == util_dynarray_num_elements(&subgraph->operations, struct etna_vip_instruction) - 1) { + /* TODO: This may be necessary when bypassing all-zero kernels */ + etna_bo_cpu_prep(etna_resource(operation->output)->bo, DRM_ETNA_PREP_WRITE); + uint8_t *dst_map = etna_bo_map(etna_resource(operation->output)->bo); + memset(dst_map, 0x77, etna_bo_size(etna_resource(operation->output)->bo)); + etna_bo_cpu_fini(etna_resource(operation->output)->bo); + } + #endif + + if (i == 0) { + unsigned size = input->dims[0] * input->dims[1] * input->dims[2] * input->dims[3]; + pipe_buffer_copy(pctx, operation->input, input->resource, 0, 0, size); + } + + if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) { + switch (operation->type) { + case ETNA_JOB_TYPE_NN: + dump_buffer(operation->configs[0], "nn", dump_id); + dump_buffer(operation->coefficients, "compressed", dump_id); + dump_id++; + break; + default: + unreachable("Unsupported ML operation type"); + } + } + + if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) { + /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier.*/ + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + etna_cmd_stream_emit(stream, 0x0); + } + + for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) + etna_cmd_stream_ref_bo(stream, operation->configs[j], ETNA_RELOC_READ); + if (operation->coefficients) + etna_cmd_stream_ref_bo(stream, operation->coefficients, ETNA_RELOC_READ); + etna_cmd_stream_ref_bo(stream, etna_resource(operation->input)->bo, ETNA_RELOC_READ); + etna_cmd_stream_ref_bo(stream, etna_resource(operation->output)->bo, ETNA_RELOC_WRITE); + + switch (operation->type) { + case ETNA_JOB_TYPE_NN: + etna_ml_emit_operation_nn(subgraph, operation, i); + break; + default: + unreachable("Unsupported ML operation type"); + } + + if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) { + ML_DBG("Running operation %d - %d\n", i, operation->type); + close_batch(pctx); + pctx->flush(pctx, NULL, 0); + stream = ctx->stream; + } + + i++; + } + + if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) + close_batch(pctx); + + if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL)) + pctx->flush(pctx, NULL, 0); +} + +void +etna_ml_subgraph_read_outputs(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, unsigned output_idxs[], void *outputs[]) +{ + struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph); + unsigned operation_count = util_dynarray_num_elements(&subgraph->operations, struct etna_vip_instruction); + struct etna_vip_instruction *last_operation; + + last_operation = util_dynarray_element(&subgraph->operations, + struct etna_vip_instruction, + operation_count - 1); + + if (DBG_ENABLED(ETNA_DBG_ML_MSGS)) { + long start, end; + struct timespec time; + + clock_gettime(CLOCK_MONOTONIC, &time); + start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000; + + 
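+ /* The flush below submits the batched NPU jobs to the kernel driver, and mapping the last operation's output for reading then blocks until the hardware has signalled completion, so end - start measures the actual execution time of the job rather than just its submission. */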
context->flush(context, NULL, 0); + + struct pipe_transfer *transfer = NULL; + pipe_buffer_map(context, last_operation->output, PIPE_MAP_READ, &transfer); + pipe_buffer_unmap(context, transfer); + + clock_gettime(CLOCK_MONOTONIC, &time); + end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000; + ML_DBG("Running the NN job took %ld ms.\n", (end - start)); + } else + context->flush(context, NULL, 0); + + for (int i = 0; i < outputs_count; i++) { + struct pipe_resource *res = etna_ml_get_tensor(subgraph, output_idxs[i]); + pipe_buffer_read(context, res, 0, pipe_buffer_size(res), outputs[i]); + } + + if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) { + unsigned i = 0; + util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) { + struct pipe_transfer *transfer = NULL; + + pipe_buffer_map(context, operation->input, PIPE_MAP_READ, &transfer); + dump_buffer(etna_resource(operation->input)->bo, "input", i); + pipe_buffer_unmap(context, transfer); + + pipe_buffer_map(context, operation->output, PIPE_MAP_READ, &transfer); + dump_buffer(etna_resource(operation->output)->bo, "output", i); + pipe_buffer_unmap(context, transfer); + + i++; + } + } +} + +void +etna_ml_subgraph_destroy(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph) +{ + struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph); + + util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) { + for (unsigned j = 0; j < MAX_CONFIG_BOS && operation->configs[j]; j++) + etna_bo_del(operation->configs[j]); + etna_bo_del(operation->coefficients); + pipe_resource_reference(&operation->input, NULL); + pipe_resource_reference(&operation->output, NULL); + } + util_dynarray_fini(&subgraph->operations); + + util_dynarray_foreach(&subgraph->tensors, struct pipe_resource *, tensor) { + pipe_resource_reference(tensor, NULL); + } + util_dynarray_fini(&subgraph->tensors); + util_dynarray_fini(&subgraph->offsets); + + free(subgraph); +} diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.h b/src/gallium/drivers/etnaviv/etnaviv_ml.h new file mode 100644 index 00000000000..33b757903ae --- /dev/null +++ b/src/gallium/drivers/etnaviv/etnaviv_ml.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023-2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef H_ETNA_ML +#define H_ETNA_ML + +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +#define MAX_CONFIG_BOS 4 + +enum etna_job_type { + ETNA_JOB_TYPE_NN, +}; + +struct etna_ml_subgraph { + struct pipe_ml_subgraph base; + + struct util_dynarray operations; + + /* Both are indexed by the tensor index */ + struct util_dynarray tensors; /* Contains struct pipe_resource* */ + struct util_dynarray offsets; /* Contains unsigned byte offsets into the tensors' BOs */ +}; + +struct etna_vip_instruction { + enum etna_job_type type; + + struct etna_bo *configs[MAX_CONFIG_BOS]; + struct etna_bo *coefficients; + struct pipe_resource *input; + struct pipe_resource *output; + + struct etna_bo *kernel; +}; + +struct etna_operation { + struct list_head link; + + enum etna_job_type type; + + bool addition; + bool depthwise; + bool pointwise; + bool pooling_first_pixel; + bool padding_same; + + unsigned stride; + + unsigned input_tensor; + unsigned input_tensor_size; + unsigned add_input_tensor; + unsigned input_width; + unsigned input_height; + unsigned input_channels; + uint8_t input_zero_point; + float input_scale; + + unsigned output_tensor; + unsigned output_width; + unsigned output_height; + unsigned output_channels; + uint8_t output_zero_point; + float output_scale;
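+ /* The zero_point/scale pairs above follow the usual affine quantization convention: real_value = scale * (quantized_value - zero_point). The weight and bias tensors below are plain buffer resources; the lowering passes may swap them for transposed or reshaped copies before compilation. */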
+ + struct pipe_resource *weight_tensor; + unsigned weight_width; + unsigned weight_height; + uint8_t weight_zero_point; + float weight_scale; + + uint8_t addition_offset; + + struct pipe_resource *bias_tensor; +}; + +#define ML_DBG(fmt, ...) \ + do { \ + if (DBG_ENABLED(ETNA_DBG_ML_MSGS)) \ + _debug_printf(fmt, ##__VA_ARGS__); \ + } while (0) + +unsigned etna_ml_allocate_tensor(struct etna_ml_subgraph *subgraph); +struct pipe_resource *etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx); +unsigned etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx); + +struct pipe_ml_subgraph * +etna_ml_subgraph_create(struct pipe_context *context, + const struct pipe_ml_operation *operations, + unsigned count); + +void +etna_ml_subgraph_invoke(struct pipe_context *context, struct pipe_ml_subgraph *subgraph, + struct pipe_tensor *input); + +void +etna_ml_subgraph_read_outputs(struct pipe_context *context, struct pipe_ml_subgraph *subgraph, + unsigned outputs_count, unsigned output_idxs[], void *outputs[]); + +void +etna_ml_subgraph_destroy(struct pipe_context *context, struct pipe_ml_subgraph *subgraph); + +#endif diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c new file mode 100644 index 00000000000..7f0b8696842 --- /dev/null +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c @@ -0,0 +1,1182 @@ +/* + * Copyright (c) 2023-2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "util/u_inlines.h" + +#include "etnaviv_context.h" +#include "etnaviv_debug.h" +#include "etnaviv_emit.h" +#include "etnaviv_ml_nn.h" + +#define ETNA_NN_INT8 0 + +#define SRAM_CACHE_MODE_NO_CACHE 0x0 +#define SRAM_CACHE_MODE_FULL_CACHE 0x1 +#define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2 + +enum pooling_type { + ETNA_NN_POOLING_NON, + ETNA_NN_POOLING_MAX, + ETNA_NN_POOLING_AVG, + ETNA_NN_POOLING_FIRST_PIXEL +}; + +#define FIELD(field, bits) uint32_t field : bits; + +struct etna_nn_params { + + FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */ + FIELD(no_z_offset, 1) + FIELD(kernel_xy_size, 4) + FIELD(kernel_z_size, 14) /* & 0x3FFF */ + FIELD(kernels_per_core, 7) + FIELD(pooling, 2) + FIELD(pooling_xy_size, 1) + FIELD(prelu, 1) + FIELD(nn_layer_flush, 1) + + /* 1 */ + FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */ + FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */ + FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */ + FIELD(in_image_x_size, 13) + FIELD(in_image_y_size, 13) + + /* 2 */ + FIELD(in_image_x_offset, 3) + FIELD(in_image_y_offset, 3) + FIELD(unused0, 1) + FIELD(brick_mode, 1) + FIELD(brick_distance, 16) + FIELD(relu, 1) + FIELD(unused1, 1) + FIELD(post_multiplier, 1) + FIELD(post_shift, 5) + + /* 3 */ + FIELD(unused2, 3) + FIELD(no_flush, 1) + FIELD(unused3, 2) + FIELD(out_image_x_size, 13) + FIELD(out_image_y_size, 13) + + /* 4 */ + /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */ + FIELD(out_image_z_size, 14) + FIELD(rounding_mode, 2) + FIELD(in_image_x_offset_bit_3, 1) /* >> 3 & 0x1 */ + FIELD(in_image_y_offset_bit_3, 1) /* >> 3 & 0x1 */ + FIELD(out_image_tile_x_size, 7) + FIELD(out_image_tile_y_size, 7) + + /* 5 */ + FIELD(kernel_address, 26) /* >> 6 */ + FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */ + + /* 6 */ + FIELD(in_image_address, 32) + + /* 7 */ + FIELD(out_image_address, 32) + + /* 8 */ + FIELD(image_caching_mode, 2) + FIELD(kernel_caching_mode, 2) + FIELD(partial_cache_data_unit, 2) + FIELD(kernel_pattern_msb, 6) + FIELD(kernel_y_size, 4) + FIELD(out_image_y_stride, 16) + + /* 9 */ + 
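+ /* Words 9 and 10 appear to form a 64-bit bitmap that tells the cores which portions of the compressed kernel stream to keep resident in on-chip SRAM when partial caching is enabled, with kernel_pattern_msb above giving the index of the highest meaningful bit; see the partial-cache setup in create_nn_config(). */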
FIELD(kernel_pattern_low, 32) + + /* 10 */ + FIELD(kernel_pattern_high, 32) + + /* 11 */ + FIELD(kernel_cache_start_address, 32) + + /* 12 */ + FIELD(kernel_cache_end_address, 32) + + /* 13 */ + FIELD(image_cache_start_address, 32) + + /* 14 */ + FIELD(image_cache_end_address, 32) + + /* 15 */ + FIELD(in_image_border_mode, 2) + FIELD(in_image_border_const, 16) + FIELD(unused4, 1) + FIELD(kernel_data_type_bit_2, 1) + FIELD(in_image_data_type_bit_2, 1) + FIELD(out_image_data_type_bit_2, 1) + FIELD(post_multiplier_1_to_6, 6) + FIELD(post_shift_bit_5_6, 2) + FIELD(unused5, 2) + + /* 16 */ + FIELD(in_image_x_stride, 16) + FIELD(in_image_y_stride, 16) + + /* 17 */ + FIELD(out_image_x_stride, 16) + FIELD(unused6, 8) + FIELD(post_multiplier_7_to_14, 8) + + /* 18 */ + FIELD(out_image_circular_buf_size, 26) /* >> 6 */ + FIELD(unused7, 5) + FIELD(per_channel_post_mul, 1) + + /* 19 */ + FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */ + FIELD(unused8, 6) + + /* 20 */ + FIELD(in_image_circular_buf_size, 26) /* >> 6 */ + FIELD(unused9, 6) + + /* 21 */ + FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */ + FIELD(unused10, 6) + + /* 22 */ + FIELD(coef_zero_point, 8) + FIELD(out_zero_point, 8) + FIELD(kernel_direct_stream_from_VIP_sram, 1) + FIELD(depthwise, 1) + FIELD(unused11, 14) + + /* 23, from here they aren't set on */ + FIELD(unused12, 32) + + /* 24 */ + FIELD(unused13, 4) + FIELD(unused14, 28) /* 0 >> 4 */ + + /* 25 */ + FIELD(unused15, 4) + FIELD(unused16, 28) /* 0 >> 4 */ + + /* 26 */ + FIELD(further1, 32) + FIELD(further2, 32) + FIELD(further3, 32) + FIELD(further4, 32) + FIELD(further5, 32) + FIELD(further6, 32) + FIELD(further7, 32) + FIELD(further8, 32) +}; + +static void * +map_resource(struct pipe_resource *resource) +{ + return etna_bo_map(etna_resource(resource)->bo); +} + + +static void +pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation) +{ + /* Fill a Nx2x2xN tensor with zero_points */ + struct pipe_context *context = subgraph->base.context; + uint8_t *input = map_resource(operation->weight_tensor); + unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels; + struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, + new_size); + uint8_t *output = map_resource(output_res); + + for (unsigned channel = 0; channel < operation->output_channels; channel++) { + uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels; + uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels; + + map_out[0] = map_in[0]; + map_out[1] = operation->weight_zero_point; + map_out[2] = operation->weight_zero_point; + map_out[3] = operation->weight_zero_point; + } + + pipe_resource_reference(&operation->weight_tensor, NULL); + operation->weight_tensor = output_res; + + operation->weight_width = operation->weight_height = 2; + operation->pointwise = false; +} + +static void +expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + uint8_t *input = map_resource(operation->weight_tensor); + unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels; + struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, + new_size); + uint8_t *output = map_resource(output_res); + + /* Lower depthwise convolution to regular convolution, as the hardware doesn't support those */ + for (unsigned 
channel = 0; channel < operation->output_channels; channel++) { + unsigned in_channel = channel / operation->output_channels; + unsigned in_depth = channel % operation->output_channels; + + uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels; + uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels; + + for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) { + if (i % operation->input_channels == in_depth) + map_out[i] = map_in[i]; + else + map_out[i] = operation->weight_zero_point; + } + } + + pipe_resource_reference(&operation->weight_tensor, NULL); + operation->weight_tensor = output_res; +} + +static void +transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + void *map = map_resource(operation->weight_tensor); + unsigned new_size = operation->output_channels * operation->weight_width * \ + operation->weight_height * operation->input_channels; + struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, + new_size); + uint8_t *output = map_resource(output_res); + unsigned output_channels = operation->output_channels; + unsigned input_channels = operation->input_channels; + + if (operation->addition) { + output_channels = 1; + input_channels = 2; + } + + uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map; + unsigned i = 0; + for (unsigned d0 = 0; d0 < output_channels; d0++) + for (unsigned d3 = 0; d3 < input_channels; d3++) + for (unsigned d1 = 0; d1 < operation->weight_width; d1++) + for (unsigned d2 = 0; d2 < operation->weight_height; d2++) + ((uint8_t*)output)[i++] = input[d0][d1][d2][d3]; + + pipe_resource_reference(&operation->weight_tensor, NULL); + operation->weight_tensor = output_res; +} + +static void +subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth, unsigned out_width, unsigned out_height, unsigned in_z, unsigned offset_x, unsigned offset_y, unsigned stride, uint8_t *map_out, int in_zp) +{ + uint8_t (*in)[in_height][in_depth] = (uint8_t(*)[in_height][in_depth])map_in; + uint8_t (*out)[out_height] = (uint8_t(*)[out_height])map_out; + + for(unsigned x = 0; x < out_width; x++) + for(unsigned y = 0; y < out_height; y++) { + unsigned in_x = x * stride + offset_x; + unsigned in_y = y * stride + offset_y; + if (in_x < in_width && in_y < in_height) + out[x][y] = in[in_x][in_y][in_z]; + else + out[x][y] = in_zp; + } +} + +/* TODO: Do the reshaping in the TP units, for big enough buffers */ +static void +reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4]) +{ + for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) { + void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3]; + void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3]; + + /* See Figure 3 in https://arxiv.org/abs/1712.02502 */ + /* This is only valid for stride == 2 */ + assert(stride == 2); + uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t(*)[dims_out[1]][dims_out[2]])map_out; + for (unsigned z = 0; z < dims_in[3]; z++) { + subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp); + subsample(map_in, dims_in[1], dims_in[2], dims_in[3], 
dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp); + subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp); + subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp); + } + } +} + +static void +strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + uint8_t *input = map_resource(operation->weight_tensor); + unsigned new_size; + struct pipe_resource *output_res; + uint8_t *output; + + /* The hardware doesn't support strides natively, so we "lower" them as + * described in this paper: + * + * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502 + */ + + /* TODO: Support more strides */ + assert(operation->stride == 2); + + unsigned wdims_in[4] = {operation->output_channels, + operation->weight_width, + operation->weight_height, + operation->input_channels}; + + operation->input_channels = operation->input_channels * operation->stride * operation->stride; + operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride); + operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride); + + if (operation->padding_same) { + if (operation->weight_width == 5) { + operation->input_width += 2; + operation->input_height += 2; + } else { + operation->input_width += 1; + operation->input_height += 1; + } + } + + operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride); + operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride); + + new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels; + output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, new_size); + output = map_resource(output_res); + + unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels}; + reshape(input, output, operation->stride, operation->weight_zero_point, wdims_in, wdims_out); + + pipe_resource_reference(&operation->weight_tensor, NULL); + operation->weight_tensor = output_res; +} + +void +etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct etna_operation *operation) +{ + /* TODO: Support stride_x != stride_y */ + assert(poperation->conv.stride_x == poperation->conv.stride_y); + assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION); + + operation->type = ETNA_JOB_TYPE_NN; + operation->addition = false; + operation->depthwise = poperation->conv.depthwise; + operation->pointwise = poperation->conv.pointwise; + operation->pooling_first_pixel = poperation->conv.stride_x > 1 && \ + (poperation->conv.depthwise || poperation->conv.pointwise); + operation->padding_same = poperation->conv.padding_same; + operation->stride = poperation->conv.stride_x; + + operation->input_tensor = poperation->input_tensor->index; + operation->input_width = poperation->input_tensor->dims[1]; + operation->input_height = poperation->input_tensor->dims[2]; + operation->input_channels = poperation->input_tensor->dims[3]; + operation->input_zero_point = poperation->input_tensor->zero_point; + operation->input_scale = poperation->input_tensor->scale; + + operation->output_tensor = 
poperation->output_tensor->index; + operation->output_width = poperation->output_tensor->dims[1]; + operation->output_height = poperation->output_tensor->dims[2]; + operation->output_channels = poperation->output_tensor->dims[3]; + operation->output_zero_point = poperation->output_tensor->zero_point; + operation->output_scale = poperation->output_tensor->scale; + + pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource); + operation->weight_width = poperation->conv.weight_tensor->dims[1]; + operation->weight_height = poperation->conv.weight_tensor->dims[2]; + operation->weight_zero_point = poperation->conv.weight_tensor->zero_point; + operation->weight_scale = poperation->conv.weight_tensor->scale; + + pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource); + + if (operation->pointwise && operation->input_channels == 1) + pointwise_to_2x2(subgraph, operation); + + if (operation->depthwise && (operation->output_channels > 1 || operation->stride > 1)) { + + if (operation->input_width < 8 && operation->input_width > 2) + operation->pooling_first_pixel = false; + + expand_depthwise(subgraph, operation); + } + + if (operation->stride > 1 && !operation->pooling_first_pixel) + strided_to_normal(subgraph, operation); /* This will already transpose if input_channels > 1 */ + else if (operation->input_channels > 1) + transpose(subgraph, operation); + + operation->input_tensor_size = operation->input_width * + operation->input_height * + operation->input_channels; + ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels); +} + +static float +compute_weight_scale_add(float input1_scale, float input2_scale) +{ + double scale_ratio = input1_scale / input2_scale; + + return (float) MAX2(scale_ratio, 1.0) / 255.0; +} + +static uint8_t +compute_addition_offset(float input1_scale, float input2_scale, float weight_scale) +{ + double addition_offset = input1_scale / input2_scale; + addition_offset /= weight_scale; + return round(addition_offset + 0.0) * 1; +} + +static uint8_t +compute_weight_add(float input1_scale, float input2_scale, float weight_scale) +{ + double weight = 1.0 / weight_scale; + return round(weight + 0.0); +} + +static uint32_t +compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale) +{ + int zero_point_diff = input2_zp - input1_zp; + double bias = zero_point_diff * input1_scale; + bias /= weight_scale * input2_scale; + + double addition_offset = input1_scale / input2_scale; + addition_offset /= weight_scale; + addition_offset = round(addition_offset + 0.0) * 1; + + return (int) (round(bias) - round(addition_offset) * input2_zp); +} + +void +etna_ml_lower_add(struct etna_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + + assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD); + + operation->addition = true; + operation->depthwise = false; + operation->pointwise = false; + operation->pooling_first_pixel = false; + operation->padding_same = false; + operation->stride = 1; + + operation->input_tensor = poperation->input_tensor->index; + operation->add_input_tensor = poperation->add.input_tensor->index; + operation->input_width = poperation->input_tensor->dims[1]; + operation->input_height = poperation->input_tensor->dims[2]; + operation->input_channels = poperation->input_tensor->dims[3]; + 
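+ /* Only tensor indices are recorded at lowering time; they are resolved to the backing pipe_resources via etna_ml_get_tensor() once lower_operations() has created the buffers. */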
operation->input_zero_point = poperation->input_tensor->zero_point; + operation->input_scale = poperation->input_tensor->scale; + operation->input_tensor_size = operation->input_width * + operation->input_height * + operation->input_channels * + 2; + + operation->output_tensor = poperation->output_tensor->index; + operation->output_width = poperation->output_tensor->dims[1]; + operation->output_height = poperation->output_tensor->dims[2]; + operation->output_channels = poperation->output_tensor->dims[3]; + operation->output_zero_point = poperation->output_tensor->zero_point; + operation->output_scale = poperation->output_tensor->scale; + + operation->weight_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 8); + operation->weight_width = 2; + operation->weight_height = 2; + operation->weight_zero_point = 0x0; + operation->weight_scale = compute_weight_scale_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale); + operation->addition_offset = compute_addition_offset(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale); + + uint8_t *weight_map = map_resource(operation->weight_tensor); + memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor)); + weight_map[0] = compute_weight_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale); + + operation->bias_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 4); + int32_t *bias_map = map_resource(operation->bias_tensor); + bias_map[0] = compute_bias_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale, + poperation->add.input_tensor->zero_point, poperation->input_tensor->zero_point, + operation->weight_scale); +} + +#define ACCUM_BUFFER_DEPTH 64 +#define INPUT_BUFFER_DEPTH 12 +#define MAX_TILE_WIDTH 64 + +static unsigned +calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode) +{ + unsigned nn_core_count = ctx->screen->specs.nn_core_count; + unsigned kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count); + unsigned foo = (ACCUM_BUFFER_DEPTH * interleave_mode) / tile_y; + + if (operation->weight_width == 1) + foo = MIN2(foo, ACCUM_BUFFER_DEPTH / 3); + + foo = MIN2(foo, kernels_per_core); + foo = MIN2(foo, 127); + + kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count * foo); + unsigned num_kernels = DIV_ROUND_UP(operation->output_channels, kernels_per_core * nn_core_count); + unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, nn_core_count), num_kernels); + + /* TODO: Remove this once we support superblocks that don't divide output_channels in the compressed buffer */ + while(operation->output_channels % superblocks) + superblocks++; + + ML_DBG("superblocks %d\n", superblocks); + + return superblocks; +} + +static unsigned +calc_interleave_mode(unsigned tile_width, unsigned weight_height) +{ + unsigned mode = 8; + + if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2) + return 1; + + if (tile_width > MAX_TILE_WIDTH / 2) + mode = 1; + else if (tile_width > MAX_TILE_WIDTH / 4) + mode = 2; + else if (tile_width > MAX_TILE_WIDTH / 8) + mode = 4; + + if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) + return MIN2(mode, 4); + + return MIN2(mode, 2); +} + +static void +calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels, + unsigned *output_width, unsigned *output_height, unsigned 
*output_channels) +{ + ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels); + + unsigned channel_size = *input_width * *input_height; + unsigned width = 0; + if (channel_size % 128 == 0) + width = 128; + else if (channel_size % 64 == 0) + width = 64; + else if (channel_size % 32 == 0) + width = 32; + else { + for (int i = 63; i > 0; i--) { + if (channel_size % i == 0) { + width = i; + break; + } + } + } + + *input_height = (*input_width * *input_height * *input_channels) / width; + *input_width = width; + *input_channels = 2; + + *output_height = *output_width * *output_height * *output_channels / width; + *output_width = width; + *output_channels = 1; +} + +static unsigned +calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out) +{ + unsigned input_width = operation->input_width; + unsigned input_height = operation->input_height; + unsigned input_channels = operation->input_channels; + unsigned output_width = operation->output_width; + unsigned output_height = operation->output_height; + unsigned output_channels = operation->output_channels; + unsigned tile_width; + unsigned tile_height; + unsigned superblocks; + unsigned interleave_mode; + + if (operation->addition) + calc_addition_sizes(&input_width, &input_height, &input_channels, + &output_width, &output_height, &output_channels); + + if (operation->pooling_first_pixel) { + output_width *= 2; + output_height *= 2; + } + + tile_width = MIN2(output_width, 64); + interleave_mode = calc_interleave_mode(tile_width, operation->weight_height); + + tile_height = INPUT_BUFFER_DEPTH * interleave_mode - operation->weight_height + 1; + ML_DBG("INPUT_BUFFER_DEPTH %d interleave_mode %d operation->weight_height %d tile_height %d input_width %d output_width %d\n", INPUT_BUFFER_DEPTH, interleave_mode, operation->weight_height, tile_height, operation->input_width, output_width); + tile_height = MIN2(tile_height, interleave_mode * ACCUM_BUFFER_DEPTH); + //tile_height = MIN2(tile_height, operation->input_width); + tile_height = MIN2(tile_height, output_height); + + if (operation->stride > 1 && tile_height % 2 > 0) + tile_height -= 1; + + superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode); + ML_DBG("tiling x %d y %d sb %d\n", tile_width, tile_height, superblocks); + + if (tile_width_out) + *tile_width_out = tile_width; + + if (tile_height_out) + *tile_height_out = tile_height; + + return superblocks; +} + +static struct etna_bo * +create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct etna_bo *coefficients, unsigned coefficients_size) +{ + struct pipe_context *context = subgraph->base.context; + struct etna_context *ctx = etna_context(context); + unsigned nn_core_count = ctx->screen->specs.nn_core_count; + unsigned oc_sram_size = ctx->screen->specs.on_chip_sram_size; + struct etna_bo *bo = etna_bo_new(ctx->screen->dev, + sizeof(struct etna_nn_params), + DRM_ETNA_GEM_CACHE_WC); + unsigned input_width = operation->input_width; + unsigned input_height = operation->input_height; + unsigned input_channels = operation->input_channels; + unsigned output_width = operation->output_width; + unsigned output_height = operation->output_height; + unsigned output_channels = operation->output_channels; + unsigned weight_width = operation->weight_width; + unsigned weight_height = operation->weight_height; + + if (operation->pointwise && input_channels == 1) + weight_width = weight_height = 2; + + 
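+ /* Additions are executed as 2x2 convolutions over a reshaped image, so their hardware-visible geometry is recomputed by calc_addition_sizes() instead of being taken straight from the tensor dimensions. */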
if (operation->addition) + calc_addition_sizes(&input_width, &input_height, &input_channels, + &output_width, &output_height, &output_channels); + + unsigned input_size = input_width * input_height * input_channels; + + etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE); + + struct etna_nn_params *map = etna_bo_map(bo); + map->layer_type = 0x0; + map->no_z_offset = 0x0; + map->prelu = 0x0; + map->nn_layer_flush = 0x1; + map->brick_mode = 0x0; + map->brick_distance = 0x0; + map->relu = 0x0; + map->no_flush = 0x0; + map->rounding_mode = 0x1; + map->partial_cache_data_unit = 0x0; + map->depthwise = 0x0; + + map->unused0 = 0x0; + map->unused1 = 0x0; + map->unused2 = 0x0; + map->unused3 = 0x0; + map->unused4 = 0x0; + map->unused5 = 0x0; + map->unused6 = 0x0; + map->unused7 = 0x0; + map->unused8 = 0x0; + map->unused9 = 0x0; + map->unused10 = 0x0; + map->unused11 = 0x0; + map->unused12 = 0x0; + map->unused13 = 0x0; + map->unused14 = 0x0; + map->further1 = 0x0; + map->further2 = 0x0; + map->further3 = 0x3ffffff; + map->further4 = 0x7f800000; + map->further5 = 0xff800000; + map->further6 = 0x0; + map->further7 = 0x0; + map->further8 = 0x0; + + struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor); + unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor); + map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset; + map->in_image_x_size = input_width; + map->in_image_y_size = input_height; + map->in_image_x_stride = input_width; + map->in_image_y_stride = input_height; + map->in_image_data_type = ETNA_NN_INT8; + map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2; + map->in_image_circular_buf_size = 0x0; + map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6; + map->in_image_border_mode = 0x0; + map->in_image_border_const = operation->input_zero_point; + + if (operation->padding_same && operation->stride == 1 && weight_width > 2) { + if (weight_width < 5) { + map->in_image_x_offset = 0x7; + map->in_image_y_offset = 0x7; + } else { + map->in_image_x_offset = 0x6; + map->in_image_y_offset = 0x6; + } + map->in_image_x_offset_bit_3 = 0x1; + map->in_image_y_offset_bit_3 = 0x1; + } else { + map->in_image_x_offset = 0x0; + map->in_image_y_offset = 0x0; + map->in_image_x_offset_bit_3 = 0x0; + map->in_image_y_offset_bit_3 = 0x0; + } + + if (operation->padding_same && operation->stride == 2 && weight_width == 5) { + map->in_image_x_offset = 0x7; + map->in_image_y_offset = 0x7; + map->in_image_x_offset_bit_3 = 0x1; + map->in_image_y_offset_bit_3 = 0x1; + } + + struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor); + offset = etna_ml_get_offset(subgraph, operation->output_tensor); + map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset; + map->out_image_x_size = output_width; + map->out_image_y_size = output_height; + map->out_image_z_size = output_channels; + + map->out_image_x_stride = map->out_image_x_size; + map->out_image_y_stride = map->out_image_y_size; + + map->out_image_data_type = ETNA_NN_INT8; + map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2; + map->out_image_circular_buf_size = 0x0; + map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6; + map->out_zero_point = operation->output_zero_point; + + if (operation->pooling_first_pixel) { + map->pooling = ETNA_NN_POOLING_FIRST_PIXEL; + map->pooling_xy_size = 0x0; + + map->out_image_x_size *= 2; + map->out_image_y_size *= 2; + } else { + map->pooling = ETNA_NN_POOLING_NON; + map->pooling_xy_size = 0x1; + } + + unsigned tile_x, 
tile_y; + unsigned superblocks = calculate_tiling(ctx, operation, &tile_x, &tile_y); + map->out_image_tile_x_size = tile_x; + map->out_image_tile_y_size = tile_y; + + map->kernel_address = etna_bo_gpu_va(coefficients) >> 6; + map->kernel_xy_size = weight_width; + map->kernel_y_size = weight_height; + map->kernel_z_size = input_channels; + map->kernel_z_size2 = 0x0; + map->kernel_data_type = ETNA_NN_INT8; + map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2; + map->kernel_direct_stream_from_VIP_sram = 0x0; + + map->coef_zero_point = operation->weight_zero_point; + + map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks); + + /* Should be max accumBufferDepth (64) / zdpNum (3) */ + //assert(map->kernels_per_core <= (64 / 3)); + + /* The header doesn't get cached */ + coefficients_size -= 64; + + map->kernel_cache_start_address = 0x800; + map->kernel_cache_end_address = MAX2(MIN2(map->kernel_cache_start_address + coefficients_size, oc_sram_size), 0x1a00); + + if (output_channels <= 128 || map->kernel_cache_end_address == oc_sram_size) { + map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE; + map->image_cache_start_address = 0x0; + map->image_cache_end_address = 0x800; + } else { + map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE; + map->image_cache_start_address = map->kernel_cache_end_address; + map->image_cache_end_address = MIN2(map->image_cache_start_address + input_size + 1024, oc_sram_size); + } + + /* TODO: Look at re-enabling the image cache again */ + map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE; + map->image_cache_start_address = 0x0; + map->image_cache_end_address = 0x800; + + if (etna_bo_size(coefficients) <= 0x80000 - 0x800) { + map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE; + map->kernel_pattern_msb = 0x0; + map->kernel_pattern_low = 0x0; + map->kernel_pattern_high = 0x0; + } else { + /* Doesn't fit in the 512KB we have of on-chip SRAM */ + map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE; + if (map->out_image_z_size >= 1024) { + map->kernel_pattern_msb = 0x13; + map->kernel_pattern_low = 0x80000; + map->kernel_pattern_high = 0x0; + } else if (map->out_image_z_size >= 512) { + map->kernel_pattern_msb = 0x3d; + map->kernel_pattern_low = 0x0; + map->kernel_pattern_high = 0x2aaaaaa0; + } else if (map->out_image_z_size >= 256) { + map->kernel_pattern_msb = 0x3e; + map->kernel_pattern_low = 0xffffaaaa; + map->kernel_pattern_high = 0x7fffffff; + } else if (map->out_image_z_size >= 160) { + map->kernel_pattern_msb = 0x6; + map->kernel_pattern_low = 0x7e; + map->kernel_pattern_high = 0x0; + } else { + map->kernel_pattern_msb = 0x3f; + map->kernel_pattern_low = 0xfffffffe; + map->kernel_pattern_high = 0xffffffff; + } + } + + float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale; + uint32_t scale_bits = fui(conv_scale); + /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */ + unsigned shift = 127 + 31 - 32 - (scale_bits >> 23) + 16; + + /* Divides by 2 * (post_shift - 18), rounding to nearest integer. If result doesn't fit in 8 bits, it is clamped to 255. galcore sets to 15 if INT8, to 0 if UINT8. 
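For example, with input_scale = 0.5, weight_scale = 0.004 and output_scale = 0.008, conv_scale is 0.25 and fui(0.25) is 0x3e800000 with a biased exponent of 125, giving shift = 127 + 31 - 32 - 125 + 16 = 17; the top 15 bits of the mantissa (all zero here) become the fixed-point post multiplier.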
*/ + map->post_shift = shift & 0x1f; + map->post_shift_bit_5_6 = (shift >> 5) & 0x3; + + /* Multiplies by (multiplier * 2^15) */ + map->post_multiplier = (scale_bits >> 8) & 0x1; + map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f; + map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff; + + map->per_channel_post_mul = 0x0; + + etna_bo_cpu_fini(bo); + + return bo; +} + +static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation) +{ + int32_t correction = 0; + + for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) { + correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point; + } + + return correction; +} + +static void +write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation) +{ + struct pipe_context *pctx = subgraph->base.context; + unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count; + unsigned cores_used = MIN2(operation->output_channels, nn_core_count); + uint8_t *input = map_resource(operation->weight_tensor); + uint32_t *biases = map_resource(operation->bias_tensor); + unsigned out_values_per_channel = operation->output_width * operation->output_height; + unsigned stride = MIN2(operation->input_channels, 6); + unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL); + uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)]; + + ML_DBG("%s\n", __func__); + + for (unsigned superblock = 0; superblock < superblocks; superblock++) { + + unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks); + if (superblock == superblocks - 1) + kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks; + + for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; + weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * operation->input_channels; + } + + for (unsigned block = 0; block < DIV_ROUND_UP(operation->input_channels, stride); block++) { + for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; + + if (block == 0) { + *map++ = weights_maps[kernel][0]; + + uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation); + //fprintf(stderr, "core %d sb %d b %d kernel %d out_channel %d bias %x first %02x\n", core, superblock, block, kernel, out_channel, biases[out_channel] - corr, weights_maps[kernel][0]); + *((uint32_t *)map) = biases[out_channel] - corr; + map += sizeof(uint32_t); + + for (int i = 1; i < stride; i++) { + *map++ = weights_maps[kernel][i]; + } + } else { + for (int i = 0; i < stride; i++) { + if (i + block * stride < operation->input_channels) + *map++ = weights_maps[kernel][i + block * stride]; + } + } + if (block == DIV_ROUND_UP(operation->input_channels, stride) - 1) { + *((uint32_t*)map) = out_values_per_channel * out_channel; + map += sizeof(uint32_t); + } + } + } + } +} + +static void +write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const 
struct etna_operation *operation) +{ + struct pipe_context *pctx = subgraph->base.context; + unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count; + unsigned cores_used = MIN2(operation->output_channels, nn_core_count); + uint8_t *input = map_resource(operation->weight_tensor); + uint32_t *biases = map_resource(operation->bias_tensor); + unsigned out_values_per_channel = operation->output_width * operation->output_height; + unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL); + uint8_t (*weights_map)[operation->input_channels][operation->weight_width][operation->weight_height] = (void *)input; + + ML_DBG("%s core %d\n", __func__, core); + + for (unsigned superblock = 0; superblock < superblocks; superblock++) { + + unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks); + if (superblock == superblocks - 1) + kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks; + + for (unsigned z = 0; z < operation->input_channels; z++) { + for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; + +#if 0 + if (z == 0) + fprintf(stderr, "core %d DIV_ROUND_UP(kernels_per_core, superblocks) %d kernel %d superblock * (operation->output_channels / superblocks) %u out_channel %d\n", + core, DIV_ROUND_UP(kernels_per_core, superblocks), kernel, superblock * (operation->output_channels / superblocks + 4), out_channel); +#endif + + for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) { + unsigned stride = operation->weight_height; + if (operation->weight_height > 3) + stride = 3; + for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) { + if (x >= operation->weight_width) + break; + for (unsigned y = 0; y < stride; y++) { + //fprintf(stderr, "oc %d x %d y %d z %d: %02x\n", out_channel, x, y, z, weights_map[out_channel][z][x][y]); + *map++ = weights_map[out_channel][z][x][y]; + if (x == 0 && y == 0 && z == 0) { + uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation); + //fprintf(stderr, "core %d sb %d ic %d out_channel %d kernel %d bias %x first %02x\n", core, superblock, z, out_channel, kernel, biases[out_channel] - corr, weights_map[out_channel][z][x][y]); + *((uint32_t *)map) = biases[out_channel] - corr; + map += sizeof(uint32_t); + } + } + } + if (operation->weight_height > 3) { + for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) { + if (x >= operation->weight_width) + break; + for (unsigned y = stride; y < operation->weight_width; y++) { + //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[out_channel][z][x][y]); + *map++ = weights_map[out_channel][z][x][y]; + } + } + } + } + + if (z == operation->input_channels - 1) { + *((uint32_t*)map) = out_values_per_channel * out_channel; + map += sizeof(uint32_t); + } + } + } + } +} + +static void +write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation) +{ + struct pipe_context *pctx = subgraph->base.context; + unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count; + unsigned cores_used = MIN2(operation->output_channels, nn_core_count); + uint8_t *input = map_resource(operation->weight_tensor); + uint32_t *biases = map_resource(operation->bias_tensor); + unsigned 
out_values_per_channel = operation->output_width * operation->output_height; + unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL); + + ML_DBG("%s: superblocks %d channels %d\n", __func__, superblocks, operation->output_channels); + + for (unsigned superblock = 0; superblock < superblocks; superblock++) { + + unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks); + if (superblock == superblocks - 1) + kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks; + + for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; + + uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height; + + for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) { + unsigned stride = operation->weight_height; + if ((operation->depthwise || operation->input_width > 64) && \ + operation->weight_height > 3) + stride = 3; + for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) { + if (x >= operation->weight_width) + break; + for (unsigned y = 0; y < stride; y++) { + //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]); + + *map++ = weights_map[x][y]; + if (x == 0 && y == 0) { + uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation); + *((uint32_t *)map) = biases[out_channel] - corr; + map += sizeof(uint32_t); + } + } + } + if ((operation->depthwise || operation->input_width > 64) && \ + operation->weight_height > 3) { + for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) { + if (x >= operation->weight_width) + break; + for (unsigned y = stride; y < operation->weight_width; y++) { + //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]); + *map++ = weights_map[x][y]; + } + } + } + } + if (operation->addition) { + *((uint32_t*)map) = operation->addition_offset; + } else + *((uint32_t*)map) = out_values_per_channel * out_channel; + map += sizeof(uint32_t); + } + } +} + +static struct etna_bo * +create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size) +{ + /* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */ + struct pipe_context *context = subgraph->base.context; + struct etna_context *ctx = etna_context(context); + unsigned nn_core_count = ctx->screen->specs.nn_core_count; + unsigned header_size = ALIGN(nn_core_count * 4, 64); + unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */ + unsigned input_channels; + unsigned output_channels = operation->addition ? 1 : operation->output_channels; + unsigned cores_used = MIN2(output_channels, nn_core_count); + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used); + uint8_t zero_length_encoding = false; + unsigned weights_size; + unsigned core_size; + unsigned core_size_aligned; + + input_channels = operation->addition ? 
+static struct etna_bo *
+create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size)
+{
+   /* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
+   unsigned header_size = ALIGN(nn_core_count * 4, 64);
+   unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */
+   unsigned input_channels;
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
+   uint8_t zero_length_encoding = false;
+   unsigned weights_size;
+   unsigned core_size;
+   unsigned core_size_aligned;
+
+   input_channels = operation->addition ? 1 : operation->input_channels;
+   weights_size = operation->weight_width * operation->weight_height * input_channels * weight_item_size;
+   core_size = 3 + (weights_size + 4 + 4) * kernels_per_core;
+   core_size_aligned = ALIGN(core_size, 64);
+   *size = header_size + core_size_aligned * cores_used;
+
+   struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
+                                            *size,
+                                            DRM_ETNA_GEM_CACHE_WC);
+
+   etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);
+
+   uint8_t *map = etna_bo_map(compressed);
+   uint32_t *header = (uint32_t *)map;
+
+   memset(map, 0, *size);
+
+   for (unsigned core = 0; core < cores_used; core++)
+      header[core] = core_size_aligned;
+
+   map += header_size;
+
+#if 0
+   uint8_t *input = map_resource(operation->weight_tensor);
+   for (int i = 0; i < operation->output_channels * operation->input_channels * operation->weight_width * operation->weight_height; i++)
+      fprintf(stderr, "i %d: %02x\n", i, input[i]);
+#endif
+
+   for (unsigned core = 0; core < cores_used; core++) {
+
+      *map++ = zero_length_encoding;
+
+      *((uint16_t *)map) = kernels_per_core;
+      map += sizeof(uint16_t);
+
+      if (operation->pointwise && input_channels >= 1 && output_channels > 8)
+         write_6_weight_format(subgraph, map, kernels_per_core, core, operation);
+      else if (input_channels > 1)
+         write_interleaved_weight_format(subgraph, map, kernels_per_core, core, operation);
+      else
+         write_sequential_weight_format(subgraph, map, kernels_per_core, core, operation);
+
+      map += core_size_aligned - 3;
+   }
+
+   etna_bo_cpu_fini(compressed);
+
+   return compressed;
+}
+
+void
+etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
+                             struct etna_vip_instruction *instruction)
+{
+   unsigned coefficients_size;
+
+   instruction->type = ETNA_JOB_TYPE_NN;
+   instruction->coefficients = create_coefficients_bo(subgraph, operation, &coefficients_size);
+
+   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
+   assert(input);
+   pipe_resource_reference(&instruction->input, input);
+
+   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
+   assert(output);
+   pipe_resource_reference(&instruction->output, output);
+
+   instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coefficients_size);
+}
+
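+/* Emits the command-stream state that kicks one NN job. The offset fed to
+ * the VIVS_PS_NN_INST_ADDR reloc (and mirrored in VIVS_PS_UNK10A4) appears
+ * to encode the job's slot within a batch: idx + 1 normally, or 0 with
+ * SMALL_BATCH when ETNA_DBG_NPU_NO_PARALLEL is set. This reading is an
+ * assumption based on the code below, not on hardware documentation.
+ */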
+void
+etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
+                          struct etna_vip_instruction *operation,
+                          unsigned idx)
+{
+   struct pipe_context *pctx = subgraph->base.context;
+   struct etna_context *ctx = etna_context(pctx);
+   struct etna_cmd_stream *stream = ctx->stream;
+   unsigned offset = idx + 1;
+   unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */
+
+   if (DBG_ENABLED(ETNA_DBG_NPU_NO_PARALLEL)) {
+      nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
+      offset = 0;
+   }
+
+   etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
+   etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
+
+   etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
+   etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
+      .bo = operation->configs[0],
+      .flags = ETNA_RELOC_READ,
+      .offset = offset,
+   });
+   etna_set_state(stream, VIVS_PS_UNK10A4, offset);
+}
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
new file mode 100644
index 00000000000..468db300f6a
--- /dev/null
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023-2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "etnaviv_ml.h"
+
+void
+etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
+                          const struct pipe_ml_operation *poperation,
+                          struct etna_operation *operation);
+
+void
+etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
+                  const struct pipe_ml_operation *poperation,
+                  struct etna_operation *operation);
+
+void
+etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph,
+                             const struct etna_operation *operation,
+                             struct etna_vip_instruction *instruction);
+
+void
+etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
+                          struct etna_vip_instruction *operation,
+                          unsigned idx);
\ No newline at end of file
diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c
index e1dcb0bc496..e4c40452e72 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -59,6 +59,7 @@ static const struct debug_named_value etna_debug_options[] = {
    {"resource_msgs", ETNA_DBG_RESOURCE_MSGS, "Print resource messages"},
    {"compiler_msgs", ETNA_DBG_COMPILER_MSGS, "Print compiler messages"},
    {"linker_msgs", ETNA_DBG_LINKER_MSGS, "Print linker messages"},
+   {"ml_msgs", ETNA_DBG_ML_MSGS, "Print ML messages"},
    {"dump_shaders", ETNA_DBG_DUMP_SHADERS, "Dump shaders"},
    {"no_ts", ETNA_DBG_NO_TS, "Disable TS"},
    {"no_autodisable", ETNA_DBG_NO_AUTODISABLE, "Disable autodisable"},
@@ -76,6 +77,8 @@ static const struct debug_named_value etna_debug_options[] = {
    {"msaa", ETNA_DBG_MSAA, "Enable MSAA support"},
    {"shared_ts", ETNA_DBG_SHARED_TS, "Enable TS sharing"},
    {"perf", ETNA_DBG_PERF, "Enable performance warnings"},
+   {"npu_no_parallel",ETNA_DBG_NPU_NO_PARALLEL, "Disable parallelism inside NPU batches"},
+   {"npu_no_batching",ETNA_DBG_NPU_NO_BATCHING, "Disable batching NPU jobs"},
    DEBUG_NAMED_VALUE_END
 };
diff --git a/src/gallium/drivers/etnaviv/meson.build b/src/gallium/drivers/etnaviv/meson.build
index cc450fc2e72..0d60305e1ce 100644
--- a/src/gallium/drivers/etnaviv/meson.build
+++ b/src/gallium/drivers/etnaviv/meson.build
@@ -57,6 +57,10 @@ files_etnaviv = files(
   'etnaviv_format.c',
   'etnaviv_format.h',
   'etnaviv_internal.h',
+  'etnaviv_ml.c',
+  'etnaviv_ml.h',
+  'etnaviv_ml_nn.c',
+  'etnaviv_ml_nn.h',
   'etnaviv_nir_lower_source_mods.c',
   'etnaviv_nir_lower_texture.c',
   'etnaviv_nir_lower_ubo_to_uniform.c',