ARM-software/ComputeLibrary

Latency for Conv2d and Depthwise

Closed this issue · 2 comments

arm_compute_version=v23.05 Build options: {'Werror': '1', 'debug': '0', 'asserts': '1', 'neon': '1', 'opencl': '0', 'os': 'linux', 'arch': 'armv8a'} Git hash=b'6c713f090601ea839d944a30888ea56eb2f43988'

Platform: Raspberry-Pi 3B A53

Operating System: Linux

Problem description: The latency of a depthwise convolution is affected by the preceding convolution.

There are two convolutions: a 3x3 conv2d followed by a 3x3 depthwise convolution. The output of the conv2d is the input of the depthwise convolution, so I use the same memory buffer to hold that feature map. In this case the latency of the conv2d is 13 ms and the latency of the depthwise convolution is 4 ms. If I don't use the same memory buffer, the latency of the depthwise convolution is 1 ms.

Using the same buffer (conv_output_buffer):

{
    // Reproduction, "same buffer" variant: conv (3x3) -> depthwise (3x3) -> conv1 (1x1).
    // The depthwise input tensor is backed by conv_output_buffer, i.e. the very memory
    // the first convolution writes its result to.
    // None of the malloc'd buffers is initialised anywhere in this snippet.
    // NOTE(review): the code that printed the "Time used is ..." lines is not shown here.

    // --- Layer 1: 3x3 convolution, 3x320x320 -> 16x160x160 (F32, NHWC) ---
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // Buffer sizes are 4 bytes (one F32) times the element count of the matching shape.
    uint8_t *conv_input_buffer  = (uint8_t *)malloc(4 * 320 * 320 * 3);
    uint8_t *conv_weight_buffer = (uint8_t *)malloc(4 * 16 * 3 * 3 * 3);
    uint8_t *conv_bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *conv_output_buffer = (uint8_t *)malloc(4 * 16 * 160 * 160);

    // The tensors use the caller-owned buffers; they are released with free() at the end.
    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    // --- Layer 2: 3x3 depthwise convolution, 16x160x160 -> 16x80x80 ---
    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // input_buffer is allocated but never imported in this variant; it is only freed below.
    uint8_t *input_buffer  = (uint8_t *)malloc(4 * 160 * 160 * 16);
    uint8_t *weight_buffer = (uint8_t *)malloc(4 * 3 * 3 * 16);
    uint8_t *bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *output_buffer = (uint8_t *)malloc(4 * 16 * 80 * 80);

    // Key line of this variant: the depthwise input shares memory with conv_output.
    input.allocator()->import_memory(conv_output_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    // --- Layer 3: 1x1 convolution, 16x80x80 -> 8x80x80 ---
    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // conv1_input_buffer is allocated but never imported; conv1_input uses output_buffer instead.
    uint8_t *conv1_input_buffer  = (uint8_t *)malloc(4 * 80 * 80 * 16);
    uint8_t *conv1_weight_buffer = (uint8_t *)malloc(4 * 16 * 1 * 1 * 8);
    uint8_t *conv1_bias_buffer   = (uint8_t *)malloc(4 * 8);
    uint8_t *conv1_output_buffer = (uint8_t *)malloc(4 * 8 * 80 * 80);

    // The 1x1 convolution reads the depthwise output directly (shared output_buffer).
    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));

    // Run the three layers back to back five times, printing a separator per iteration.
    for(int i = 0; i < 5; i++)
    {
   
        conv.run();
       
        depth_conv.run();

        conv1.run();
   
        std::cout << "=============================" << std::endl;
    }

    // Release every allocation, including the two buffers that were never imported.
    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);
}

Not using the same buffer:

{
    // Reproduction, "different buffer" variant: same three layers as the shared-buffer
    // listing, but the depthwise input tensor is backed by its own input_buffer.
    // Nothing in this snippet writes to input_buffer, so the depthwise layer reads
    // memory that is neither initialised nor produced by the first convolution.
    // NOTE(review): reading never-written malloc memory may behave differently from
    // reading memory the previous layer just wrote, which could account for the lower
    // depthwise latency reported for this variant - to be confirmed by measurement.

    // --- Layer 1: 3x3 convolution, 3x320x320 -> 16x160x160 (F32, NHWC) ---
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // Buffer sizes are 4 bytes (one F32) times the element count of the matching shape.
    uint8_t *conv_input_buffer  = (uint8_t *)malloc(4 * 320 * 320 * 3);
    uint8_t *conv_weight_buffer = (uint8_t *)malloc(4 * 16 * 3 * 3 * 3);
    uint8_t *conv_bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *conv_output_buffer = (uint8_t *)malloc(4 * 16 * 160 * 160);

    // The tensors use the caller-owned buffers; they are released with free() at the end.
    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    // --- Layer 2: 3x3 depthwise convolution, 16x160x160 -> 16x80x80 ---
    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *input_buffer  = (uint8_t *)malloc(4 * 160 * 160 * 16);
    uint8_t *weight_buffer = (uint8_t *)malloc(4 * 3 * 3 * 16);
    uint8_t *bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *output_buffer = (uint8_t *)malloc(4 * 16 * 80 * 80);

    // Key line of this variant: the depthwise input has its own buffer, so the result
    // the first convolution stores in conv_output_buffer is not what this layer reads.
    input.allocator()->import_memory(input_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    // --- Layer 3: 1x1 convolution, 16x80x80 -> 8x80x80 ---
    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // conv1_input_buffer is allocated but never imported; conv1_input uses output_buffer instead.
    uint8_t *conv1_input_buffer  = (uint8_t *)malloc(4 * 80 * 80 * 16);
    uint8_t *conv1_weight_buffer = (uint8_t *)malloc(4 * 16 * 1 * 1 * 8);
    uint8_t *conv1_bias_buffer   = (uint8_t *)malloc(4 * 8);
    uint8_t *conv1_output_buffer = (uint8_t *)malloc(4 * 8 * 80 * 80);

    // The 1x1 convolution still shares output_buffer with the depthwise output,
    // exactly as in the shared-buffer listing; only the depthwise input differs.
    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));

    // Run the three layers back to back five times, printing a separator per iteration.
    for(int i = 0; i < 5; i++)
    {
   
        conv.run();
       
        depth_conv.run();

        conv1.run();
   
        std::cout << "=============================" << std::endl;
    }

    // Release every allocation, including the buffer that was never imported.
    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);
}

Time used is 13.863
Time used is 4.076
Time used is 0.593
=============================

Time used is 13.736
Time used is 1.204
Time used is 0.66

Hi @wenhyan

I made some changes to your code to assess the performance and I don't see any differences. I tried on A73 and built the test with -O3. The library was built with scons os=linux opencl=0 asserts=0 examples=0 neon=1 arch=armv8a benchmark_examples=0 examples=0 arch=armv8a debug=0 validation_tests=0 opencl=0

See the output of the two binaries:

# LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./same_latency 
SAVE BUFFER
 same buffers 92ms to run.
# LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./diff_latency 
Different buffer
DIFF BUFFER
 diff buffers 91ms to run.

And the code is below:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/Utils.h"
#include "tests/SimpleTensor.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include <chrono>


using namespace std;
using namespace arm_compute;
using namespace arm_compute::test;



// Benchmark driver comparing the two buffer set-ups from the report.
// The preprocessor switch below selects the variant: `#if 0` compiles the
// "different buffer" branch; changing it to `#if 1` compiles the shared-buffer branch.
// Each branch builds conv (3x3) -> depthwise (3x3) -> conv1 (1x1) on malloc'd,
// uninitialised buffers and prints the total wall-clock time in milliseconds.
// Returns 0 unconditionally.
int main()
{

#if 0
    // ===== Variant A (disabled): depthwise input shares conv_output_buffer =====
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // Buffer sizes are 4 bytes (one F32) times the element count of the matching shape.
    uint8_t *conv_input_buffer  = (uint8_t *)malloc(4 * 320 * 320 * 3);
    uint8_t *conv_weight_buffer = (uint8_t *)malloc(4 * 16 * 3 * 3 * 3);
    uint8_t *conv_bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *conv_output_buffer = (uint8_t *)malloc(4 * 16 * 160 * 160);

    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // input_buffer is allocated but never imported in this variant; it is only freed below.
    uint8_t *input_buffer  = (uint8_t *)malloc(4 * 160 * 160 * 16);
    uint8_t *weight_buffer = (uint8_t *)malloc(4 * 3 * 3 * 16);
    uint8_t *bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *output_buffer = (uint8_t *)malloc(4 * 16 * 80 * 80);

    // Shared buffer: the depthwise layer reads what the first convolution wrote.
    input.allocator()->import_memory(conv_output_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // conv1_input_buffer is allocated but never imported; conv1_input uses output_buffer instead.
    uint8_t *conv1_input_buffer  = (uint8_t *)malloc(4 * 80 * 80 * 16);
    uint8_t *conv1_weight_buffer = (uint8_t *)malloc(4 * 16 * 1 * 1 * 8);
    uint8_t *conv1_bias_buffer   = (uint8_t *)malloc(4 * 8);
    uint8_t *conv1_output_buffer = (uint8_t *)malloc(4 * 8 * 80 * 80);

    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));

    std::cout << "SAVE BUFFER\n";

    // The clock starts before the single un-looped pass, so the reported total covers
    // 26 passes (1 + 25) through all three layers.
    // NOTE(review): the first pass presumably includes one-time preparation work; confirm
    // whether it should be excluded when comparing steady-state latency.
    auto start_time = std::chrono::high_resolution_clock::now();
        conv.run();
        depth_conv.run();
        conv1.run();
 
    for(int i = 0; i < 25; i++)
    {
        conv.run();
        depth_conv.run();
        conv1.run();
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto time = end_time - start_time;
    // Dividing by a 1 ms duration yields the elapsed time as a count of milliseconds.
    std::cout << " same buffers "<<   time/std::chrono::milliseconds(1) << "ms to run.\n";

    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer); 
#else
    // ===== Variant B (compiled): depthwise input uses its own input_buffer =====
    std::cout << "Different buffer\n";

    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // Buffer sizes are 4 bytes (one F32) times the element count of the matching shape.
    uint8_t *conv_input_buffer  = (uint8_t *)malloc(4 * 320 * 320 * 3);
    uint8_t *conv_weight_buffer = (uint8_t *)malloc(4 * 16 * 3 * 3 * 3);
    uint8_t *conv_bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *conv_output_buffer = (uint8_t *)malloc(4 * 16 * 160 * 160);

    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *input_buffer  = (uint8_t *)malloc(4 * 160 * 160 * 16);
    uint8_t *weight_buffer = (uint8_t *)malloc(4 * 3 * 3 * 16);
    uint8_t *bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *output_buffer = (uint8_t *)malloc(4 * 16 * 80 * 80);

    // Separate buffer: nothing in this function writes to input_buffer, so the depthwise
    // layer does not consume the result the first convolution stores in conv_output_buffer.
    input.allocator()->import_memory(input_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // conv1_input_buffer is allocated but never imported; conv1_input uses output_buffer instead.
    uint8_t *conv1_input_buffer  = (uint8_t *)malloc(4 * 80 * 80 * 16);
    uint8_t *conv1_weight_buffer = (uint8_t *)malloc(4 * 16 * 1 * 1 * 8);
    uint8_t *conv1_bias_buffer   = (uint8_t *)malloc(4 * 8);
    uint8_t *conv1_output_buffer = (uint8_t *)malloc(4 * 8 * 80 * 80);

    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));
    std::cout << "DIFF BUFFER\n";

    // The clock starts before the single un-looped pass, so the reported total covers
    // 26 passes (1 + 25) through all three layers.
    // NOTE(review): the first pass presumably includes one-time preparation work; confirm
    // whether it should be excluded when comparing steady-state latency.
    auto start_time = std::chrono::high_resolution_clock::now();
        conv.run();
        depth_conv.run();
        conv1.run();
 
    for(int i = 0; i < 25; i++)
    {
        conv.run();
        depth_conv.run();
        conv1.run();
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto time = end_time - start_time;
    // Dividing by a 1 ms duration yields the elapsed time as a count of milliseconds.
    std::cout << " diff buffers "<<   time/std::chrono::milliseconds(1) << "ms to run.\n";


    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);

#endif

    return 0;
}

Hi @morgolock Thx. I will try it again.