NEConvolutionLayer Segmentation Fault
poltomo opened this issue · 6 comments
I am getting a segmentation fault for a simple 3x3 convolution.
#include"arm_compute/core/Types.h"
#include"arm_compute/runtime/NEON/NEFunctions.h"
#include"utils/Utils.h"
#include<chrono>
#include<iostream>
#define HI 64343.324234
#define LO -64343.324234
using namespace std;
using namespace arm_compute;
struct Timer {
    std::chrono::time_point<std::chrono::high_resolution_clock> start;
    std::chrono::duration<double>* time;
    Timer(std::chrono::duration<double>* time) : start{std::chrono::high_resolution_clock::now()}, time{time} {}
    ~Timer() {
        auto end = std::chrono::high_resolution_clock::now();
        *time += (end - start);
    }
};
// Fill a tensor with uniform random values in [lo, hi] (or raw bytes for quantized types).
void fill_tensor(Tensor& tensor, DataType dt, float lo = LO, float hi = HI) {
    switch (dt) {
        case DataType::F32:
            for (size_t i = 0; i < tensor.info()->tensor_shape().total_size(); ++i) {
                reinterpret_cast<float*>(tensor.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
            }
            break;
        case DataType::F16:
            for (size_t i = 0; i < tensor.info()->tensor_shape().total_size(); ++i) {
                reinterpret_cast<__fp16*>(tensor.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
            }
            break;
        case DataType::QSYMM8:
        case DataType::QASYMM8:
            for (size_t i = 0; i < tensor.info()->tensor_shape().total_size(); ++i) {
                tensor.buffer()[i] = rand() % 256;
            }
            break;
        default:
            break;
    }
}
// Zero out a tensor's backing buffer.
void memset_tensor(Tensor& tensor, DataType dt) {
    switch (dt) {
        case DataType::F32:
            memset(tensor.buffer(), 0, tensor.info()->tensor_shape().total_size() * sizeof(float));
            break;
        case DataType::F16:
            memset(tensor.buffer(), 0, tensor.info()->tensor_shape().total_size() * sizeof(__fp16));
            break;
        case DataType::QSYMM8:
        case DataType::QASYMM8:
            memset(tensor.buffer(), 0, tensor.info()->tensor_shape().total_size());
            break;
        default:
            break;
    }
}
int main()
{
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;
    const unsigned int N  = 1;
    const unsigned int Hi = 256;
    const unsigned int Wi = 256;
    const unsigned int Ci = 240;
    const unsigned int Hf = 3;
    const unsigned int Wf = 3;
    const unsigned int Ho = Hi - Hf + 1;
    const unsigned int Wo = Wi - Wf + 1;
    const unsigned int Co = 64;
    cout << "N " << N << endl;
    cout << "Hi " << Hi << endl;
    cout << "Wi " << Wi << endl;
    cout << "Ci " << Ci << endl;
    cout << "Hf " << Hf << endl;
    cout << "Wf " << Wf << endl;
    cout << "Ho " << Ho << endl;
    cout << "Wo " << Wo << endl;
    cout << "Co " << Co << endl;
    auto data_type   = DataType::F32;
    auto input_info  = TensorInfo(TensorShape(Ci, Wi, Hi), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(Co, Hf, Wf, Ci), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(Co, Wo, Ho), 1, data_type, DataLayout::NHWC);
    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);
    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();
    NEConvolutionLayer conv5{};
    conv5.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
    conv5.run(); // warm-up run, not timed
    std::chrono::duration<double> total_time5(0);
    double n = 5;
    for (int j = 0; j < n; ++j) {
        fill_tensor(conv_input, data_type);
        memset_tensor(conv_output, data_type);
        {
            Timer timer(&total_time5);
            conv5.run();
        }
    }
    std::cout << (total_time5.count() / n) << "\n";
}
Hi @poltomo,
The exact example above was compiled against an Arm Compute Library build for armv8 on Linux and ran without a segmentation fault.
Here are the steps taken:
$ scons -s -j 8 Werror=0 debug=0 arch=armv8a os=linux neon=1 validation_tests=0 build_dir=neconv_example opencl=0
$ aarch64-none-linux-gnu-g++ --version
aarch64-none-linux-gnu-g++ (fsf-10.128) 10.2.1 20201112
$ aarch64-none-linux-gnu-g++ examples/neconv.cpp -I. -I include/ utils/Utils.cpp -std=c++14 -L build/neconv_example/ -larm_compute -o ne_conv_layer
- Running on a Linux board with an armv8 architecture, we got the following output, with exit code 0:
$ ./ne_conv_layer
N 1
Hi 256
Wi 256
Ci 240
Hf 3
Wf 3
Ho 254
Wo 254
Co 64
0.569512
$ echo $?
0
- Could you please provide more information to reproduce the segmentation fault?
Thanks
@ramelg01
I think the operation just wasn't supported; that's probably why it segfaulted. I confirmed this is the case for some op configurations with the validate function (see below).
Below is an updated, cleaner benchmark that checks whether each op is supported.
I built libarm_compute.so with:
CC=aarch64-linux-android26-clang CXX=aarch64-linux-android26-clang++ scons build_dir=build_neon_flags/ toolchain_prefix="" Werror=1 -j4 debug=0 asserts=0 neon=1 cppthreads=0 openmp=0 opencl=0 embed_kernels=1 os=android arch=arm64-v8a extra_cxx_flags="-Ofast -ffast-math -funsafe-math-optimizations"
#include "arm_compute/core/Types.h"
#include "utils/Utils.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include<iostream>
#include "my_benchmark.hpp"
#define HIGH 12312.232
#define LOW -12312.232
// #ifndef DATA_TYPE
#define DATA_TYPE F32 // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 3
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 3
// #endif // ifndef N
#define TRIALS 2.0
using namespace std;
using namespace arm_compute;
// Fill a tensor with uniform random values in [lo, hi] (or raw bytes for quantized types).
void fill_tensor(Tensor& tensor, DataType dt, float lo = LOW, float hi = HIGH) {
    switch (dt) {
        case DataType::F32:
            for (size_t i = 0; i < tensor.info()->tensor_shape().total_size(); ++i) {
                reinterpret_cast<float*>(tensor.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
            }
            break;
        case DataType::F16:
            for (size_t i = 0; i < tensor.info()->tensor_shape().total_size(); ++i) {
                reinterpret_cast<__fp16*>(tensor.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
            }
            break;
        case DataType::QSYMM8:
        case DataType::QASYMM8:
            for (size_t i = 0; i < tensor.info()->tensor_shape().total_size(); ++i) {
                tensor.buffer()[i] = rand() % 256;
            }
            break;
        default:
            break;
    }
}
// Zero out a tensor's backing buffer.
void memset_tensor(Tensor& tensor, DataType dt) {
    switch (dt) {
        case DataType::F32:
            memset(tensor.buffer(), 0, tensor.info()->tensor_shape().total_size() * sizeof(float));
            break;
        case DataType::F16:
            memset(tensor.buffer(), 0, tensor.info()->tensor_shape().total_size() * sizeof(__fp16));
            break;
        case DataType::QSYMM8:
        case DataType::QASYMM8:
            memset(tensor.buffer(), 0, tensor.info()->tensor_shape().total_size());
            break;
        default:
            break;
    }
}
int main() {
    // print benchmark info
    switch (DataType::DATA_TYPE) {
        case DataType::F32:
            cout << "F32" << '\n';
            break;
        case DataType::F16:
            cout << "F16" << '\n';
            break;
        case DataType::QSYMM8:
            cout << "QSYMM8" << '\n';
            break;
        case DataType::QASYMM8:
            cout << "QASYMM8" << '\n';
            break;
        default:
            break;
    }
    cout << "N " << BATCH_N << '\n';
    cout << "Hi " << HI << '\n';
    cout << "Wi " << WI << '\n';
    cout << "Ci " << CI << '\n';
    cout << "Hf " << HF << '\n';
    cout << "Wf " << WF << '\n';
    cout << "Ho " << HO << '\n';
    cout << "Wo " << WO << '\n';
    cout << "Co " << CO << endl;
    // test initialization
    auto data_type   = DataType::DATA_TYPE;
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;
    auto input_info  = TensorInfo(TensorShape(CI, WI, HI, BATCH_N), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(CO, HF, WF, CI), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(CO, WO, HO, BATCH_N), 1, data_type, DataLayout::NHWC);
    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);
    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();
    NEDirectConvolutionLayer   conv1{};
    NEGEMMConvolutionLayer     conv2{};
    NEWinogradConvolutionLayer conv3{};
    if (NEDirectConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0))) {
        cout << "NEDirectConvolutionLayer" << '\n';
        conv1.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv1.run(); // warm-up run, not timed
        std::chrono::duration<double> total_time(0);
        double trials = TRIALS;
        for (int j = 0; j < trials; ++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv1.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    } else {
        std::cout << "NEDirectConvolutionLayer not supported" << "\n";
    }
    if (NEGEMMConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0), WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), true, 1U)) {
        cout << "NEGEMMConvolutionLayer" << '\n';
        conv2.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv2.run(); // warm-up run, not timed
        std::chrono::duration<double> total_time(0);
        double trials = TRIALS;
        for (int j = 0; j < trials; ++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv2.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    } else {
        std::cout << "NEGEMMConvolutionLayer not supported" << "\n";
    }
    if (NEWinogradConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0), ActivationLayerInfo(), true)) {
        cout << "NEWinogradConvolutionLayer" << '\n';
        conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv3.run(); // warm-up run, not timed
        std::chrono::duration<double> total_time(0);
        double trials = TRIALS;
        for (int j = 0; j < trials; ++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv3.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    } else {
        std::cout << "NEWinogradConvolutionLayer not supported" << "\n";
    }
    conv_input.allocator()->free();
    conv_output.allocator()->free();
    conv_weight.allocator()->free();
}
Output:
F32
N 1
Hi 1024
Wi 1024
Ci 3
Hf 1
Wf 1
Ho 1024
Wo 1024
Co 3
NEDirectConvolutionLayer
0.0791217
NEGEMMConvolutionLayer
0.00665224
NEWinogradConvolutionLayer not supported
In this case, that's reasonable, since GEMM is the most performant implementation across the board for 1x1 convs.
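A 1x1, stride-1 convolution is pointwise, so it collapses to a single GEMM: an (Hi*Wi) x Ci input matrix times a Ci x Co weight matrix. A plain-loop sketch of the equivalence (illustrative only, not ACL code; conv1x1_as_gemm is a hypothetical helper):
// Illustration: a 1x1, stride-1, NHWC convolution is exactly one GEMM,
// out[(Hi*Wi) x Co] = in[(Hi*Wi) x Ci] * w[Ci x Co].
void conv1x1_as_gemm(const float* in, const float* w, float* out,
                     int Hi, int Wi, int Ci, int Co) {
    for (int p = 0; p < Hi * Wi; ++p)      // each spatial position is a GEMM row
        for (int co = 0; co < Co; ++co) {  // each output channel is a GEMM column
            float acc = 0.f;
            for (int ci = 0; ci < Ci; ++ci)
                acc += in[p * Ci + ci] * w[ci * Co + co];
            out[p * Co + co] = acc;
        }
}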
But why does the following dimension configuration not get any support?
I've seen this exact conv in MobileNet.
#define DATA_TYPE F32 // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 32
#define HF 1
#define WF 1
#define HO ((HI) - (HF) + 1)
#define WO ((WI) - (WF) + 1)
#define CO 120
@ramelg01
Basically, no 1x1 convs are working for big channel_in and channel_out sizes.
@ramelg01
Also, how do I benchmark a conv implementation minus any runtime/scheduler activity?
Hi @poltomo
The 1x1 case is not supported in Winograd convolution.
An error is thrown when running the Winograd convolution configure:
conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
terminate called after throwing an instance of 'std::runtime_error'
what(): in validate src/cpu/operators/CpuWinogradConv2d.cpp:347: Unsupported kernel size: 1 x 1.
Winograd breaks the dot product down into smaller pieces; with a 1x1 kernel those pieces can't be shared with adjacent convolution outputs, so the transforms only incur extra processing. For example, F(2,3) Winograd computes two outputs of a 3-tap filter with 4 multiplications instead of 6, a saving a 1-tap kernel cannot offer. So 1x1 Winograd would only increase overhead, thereby deteriorating performance.
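For what it's worth, validate() returns an arm_compute::Status carrying the same message, so unsupported paths can be reported without letting configure() throw. A minimal sketch, reusing the tensors from the benchmark above:
// Minimal sketch: surface the reason a path is unsupported via the Status
// returned by validate(), instead of catching a std::runtime_error from configure().
const Status status = NEWinogradConvolutionLayer::validate(
    conv_input.info(), conv_weight.info(), nullptr, conv_output.info(),
    PadStrideInfo(1, 1, 0, 0));
if (!status) {
    std::cout << status.error_description() << std::endl; // e.g. "Unsupported kernel size: 1 x 1."
}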