Segmentation fault
kangheng opened this issue · 7 comments
I've recorded the following pytorch program with uftrace.
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
#from apex import amp
from torch.profiler import profile, record_function, ProfilerActivity
#from torchsummary import summary
import logging

logging.basicConfig(level='INFO')

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=1, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--bsize', default=8, type=int, metavar='N',
                        help='number of batch_size')
    args = parser.parse_args()
    #args.world_size = args.gpus * args.nodes
    args.world_size = 1
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = '1'
    os.environ["MKL__NUM_THREADS"] = '1'
    os.environ['MASTER_ADDR'] = '109.105.115.156'
    os.environ['MASTER_PORT'] = '8087'
    #os.environ['CUDA_LAUNCH_BLOCKING']='1'
    logging.info("main args")
    #mp.spawn(train, nprocs=args.gpus, args=(args,))
    mp.spawn(train, nprocs=1, args=(args,))
    logging.info("main spawn")

class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(4096*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

def train(gpu, args):
    #print('calnum begin',flush=True)
    #rank = args.nr * args.gpus + gpu
    rank = gpu
    dist.init_process_group(backend='gloo', init_method='env://', world_size=args.world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    #torch.cuda.set_device(gpu)
    #model.cuda(gpu)
    batch_size = args.bsize
    # define loss function (criterion) and optimizer
    #criterion = nn.CrossEntropyLoss().cuda(gpu)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    logging.info("nn.parallel.DistributedDataParallel")
    #model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    model = nn.parallel.DistributedDataParallel(model)
    # Data loading code
    images = torch.randn(batch_size, 1, 256, 256)
    labels = torch.randn(batch_size, 10)
    start = datetime.now()
    for epoch in range(args.epochs):
        #train_sampler.set_epoch(epoch)  # shuffle
        #logging.info("calnum begin")
        #print('enumerate begin',flush=True)
        logging.info("load_begin_0")
        #for i, (images, labels) in enumerate(train_loader):
        # load
        if epoch > 1: break
        #images = images.cuda(non_blocking=True)
        #labels = labels.cuda(non_blocking=True)
        logging.info("load_over_%d", epoch)
        # Forward
        logging.info("Forwad_begin_%d", epoch)
        outputs = model(images)
        loss = criterion(outputs, labels)
        logging.info("Forwad_over_%d", epoch)
        # Backward and optimize
        logging.info("Backward_begin_%d", epoch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        logging.info("Backward_over_%d", epoch)
        logging.info("load_begin_%d", epoch + 1)
        if (epoch + 1) % 100 == 0 and gpu == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, epoch + 1, total_step, loss.item()))
    #print('enumerate over',flush=True)
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
    #print('calnum over',flush=True)

if __name__ == '__main__':
    logging.info("main_begin")
    main()
It works fine without uftrace.
$ python conv2.py
INFO:root:main_begin
INFO:root:main args
INFO:torch.distributed.distributed_c10d:Added key: store_based_barrier_key:1 to store for rank: 0
INFO:torch.distributed.distributed_c10d:Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
INFO:root:nn.parallel.DistributedDataParallel
INFO:root:load_begin_0
INFO:root:load_over_0
INFO:root:Forwad_begin_0
INFO:root:Forwad_over_0
INFO:root:Backward_begin_0
INFO:root:Backward_over_0
INFO:root:load_begin_1
Training complete in: 0:00:04.732139
INFO:root:main spawn
But it fails when running with uftrace as follows.
$ uftrace record -P . python conv2d.py
WARN: process crashed by signal 11: Segmentation fault (si_code: 128)
WARN: if this happens only with uftrace, please consider -e/--estimate-return option.
WARN: Backtrace from uftrace v0.16 ( x86_64 dwarf python3 luajit tui perf sched dynamic )
WARN: =====================================
WARN: [34] (call_unbound_noarg[55e3876b6485] <= slot_tp_repr[55e387785095])
WARN: [33] (lookup_maybe_method.cold.1789[55e3876211f3] <= PyId___repr__.15069[55e3878d70a0])
WARN: [32] (lookup_maybe_method[55e3876e8135] <= slot_tp_repr[55e38778507a])
WARN: [31] (<7fc1baa3bdce>[7fc1baa3bdce] <= <7fc1baa41335>[7fc1baa41335])
WARN: [30] (<7fc1baa41188>[7fc1baa41188] <= <7fc1bb1762ab>[7fc1bb1762ab])
WARN: [29] (<7fc1bb1761a5>[7fc1bb1761a5] <= <7fc1bb16e518>[7fc1bb16e518])
WARN: [28] (<7fc1bb16e497>[7fc1bb16e497] <= <7fc1bb161e8d>[7fc1bb161e8d])
WARN: [27] (<7fc1bb161dbb>[7fc1bb161dbb] <= <7fc1bb15de94>[7fc1bb15de94])
WARN: [26] (<7fc1bb15d80d>[7fc1bb15d80d] <= <7fc1baf7663d>[7fc1baf7663d])
WARN: [25] (<7fc1baf73b5c>[7fc1baf73b5c] <= <7fc1bae05221>[7fc1bae05221])
WARN: [24] (<7fc1bae04eea>[7fc1bae04eea] <= <7fc1bd7ba118>[7fc1bd7ba118])
WARN: [23] (<7fc1bd7ba113>[7fc1bd7ba113] <= _PyImport_LoadDynamicModuleWithSpec[55e3877c86da])
WARN: [22] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [21] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [20] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [19] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [18] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [17] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [16] (_PyObject_CallMethodIdObjArgs[55e3876f7615] <= PyImport_ImportModuleLevelObject[55e3876a85e6])
WARN: [15] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [14] (PyEval_EvalCode[55e387798f45] <= builtin_exec[55e3877d1f33])
WARN: [13] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [12] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [11] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [10] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [9] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [8] (_PyObject_CallMethodIdObjArgs[55e3876f7615] <= PyImport_ImportModuleLevelObject[55e3876a85e6])
WARN: [7] (builtin___import__[55e387744315] <= PyCFunction_Call[55e3876f9c96])
WARN: [6] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [5] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [4] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [3] (_PyObject_CallMethodIdObjArgs[55e3876f7615] <= PyImport_ImportModuleLevelObject[55e3876a85e6])
WARN: [2] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [1] (PyEval_EvalCode[55e387798f45] <= run_eval_code_obj[55e387799004])
WARN: [0] (Py_BytesMain[55e3877d1b95] <= __libc_start_main[7fc1c029b493])
Please report this bug to https://github.com/namhyung/uftrace/issues.
WARN: child terminated by signal: 11: Segmentation fault
+1
Can you add a shebang to your python script?
#!/usr/bin/env python3
chmod +x conv2d.py
then
uftrace record conv2d.py
I have added the shebang to my python script, but it still fails:
$ uftrace record -P . ./conv2d.py
WARN: Segmentation fault: address not mapped (addr: (nil))
WARN: if this happens only with uftrace, please consider -e/--estimate-return option.
WARN: Backtrace from uftrace v0.16 ( x86_64 dwarf python3 luajit tui perf sched dynamic )
WARN: =====================================
WARN: [10] (call_trampoline[557f2695897c] <= profile_trampoline[557f26958ab7])
WARN: [9] (call_trace_protected[557f26909514] <= _PyEval_EvalFrameDefault.cold.2787[557f2699c00c])
WARN: [8] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [7] (PyEval_EvalCode[557f26ab5f45] <= run_eval_code_obj[557f26ab6004])
WARN: [6] (PyRun_StringFlags[557f26aee575] <= builtin_exec[557f26aeeffc])
WARN: [5] (trace_call_function[557f2691c0ac] <= _PyEval_EvalFrameDefault.cold.2787[557f2699c417])
WARN: [4] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [3] (PyEval_EvalCode[557f26ab5f45] <= builtin_exec[557f26aeef33])
WARN: [2] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [1] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [0] (Py_BytesMain[557f26aeeb95] <= __libc_start_main[7f1166c1a493])
Please report this bug to https://github.com/namhyung/uftrace/issues.
WARN: child terminated by signal: 11: Segmentation fault
You should remove -P . in this case. Do you want to trace Python functions, or the python interpreter itself, which is written in C?
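A quick sketch of the two modes being contrasted here, using only the commands that appear in this thread (what each one records is my reading of the advice above, not verified output):

uftrace record ./conv2d.py              # shebang script, no -P: record at the Python level
uftrace record -P . python conv2d.py    # dynamically patch the C functions of the python interpreter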
I want to trace the C/C++ interface functions in pytorch
Then it will need more fine-tuned arguments. Running python conv2d.py mostly shows the python interpreter, so you should hide those functions; it will eventually call into some native functions of pytorch. But -P . is only applied to the python interpreter, so you won't be able to see the native functions of pytorch. First, you need to find which binary file contains the pytorch native functions.
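One way to locate those binaries is from python itself; a minimal sketch (the lib/ layout and file names below are assumptions that depend on how pytorch was installed):

import os
import torch
import torch._C

# the compiled extension module that bridges python into libtorch
print(torch._C.__file__)

# the shared objects with the native C/C++ implementation, e.g. libtorch_cpu.so
lib_dir = os.path.join(os.path.dirname(torch.__file__), "lib")
for name in sorted(os.listdir(lib_dir)):
    if name.endswith(".so"):
        print(os.path.join(lib_dir, name))

Those shared objects are where pytorch's native functions live, as opposed to the python binary that -P . patches.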
When I added the "-e" option, I was able to trace the function interface of pytorch, although I couldn't record the exit times.
$ uftrace record -e -P . python conv2d.py
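If the record step succeeds with -e, the data can then be inspected with uftrace's usual subcommands; for example (the output for this particular program is not shown here):

$ uftrace report            # time spent per function
$ uftrace replay -t 5us     # replay the trace, hiding functions shorter than 5us
$ uftrace graph             # print the function call graph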