Out of memory when saving object
isaacgerg opened this issue · 9 comments
def save_object(self, data, name):
"""Saves given object to the object backup directory.
Args:
data: object that needs to be saved.
name: string value representing the name of the object.
"""
with open(str(self.current_path / Storage.DIR["OBJECT"] / name), 'wb') as f:
pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
This yields a MemoryError when trying to pickle the object. It takes up at least 20GB of RAM. My model is fairly small. See the log excerpt below:
-------------------------------NEW BEST ANT FOUND-------------------------------
---------------------------BEST ANT DURING ITERATION----------------------------
=======
Ant: 0x5eaa4358
Loss: 0.246397
Accuracy: 0.908695
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(kernel_size:3, filter_count:64, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 0f8bcb2b4fbd88fb3c5ffca7be08add054c8d35df1f122f4db0f043a91f7c901
=======
Hi, can you provide the following details:
- How long had the search been running when the MemoryError was triggered?
- What is the size of your backup file (located inside saves/date-of-your-run/objects/backup)?
- Training sample code (no need for .yaml).
Thank you in advance.
1. About 3 hours.
2. The backup file is 0 bytes.
3. Training sample code:
dataset = Dataset(training_examples=x_train, training_labels=y_train, testing_examples=x_test, testing_labels=y_test)
backend = TFKerasBackend(dataset=dataset, optimizer=tf.keras.optimizers.Adam(1e-4))
deepswarm = DeepSwarm(backend=backend)
topology = deepswarm.find_topology()
trained_topology = deepswarm.train_topology(topology, 50)
EDIT 1:
I tried to pickle using a different protocol than what is specified and got the following:
pickle.dump(data, f)
Traceback (most recent call last):
Debug Probe, prompt 65, line 1
builtins.OverflowError: cannot serialize a bytes object larger than 4 GiB
Okay, I will try to run your sample code for a longer time and see if I can reproduce it.
Also, could you copy and paste the exact error you got the first time? And can you check what the backup file size is after DeepSwarm finishes the first iteration (i.e. when the depth is increased for the first time)?
Error:
File "D:\ASASINATR\projects\muscle_deep_swarm\trainer.py", line 332, in
main()
File "D:\ASASINATR\projects\muscle_deep_swarm\trainer.py", line 325, in main
topology = deepswarm.find_topology()
File "c:\python35\Lib\site-packages\deepswarm\deepswarm.py", line 43, in find_topology
best_ant = self.aco.search()
File "c:\python35\Lib\site-packages\deepswarm\aco.py", line 61, in search
self.storage.perform_backup()
File "c:\python35\Lib\site-packages\deepswarm\storage.py", line 67, in perform_backup
self.save_object(self.deepswarm, Storage.ITEM["BACKUP"])
File "c:\python35\Lib\site-packages\deepswarm\storage.py", line 213, in save_object
pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
builtins.MemoryError:
The backup file is 0 bytes in length.
Below is the deepswarm log.
================================================================================
2019-05-23 09:31:27,241
DeepSwarm settings
================================================================================
================================================================================
2019-05-23 09:31:27,255
{
"DeepSwarm": {
"aco": {
"ant_count": 16,
"greediness": 0.5,
"pheromone": {
"decay": 0.1,
"evaporation": 0.1,
"start": 0.1,
"verbose": false
}
},
"backend": {
"batch_size": 16,
"epochs": 15,
"loss": "binary_crossentropy",
"patience": 5,
"verbose": true
},
"flat_nodes": [
"FlattenNode",
"DenseNode",
"DropoutFlatNode",
"BatchNormalizationFlatNode"
],
"max_depth": 15,
"metrics": "accuracy",
"reuse_patience": 1,
"save_folder": null,
"spatial_nodes": [
"InputNode",
"Conv2DNode",
"DropoutSpatialNode",
"BatchNormalizationNode",
"Pool2DNode"
]
},
"Nodes": {
"BatchNormalizationFlatNode": {
"attributes": {},
"transitions": {
"DenseNode": 1.1,
"DropoutFlatNode": 1.1,
"OutputNode": 0.9
},
"type": "BatchNormalization"
},
"BatchNormalizationNode": {
"attributes": {},
"transitions": {
"Conv2DNode": 1.1,
"DropoutSpatialNode": 1.0,
"FlattenNode": 1.0,
"Pool2DNode": 1.1
},
"type": "BatchNormalization"
},
"Conv2DNode": {
"attributes": {
"activation": [
"ReLU"
],
"filter_count": [
8,
16,
32
],
"kernel_size": [
1,
3,
5
]
},
"transitions": {
"BatchNormalizationNode": 1.2,
"Conv2DNode": 0.8,
"DropoutSpatialNode": 1.1,
"FlattenNode": 1.0,
"Pool2DNode": 1.2
},
"type": "Conv2D"
},
"DenseNode": {
"attributes": {
"activation": [
"ReLU"
],
"output_size": [
16,
32
]
},
"transitions": {
"BatchNormalizationFlatNode": 1.2,
"DenseNode": 0.8,
"DropoutFlatNode": 1.2,
"OutputNode": 1.0
},
"type": "Dense"
},
"DropoutFlatNode": {
"attributes": {
"rate": [
0.1,
0.3
]
},
"transitions": {
"BatchNormalizationFlatNode": 1.0,
"DenseNode": 1.0,
"OutputNode": 0.9
},
"type": "Dropout"
},
"DropoutSpatialNode": {
"attributes": {
"rate": [
0.1,
0.3
]
},
"transitions": {
"BatchNormalizationNode": 1.1,
"Conv2DNode": 1.1,
"FlattenNode": 1.0,
"Pool2DNode": 1.0
},
"type": "Dropout"
},
"FlattenNode": {
"attributes": {},
"transitions": {
"BatchNormalizationFlatNode": 0.9,
"DenseNode": 1.0,
"OutputNode": 0.8
},
"type": "Flatten"
},
"InputNode": {
"attributes": {
"shape": [
[
256,
256,
1
]
]
},
"transitions": {
"Conv2DNode": 1.0
},
"type": "Input"
},
"OutputNode": {
"attributes": {
"activation": [
"Sigmoid"
],
"output_size": [
1
]
},
"transitions": {},
"type": "Output"
},
"Pool2DNode": {
"attributes": {
"pool_size": [
2
],
"pool_type": [
"max",
"average"
],
"stride": [
2
]
},
"transitions": {
"BatchNormalizationNode": 1.1,
"Conv2DNode": 1.1,
"FlattenNode": 1.0
},
"type": "Pool2D"
}
},
"script": "trainer.py",
"settings_file": "D:\\ASASINATR\\projects\\muscle_deep_swarm\\settings\\default.yaml"
}
================================================================================
================================================================================
2019-05-23 09:31:27,271
STARTING ACO SEARCH
================================================================================
================================================================================
2019-05-23 09:41:06,909
=======
Ant: 0x5f572780
Loss: 0.305501
Accuracy: 0.882046
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:8, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 532b17629541338340324c7ec983eb77475eb467271fd91d8291660c183005ea
=======
================================================================================
================================================================================
2019-05-23 09:41:06,910
Current search depth is 1
================================================================================
================================================================================
2019-05-23 09:41:06,910
GENERATING ANT 1
================================================================================
================================================================================
2019-05-23 09:53:35,652
=======
Ant: 0x20a63cf8
Loss: 0.408496
Accuracy: 0.842963
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:16, kernel_size:1, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 3995d2461ebc14187f5f015afe474268c7771934e4538a04c102d548c9243f5b
=======
================================================================================
================================================================================
2019-05-23 09:53:35,653
GENERATING ANT 2
================================================================================
================================================================================
2019-05-23 10:05:39,989
=======
Ant: 0x5e720278
Loss: 0.357701
Accuracy: 0.859789
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: cafa59afca2fd87d4ac51b447e574794e12392b7a5afa76742346a132876af5c
=======
================================================================================
================================================================================
2019-05-23 10:05:39,990
GENERATING ANT 3
================================================================================
================================================================================
2019-05-23 10:14:13,083
=======
Ant: 0x5e32b898
Loss: 0.321589
Accuracy: 0.875414
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:16, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 4fcaee9fa268d198d84263ff68179aafc2b5a2f1b7968b072d0f82957038b4f2
=======
================================================================================
================================================================================
2019-05-23 10:14:13,102
GENERATING ANT 4
================================================================================
================================================================================
2019-05-23 10:24:43,887
=======
Ant: 0x5ec09c88
Loss: 0.308654
Accuracy: 0.883372
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:16, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 195ff809cbe26546c021a508bcd3b0480b6a06eef04ed73f7a9677c377f8917e
=======
================================================================================
================================================================================
2019-05-23 10:24:43,888
GENERATING ANT 5
================================================================================
================================================================================
2019-05-23 10:39:11,747
=======
Ant: 0x5e7eda90
Loss: 0.373283
Accuracy: 0.842299
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: b4261d53b59582b4629b353d857588506dacb9ba7bdf0725b550b09287c06261
=======
================================================================================
================================================================================
2019-05-23 10:39:11,748
GENERATING ANT 6
================================================================================
================================================================================
2019-05-23 10:49:43,374
=======
Ant: 0x5ea1ef98
Loss: 0.397807
Accuracy: 0.835502
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: b4261d53b59582b4629b353d857588506dacb9ba7bdf0725b550b09287c06261
=======
================================================================================
================================================================================
2019-05-23 10:49:43,375
GENERATING ANT 7
================================================================================
================================================================================
2019-05-23 11:03:10,789
=======
Ant: 0x5ec14e80
Loss: 0.464672
Accuracy: 0.808811
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:8, kernel_size:1, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: b12eeecf14a6e2f481cf0eef0b033ec9ca6d85b6ac8e80bc763ee786926a53be
=======
================================================================================
================================================================================
2019-05-23 11:03:10,791
GENERATING ANT 8
================================================================================
================================================================================
2019-05-23 11:13:44,985
=======
Ant: 0x5e2a70f0
Loss: 0.300659
Accuracy: 0.883455
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: cafa59afca2fd87d4ac51b447e574794e12392b7a5afa76742346a132876af5c
=======
================================================================================
================================================================================
2019-05-23 11:13:44,986
GENERATING ANT 9
================================================================================
================================================================================
2019-05-23 11:21:00,680
=======
Ant: 0x5eb450f0
Loss: 0.354258
Accuracy: 0.859126
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:16, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 4fcaee9fa268d198d84263ff68179aafc2b5a2f1b7968b072d0f82957038b4f2
=======
================================================================================
================================================================================
2019-05-23 11:21:00,681
GENERATING ANT 10
================================================================================
================================================================================
2019-05-23 11:28:17,632
=======
Ant: 0x5ebcceb8
Loss: 0.372203
Accuracy: 0.846734
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:8, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: a76bc52f0128ffbff1552656451d8f0f4e4044f0be1f506b8355a1b7d86da492
=======
================================================================================
================================================================================
2019-05-23 11:28:17,632
GENERATING ANT 11
================================================================================
================================================================================
2019-05-23 11:36:56,947
=======
Ant: 0x62b1b128
Loss: 0.411597
Accuracy: 0.830860
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:16, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 4fcaee9fa268d198d84263ff68179aafc2b5a2f1b7968b072d0f82957038b4f2
=======
================================================================================
================================================================================
2019-05-23 11:36:56,948
GENERATING ANT 12
================================================================================
================================================================================
2019-05-23 11:44:32,906
=======
Ant: 0x655e0a58
Loss: 0.304148
Accuracy: 0.884160
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:16, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 195ff809cbe26546c021a508bcd3b0480b6a06eef04ed73f7a9677c377f8917e
=======
================================================================================
================================================================================
2019-05-23 11:44:32,907
GENERATING ANT 13
================================================================================
================================================================================
2019-05-23 12:07:59,288
=======
Ant: 0x5fa917b8
Loss: 0.406788
Accuracy: 0.842548
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:1, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 12d4b09dfe8bd8e5f21cb0371d61d2c5ae6348c5e4bb914d6680dd974a8fe56f
=======
================================================================================
================================================================================
2019-05-23 12:07:59,289
GENERATING ANT 14
================================================================================
================================================================================
2019-05-23 12:20:04,685
=======
Ant: 0x5e277cc0
Loss: 0.396931
Accuracy: 0.845615
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:1, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: 12d4b09dfe8bd8e5f21cb0371d61d2c5ae6348c5e4bb914d6680dd974a8fe56f
=======
================================================================================
================================================================================
2019-05-23 12:20:04,686
GENERATING ANT 15
================================================================================
================================================================================
2019-05-23 12:30:25,658
=======
Ant: 0x5e35fb00
Loss: 0.295127
Accuracy: 0.885859
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: cafa59afca2fd87d4ac51b447e574794e12392b7a5afa76742346a132876af5c
=======
================================================================================
================================================================================
2019-05-23 12:30:25,659
GENERATING ANT 16
================================================================================
================================================================================
2019-05-23 12:40:50,392
=======
Ant: 0x5f8ac208
Loss: 0.294060
Accuracy: 0.882253
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:5, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: b4261d53b59582b4629b353d857588506dacb9ba7bdf0725b550b09287c06261
=======
================================================================================
================================================================================
2019-05-23 12:40:50,393
NEW BEST ANT FOUND
================================================================================
================================================================================
2019-05-23 12:40:50,393
BEST ANT DURING ITERATION
================================================================================
================================================================================
2019-05-23 12:40:50,393
=======
Ant: 0x5e35fb00
Loss: 0.295127
Accuracy: 0.885859
Path: InputNode(shape:(256, 256, 1)) -> Conv2DNode(filter_count:32, kernel_size:3, activation:ReLU) -> FlattenNode() -> OutputNode(output_size:1, activation:Sigmoid)
Hash: cafa59afca2fd87d4ac51b447e574794e12392b7a5afa76742346a132876af5c
=======
================================================================================
I just ran the test on my local machine and couldn't reproduce the error. I am currently running the test on Colab (the depth was increased and it still didn't crash).
However, after reading some issues on pickle, I found that for some people the issue was resolved by replacing pickle with dill. So maybe you want to give it a try and see if it solves the problem for you? The steps that you would need to follow are:
1. Install dill:
   pip install dill
2. Go to storage.py:
   2.1. Replace `import pickle` with `import dill`
   2.2. Replace `pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)` with `dill.dump(data, f)`
   2.3. Replace `data = pickle.load(f)` with `data = dill.load(f)`
3. Run the code.
P.S. Also update to the newest DeepSwarm version (0.0.8)
Okay, I couldn't reproduce the crash, but I just created a new version which drastically reduces the backup file size (which should solve the crash for you). I will test this version on different machines and will update you soon.
Okay I just released a new version (0.0.9) which should solve your problem 👍
Everything appears to be working now. Thank you for the help.