Local app crashes when a sweep is passed in: ValueError: "TensorboardConfig" object has no field "desired_state"
edenlightning opened this issue · 0 comments
edenlightning commented
Steps to reproduce
**#train.py**
import os
import torch
import torch.nn.functional as F
from torchvision.datasets import CIFAR10
from torchvision import transforms
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from torch.utils.data import random_split
from pytorch_lightning.metrics.functional import accuracy
class LitModel(pl.LightningModule):
    """Two-layer fully connected classifier trained with an NLL loss.

    The network flattens each input sample, applies a 3*32*32 -> 128 linear
    layer with ReLU, then a 128 -> 10 linear layer followed by log-softmax.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        batch_size: Recorded as a hyperparameter only; the model itself
            does not read it.
    """

    def __init__(self, lr: float = 0.0001, batch_size: int = 32):
        super().__init__()
        # Captures the constructor arguments so they are reachable through
        # ``self.hparams`` (``configure_optimizers`` reads ``hparams.lr``).
        self.save_hyperparameters()
        self.layer_1 = torch.nn.Linear(3 * 32 * 32, 128)
        self.layer_2 = torch.nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of inputs."""
        # Flatten everything except the leading batch dimension.
        x = x.view(x.size(0), -1)
        x = self.layer_1(x)
        x = F.relu(x)
        x = self.layer_2(x)
        # Pass ``dim`` explicitly: omitting it relies on deprecated implicit
        # dimension inference and emits a warning. The tensor is 2-D here, so
        # the class axis is dim 1 (the same axis the implicit rule picked).
        x = F.log_softmax(x, dim=1)
        return x

    def configure_optimizers(self):
        """Build the Adam optimizer using the stored learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer

    def training_step(self, batch, batch_idx):
        """Compute and log the NLL loss and accuracy for one batch."""
        x, y = batch
        y_hat = self(x)
        loss = F.nll_loss(y_hat, y)
        self.log('train_loss', loss)
        # ``y_hat`` holds log-probabilities; ``exp`` turns them back into
        # probabilities before they are handed to ``accuracy``.
        self.log('train_acc', accuracy(y_hat.exp(), y), prog_bar=True)
        return loss
if __name__ == '__main__':
    from argparse import ArgumentParser

    # Command-line options controlling the run.
    parser = ArgumentParser()
    parser.add_argument('--gpus', type=int, default=None)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_epochs', type=int, default=10)
    parser.add_argument('--data_dir', type=str, default=os.getcwd())
    args = parser.parse_args()

    # CIFAR10 is requested with download=True, targeting ``data_dir``.
    dataset = CIFAR10(args.data_dir, download=True, transform=transforms.ToTensor())
    train_loader = DataLoader(dataset, batch_size=args.batch_size)

    # init model
    # Forward batch_size as well, so the hyperparameters the model records
    # match the batch size actually used by the DataLoader instead of
    # always reporting the constructor default.
    model = LitModel(lr=args.lr, batch_size=args.batch_size)

    # most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
    trainer = pl.Trainer(gpus=args.gpus, max_epochs=args.max_epochs)
    trainer.fit(model, train_loader)
**#requirements.txt**
torch==1.7.1
pytorch-lightning==1.1.2
torchvision==0.8.2
Command
lightning run sweep pl_cifar10.py --requirements=requirements.txt
Actual behaviour
App is failing.
I tried to restart the app, but it fails with the same error every time.
Error
INFO: Received SIGTERM signal. Gracefully terminating sweep_controller.r.edenafek-4cfc91d6.w_0.ws.0...
INFO: Your Lightning App is being stopped. This won't take long.
INFO: Your Lightning App has been stopped successfully!
Traceback (most recent call last):
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/runpy.py", line 197, in _run_module_as_main
INFO: Received SIGTERM signal. Gracefully terminating tensorboard_controller.r.edenafek-4cfc91d6...
INFO: Received SIGTERM signal. Gracefully terminating sweep_controller.r.edenafek-4cfc91d6.w_0.ws.0...
return _run_code(code, main_globals, None,
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/Users/edenafek/studio4/lightning/src/lightning/__main__.py", line 4, in <module>
main()
File "/Users/edenafek/studio4/lightning/src/lightning_app/cli/lightning_cli.py", line 74, in main
_main()
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1657, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1657, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/Users/edenafek/studio4/lightning/src/lightning_app/cli/lightning_cli.py", line 406, in run_app
_run_app(file, cloud, cluster_id, without_server, no_cache, name, blocking, open_ui, env, secret)
File "/Users/edenafek/studio4/lightning/src/lightning_app/cli/lightning_cli.py", line 344, in _run_app
dispatch(
File "/Users/edenafek/studio4/lightning/src/lightning_app/runners/runtime.py", line 76, in dispatch
return runtime.dispatch(on_before_run=on_before_run, name=name, no_cache=no_cache, cluster_id=cluster_id)
File "/Users/edenafek/studio4/lightning/src/lightning_app/runners/multiprocess.py", line 103, in dispatch
self.app._run()
File "/Users/edenafek/studio4/lightning/src/lightning_app/core/app.py", line 451, in _run
done = self.run_once()
File "/Users/edenafek/studio4/lightning/src/lightning_app/core/app.py", line 400, in run_once
self.root.run()
File "/Users/edenafek/studio4/lightning-hpo/lightning_hpo/app/main.py", line 58, in run
self.sweep_controller.run(self.db.db_url)
File "/Users/edenafek/studio4/lightning-hpo/lightning_hpo/controllers/controller.py", line 57, in run
self.on_reconcile_start(db_configs)
File "/Users/edenafek/studio4/lightning-hpo/lightning_hpo/controllers/sweep.py", line 44, in on_reconcile_start
tensorboard.desired_state = Stage.STOPPED
File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/sqlmodel/main.py", line 529, in __setattr__
super().__setattr__(name, value)
File "pydantic/main.py", line 358, in pydantic.main.BaseModel.__setattr__
ValueError: "TensorboardConfig" object has no field "desired_state"