Lightning-Universe/Training-Studio_app

Local app crashes when a sweep is passed in: ValueError: "TensorboardConfig" object has no field "desired_state"

edenlightning opened this issue · 0 comments

Steps to reproduce

**#train.py**
import os
import torch
import torch.nn.functional as F
from torchvision.datasets import CIFAR10
from torchvision import transforms
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from torch.utils.data import random_split
from pytorch_lightning.metrics.functional import accuracy

class LitModel(pl.LightningModule):
    """Two-layer fully connected classifier used by the reproduction script.

    Each input is flattened to ``3 * 32 * 32`` features, passed through a
    128-unit hidden layer with ReLU, and mapped to 10 log-probabilities.
    """

    def __init__(self, lr: float = 0.0001, batch_size: int = 32):
        """Record the hyperparameters and create the two linear layers.

        Args:
            lr: Learning rate handed to Adam in ``configure_optimizers``.
            batch_size: Not used by the model itself; it is only recorded
                through ``save_hyperparameters()``.
        """
        super().__init__()
        self.save_hyperparameters()
        self.layer_1 = torch.nn.Linear(3 * 32 * 32, 128)
        self.layer_2 = torch.nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of inputs."""
        # Flatten everything except the leading dimension.
        x = x.view(x.size(0), -1)
        x = self.layer_1(x)
        x = F.relu(x)
        x = self.layer_2(x)
        # Pass `dim` explicitly: calling log_softmax without it is deprecated
        # and emits a warning on every forward pass. After the flatten above
        # `x` is 2-D, so dim=1 normalizes over the 10 outputs of layer_2.
        x = F.log_softmax(x, dim=1)
        return x

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``hparams.lr``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer

    def training_step(self, batch, batch_idx):
        """Compute the NLL loss for one batch and log loss and accuracy.

        Args:
            batch: An ``(x, y)`` pair of inputs and targets.
            batch_idx: Index of the batch; unused here.

        Returns:
            The negative log-likelihood loss for the batch.
        """
        x, y = batch
        y_hat = self(x)
        loss = F.nll_loss(y_hat, y)
        self.log('train_loss', loss)
        # forward() returns log-probabilities; exp() turns them back into
        # probabilities before they are passed to the accuracy metric.
        self.log('train_acc', accuracy(y_hat.exp(), y), prog_bar=True)
        return loss

if __name__ == '__main__':
    from argparse import ArgumentParser

    # Command-line options for the training run.
    parser = ArgumentParser()
    parser.add_argument('--gpus', type=int, default=None)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_epochs', type=int, default=10)
    parser.add_argument('--data_dir', type=str, default=os.getcwd())
    args = parser.parse_args()

    # CIFAR10 is downloaded into --data_dir if it is not already there.
    dataset = CIFAR10(args.data_dir, download=True, transform=transforms.ToTensor())
    train_loader = DataLoader(dataset, batch_size=args.batch_size)

    # init model
    # batch_size is forwarded so the hyperparameters recorded by the model
    # match the batch size the DataLoader actually uses; previously the model
    # always recorded its default of 32 regardless of --batch_size.
    model = LitModel(lr=args.lr, batch_size=args.batch_size)

    # most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
    trainer = pl.Trainer(gpus=args.gpus, max_epochs=args.max_epochs)
    trainer.fit(model, train_loader)
**#requirements.txt**
torch==1.7.1
pytorch-lightning==1.1.2
torchvision==0.8.2

Command

lightning run sweep pl_cifar10.py --requirements=requirements.txt

Actual behaviour

App is failing.

I tried to restart the app, but it fails with the same error every time.

Error

INFO: Received SIGTERM signal. Gracefully terminating sweep_controller.r.edenafek-4cfc91d6.w_0.ws.0...
INFO: Your Lightning App is being stopped. This won't take long.
INFO: Your Lightning App has been stopped successfully!
Traceback (most recent call last):
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/runpy.py", line 197, in _run_module_as_main
INFO: Received SIGTERM signal. Gracefully terminating tensorboard_controller.r.edenafek-4cfc91d6...
INFO: Received SIGTERM signal. Gracefully terminating sweep_controller.r.edenafek-4cfc91d6.w_0.ws.0...
    return _run_code(code, main_globals, None,
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/edenafek/studio4/lightning/src/lightning/__main__.py", line 4, in <module>
    main()
  File "/Users/edenafek/studio4/lightning/src/lightning_app/cli/lightning_cli.py", line 74, in main
    _main()
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1055, in main
    rv = self.invoke(ctx)
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 1404, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/click/core.py", line 760, in invoke
    return __callback(*args, **kwargs)
  File "/Users/edenafek/studio4/lightning/src/lightning_app/cli/lightning_cli.py", line 406, in run_app
    _run_app(file, cloud, cluster_id, without_server, no_cache, name, blocking, open_ui, env, secret)
  File "/Users/edenafek/studio4/lightning/src/lightning_app/cli/lightning_cli.py", line 344, in _run_app
    dispatch(
  File "/Users/edenafek/studio4/lightning/src/lightning_app/runners/runtime.py", line 76, in dispatch
    return runtime.dispatch(on_before_run=on_before_run, name=name, no_cache=no_cache, cluster_id=cluster_id)
  File "/Users/edenafek/studio4/lightning/src/lightning_app/runners/multiprocess.py", line 103, in dispatch
    self.app._run()
  File "/Users/edenafek/studio4/lightning/src/lightning_app/core/app.py", line 451, in _run
    done = self.run_once()
  File "/Users/edenafek/studio4/lightning/src/lightning_app/core/app.py", line 400, in run_once
    self.root.run()
  File "/Users/edenafek/studio4/lightning-hpo/lightning_hpo/app/main.py", line 58, in run
    self.sweep_controller.run(self.db.db_url)
  File "/Users/edenafek/studio4/lightning-hpo/lightning_hpo/controllers/controller.py", line 57, in run
    self.on_reconcile_start(db_configs)
  File "/Users/edenafek/studio4/lightning-hpo/lightning_hpo/controllers/sweep.py", line 44, in on_reconcile_start
    tensorboard.desired_state = Stage.STOPPED
  File "/Users/edenafek/opt/anaconda3/envs/studio4/lib/python3.9/site-packages/sqlmodel/main.py", line 529, in __setattr__
    super().__setattr__(name, value)
  File "pydantic/main.py", line 358, in pydantic.main.BaseModel.__setattr__
ValueError: "TensorboardConfig" object has no field "desired_state"