diff --git a/.gitignore b/.gitignore index f43557192..df8737c1e 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,9 @@ dmypy.json .pyre/ .idea/ + +# data artifacts +logs/ +lightning_logs/ +cifar-10-batches-py +*.tar.gz diff --git a/course_UvA-DL/01-introduction-to-pytorch/.meta.yml b/course_UvA-DL/01-introduction-to-pytorch/.meta.yml index 7af544f9c..1e5b5b978 100644 --- a/course_UvA-DL/01-introduction-to-pytorch/.meta.yml +++ b/course_UvA-DL/01-introduction-to-pytorch/.meta.yml @@ -9,6 +9,7 @@ description: | The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. requirements: - matplotlib + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/course_UvA-DL/02-activation-functions/.meta.yml b/course_UvA-DL/02-activation-functions/.meta.yml index febc3faa3..8d6392ae3 100644 --- a/course_UvA-DL/02-activation-functions/.meta.yml +++ b/course_UvA-DL/02-activation-functions/.meta.yml @@ -14,6 +14,7 @@ requirements: - torchvision - matplotlib - seaborn + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/course_UvA-DL/03-initialization-and-optimization/.meta.yml b/course_UvA-DL/03-initialization-and-optimization/.meta.yml index 5f448da1c..dee86a006 100644 --- a/course_UvA-DL/03-initialization-and-optimization/.meta.yml +++ b/course_UvA-DL/03-initialization-and-optimization/.meta.yml @@ -18,6 +18,7 @@ requirements: - torchvision - matplotlib - seaborn + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/lightning_examples/augmentation_kornia/.meta.yml b/lightning_examples/augmentation_kornia/.meta.yml index 425344039..7f5c11cab 100644 --- a/lightning_examples/augmentation_kornia/.meta.yml +++ b/lightning_examples/augmentation_kornia/.meta.yml @@ -1,9 +1,9 @@ title: GPU and batched data augmentation with Kornia and PyTorch-Lightning author: PL/Kornia team created: 2021-06-11 -updated: 2021-06-16 +updated: 2023-03-15 license: CC BY-SA -build: 4 +build: 0 tags: - Image description: | @@ -12,12 +12,13 @@ description: | mode 
without additional effort. requirements: - kornia - - pytorch-lightning !=1.8.0, !=1.8.0.post1 # skip for PermissionError: [Errno 13] Permission denied: 'command' + - lightning - torchmetrics - torchvision - matplotlib - pandas - seaborn + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/lightning_examples/augmentation_kornia/augmentation.py b/lightning_examples/augmentation_kornia/augmentation.py index 5a3781e9a..46ab9690f 100644 --- a/lightning_examples/augmentation_kornia/augmentation.py +++ b/lightning_examples/augmentation_kornia/augmentation.py @@ -1,6 +1,7 @@ # %% import os +import lightning as L import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -9,12 +10,10 @@ import torch.nn as nn import torchmetrics import torchvision -from IPython.core.display import display +from IPython.display import display from kornia import image_to_tensor, tensor_to_image from kornia.augmentation import ColorJitter, RandomChannelShuffle, RandomHorizontalFlip, RandomThinPlateSpline -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.loggers import CSVLogger +from lightning.pytorch.loggers import CSVLogger from torch import Tensor from torch.nn import functional as F from torch.utils.data import DataLoader @@ -99,15 +98,15 @@ def forward(self, x) -> Tensor: # %% -class CoolSystem(LightningModule): +class CoolSystem(L.LightningModule): def __init__(self): super().__init__() # not the best model: expereiment yourself self.model = torchvision.models.resnet18(pretrained=True) self.preprocess = Preprocess() # per sample transforms self.transform = DataAugmentation() # per batch augmentation_kornia - self.train_accuracy = torchmetrics.Accuracy() - self.val_accuracy = torchmetrics.Accuracy() + self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=1000) + self.val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=1000) def 
forward(self, x): return self.model(x) @@ -186,10 +185,9 @@ def val_dataloader(self): # %% # Initialize a trainer -trainer = Trainer( - callbacks=[TQDMProgressBar(refresh_rate=20)], +trainer = L.Trainer( accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, max_epochs=10, logger=CSVLogger(save_dir="logs/"), ) diff --git a/lightning_examples/barlow-twins/.meta.yml b/lightning_examples/barlow-twins/.meta.yml index 54ad115c5..9a2227b89 100644 --- a/lightning_examples/barlow-twins/.meta.yml +++ b/lightning_examples/barlow-twins/.meta.yml @@ -1,9 +1,9 @@ title: Barlow Twins Tutorial -author: Ananya Harsh Jha (ananya@pytorchlightning.ai) +author: Ananya Harsh Jha created: 2021-09-19 -updated: 2021-09-20 +updated: 2023-03-15 license: CC BY-SA -build: 1 +build: 0 tags: - Image - Self-Supervised @@ -17,6 +17,7 @@ description: | requirements: - torchvision - matplotlib + - lightning>=2.0.0rc0 accelerator: - GPU - CPU diff --git a/lightning_examples/barlow-twins/barlow_twins.py b/lightning_examples/barlow-twins/barlow_twins.py index 87e262e5d..85a31320f 100644 --- a/lightning_examples/barlow-twins/barlow_twins.py +++ b/lightning_examples/barlow-twins/barlow_twins.py @@ -7,16 +7,15 @@ from functools import partial from typing import Sequence, Tuple, Union +import lightning as L import matplotlib.pyplot as plt import numpy as np -import pytorch_lightning as pl import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as transforms import torchvision.transforms.functional as VisionF -from pytorch_lightning import Callback, LightningModule, Trainer -from pytorch_lightning.callbacks import ModelCheckpoint +from lightning.pytorch.callbacks import Callback, ModelCheckpoint from torch import Tensor from torch.utils.data import DataLoader from torchmetrics.functional import accuracy @@ -245,7 +244,7 @@ def linear_warmup_decay(warmup_steps): # We keep the LightningModule for Barlow Twins 
neat and simple. It takes in an backbone encoder and initializes the projection head and the loss function. We configure the optimizer and the learning rate scheduler in the ``configure_optimizers`` method. # %% -class BarlowTwins(LightningModule): +class BarlowTwins(L.LightningModule): def __init__( self, encoder, @@ -326,7 +325,7 @@ def __init__( self.encoder_output_dim = encoder_output_dim self.num_classes = num_classes - def on_fit_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: + def on_fit_start(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None: # add linear_eval layer and optimizer pl_module.online_finetuner = nn.Linear(self.encoder_output_dim, self.num_classes).to(pl_module.device) self.optimizer = torch.optim.Adam(pl_module.online_finetuner.parameters(), lr=1e-4) @@ -342,12 +341,11 @@ def extract_online_finetuning_view( def on_train_batch_end( self, - trainer: pl.Trainer, - pl_module: pl.LightningModule, + trainer: L.Trainer, + pl_module: L.LightningModule, outputs: Sequence, batch: Sequence, batch_idx: int, - dataloader_idx: int, ) -> None: x, y = self.extract_online_finetuning_view(batch, pl_module.device) @@ -362,18 +360,17 @@ def on_train_batch_end( self.optimizer.step() self.optimizer.zero_grad() - acc = accuracy(F.softmax(preds, dim=1), y) + acc = accuracy(F.softmax(preds, dim=1), y, task="multiclass", num_classes=10) pl_module.log("online_train_acc", acc, on_step=True, on_epoch=False) pl_module.log("online_train_loss", loss, on_step=True, on_epoch=False) def on_validation_batch_end( self, - trainer: pl.Trainer, - pl_module: pl.LightningModule, + trainer: L.Trainer, + pl_module: L.LightningModule, outputs: Sequence, batch: Sequence, batch_idx: int, - dataloader_idx: int, ) -> None: x, y = self.extract_online_finetuning_view(batch, pl_module.device) @@ -384,7 +381,7 @@ def on_validation_batch_end( preds = pl_module.online_finetuner(feats) loss = F.cross_entropy(preds, y) - acc = accuracy(F.softmax(preds, dim=1), 
y) + acc = accuracy(F.softmax(preds, dim=1), y, task="multiclass", num_classes=10) pl_module.log("online_val_acc", acc, on_step=False, on_epoch=True, sync_dist=True) pl_module.log("online_val_loss", loss, on_step=False, on_epoch=True, sync_dist=True) @@ -406,10 +403,10 @@ def on_validation_batch_end( online_finetuner = OnlineFineTuner(encoder_output_dim=encoder_out_dim, num_classes=10) checkpoint_callback = ModelCheckpoint(every_n_epochs=100, save_top_k=-1, save_last=True) -trainer = Trainer( +trainer = L.Trainer( max_epochs=max_epochs, accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, callbacks=[online_finetuner, checkpoint_callback], ) diff --git a/lightning_examples/basic-gan/.meta.yaml b/lightning_examples/basic-gan/.meta.yaml index 150256ce3..dca051778 100644 --- a/lightning_examples/basic-gan/.meta.yaml +++ b/lightning_examples/basic-gan/.meta.yaml @@ -1,9 +1,9 @@ title: PyTorch Lightning Basic GAN Tutorial author: PL team created: 2020-12-21 -updated: 2021-06-16 +updated: 2023-03-15 license: CC BY-SA -build: 5 +build: 0 tags: - Image description: | @@ -14,6 +14,7 @@ description: | 2. training_step does both the generator and discriminator training. 
requirements: - torchvision + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/lightning_examples/basic-gan/gan.py b/lightning_examples/basic-gan/gan.py index 1f2bd54a9..b8e38274a 100644 --- a/lightning_examples/basic-gan/gan.py +++ b/lightning_examples/basic-gan/gan.py @@ -1,14 +1,13 @@ # %% import os +import lightning as L import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torchvision import torchvision.transforms as transforms -from pytorch_lightning import LightningDataModule, LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar from torch.utils.data import DataLoader, random_split from torchvision.datasets import MNIST @@ -20,11 +19,11 @@ # ### MNIST DataModule # # Below, we define a DataModule for the MNIST Dataset. To learn more about DataModules, check out our tutorial -# on them or see the [latest release docs](https://pytorch-lightning.readthedocs.io/en/stable/data/datamodule.html). +# on them or see the [latest release docs](https://lightning.ai/docs/pytorch/stable/data/datamodule.html). 
# %% -class MNISTDataModule(LightningDataModule): +class MNISTDataModule(L.LightningDataModule): def __init__( self, data_dir: str = PATH_DATASETS, @@ -145,7 +144,7 @@ def forward(self, img): # %% -class GAN(LightningModule): +class GAN(L.LightningModule): def __init__( self, channels, @@ -160,6 +159,7 @@ def __init__( ): super().__init__() self.save_hyperparameters() + self.automatic_optimization = False # networks data_shape = (channels, width, height) @@ -176,54 +176,61 @@ def forward(self, z): def adversarial_loss(self, y_hat, y): return F.binary_cross_entropy(y_hat, y) - def training_step(self, batch, batch_idx, optimizer_idx): + def training_step(self, batch): imgs, _ = batch + optimizer_g, optimizer_d = self.optimizers() + # sample noise z = torch.randn(imgs.shape[0], self.hparams.latent_dim) z = z.type_as(imgs) # train generator - if optimizer_idx == 0: - - # generate images - self.generated_imgs = self(z) + # generate images + self.toggle_optimizer(optimizer_g) + self.generated_imgs = self(z) - # log sampled images - sample_imgs = self.generated_imgs[:6] - grid = torchvision.utils.make_grid(sample_imgs) - self.logger.experiment.add_image("generated_images", grid, 0) + # log sampled images + sample_imgs = self.generated_imgs[:6] + grid = torchvision.utils.make_grid(sample_imgs) + self.logger.experiment.add_image("generated_images", grid, 0) - # ground truth result (ie: all fake) - # put on GPU because we created this tensor inside training_loop - valid = torch.ones(imgs.size(0), 1) - valid = valid.type_as(imgs) + # ground truth result (ie: all fake) + # put on GPU because we created this tensor inside training_loop + valid = torch.ones(imgs.size(0), 1) + valid = valid.type_as(imgs) - # adversarial loss is binary cross-entropy - g_loss = self.adversarial_loss(self.discriminator(self(z)), valid) - self.log("g_loss", g_loss, prog_bar=True) - return g_loss + # adversarial loss is binary cross-entropy + g_loss = self.adversarial_loss(self.discriminator(self(z)), 
valid) + self.log("g_loss", g_loss, prog_bar=True) + self.manual_backward(g_loss) + optimizer_g.step() + optimizer_g.zero_grad() + self.untoggle_optimizer(optimizer_g) # train discriminator - if optimizer_idx == 1: - # Measure discriminator's ability to classify real from generated samples + # Measure discriminator's ability to classify real from generated samples + self.toggle_optimizer(optimizer_d) - # how well can it label as real? - valid = torch.ones(imgs.size(0), 1) - valid = valid.type_as(imgs) + # how well can it label as real? + valid = torch.ones(imgs.size(0), 1) + valid = valid.type_as(imgs) - real_loss = self.adversarial_loss(self.discriminator(imgs), valid) + real_loss = self.adversarial_loss(self.discriminator(imgs), valid) - # how well can it label as fake? - fake = torch.zeros(imgs.size(0), 1) - fake = fake.type_as(imgs) + # how well can it label as fake? + fake = torch.zeros(imgs.size(0), 1) + fake = fake.type_as(imgs) - fake_loss = self.adversarial_loss(self.discriminator(self(z).detach()), fake) + fake_loss = self.adversarial_loss(self.discriminator(self(z).detach()), fake) - # discriminator loss is the average of these - d_loss = (real_loss + fake_loss) / 2 - self.log("d_loss", d_loss, prog_bar=True) - return d_loss + # discriminator loss is the average of these + d_loss = (real_loss + fake_loss) / 2 + self.log("d_loss", d_loss, prog_bar=True) + self.manual_backward(d_loss) + optimizer_d.step() + optimizer_d.zero_grad() + self.untoggle_optimizer(optimizer_d) def configure_optimizers(self): lr = self.hparams.lr @@ -246,11 +253,10 @@ def on_validation_epoch_end(self): # %% dm = MNISTDataModule() model = GAN(*dm.dims) -trainer = Trainer( +trainer = L.Trainer( accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, max_epochs=5, - callbacks=[TQDMProgressBar(refresh_rate=20)], ) trainer.fit(model, dm) diff --git a/lightning_examples/cifar10-baseline/.meta.yml 
b/lightning_examples/cifar10-baseline/.meta.yml index a598df03d..686253164 100644 --- a/lightning_examples/cifar10-baseline/.meta.yml +++ b/lightning_examples/cifar10-baseline/.meta.yml @@ -1,17 +1,17 @@ title: PyTorch Lightning CIFAR10 ~94% Baseline Tutorial author: PL team created: 2020-12-21 -updated: 2021-06-16 +updated: 2023-03-15 license: CC BY-SA -build: 3 +build: 0 tags: - Image description: > Train a Resnet to 94% accuracy on Cifar10! requirements: - torchvision - - lightning-bolts - pandas - seaborn + - lightning>=2.0.0rc0 accelerator: - GPU diff --git a/lightning_examples/cifar10-baseline/baseline.py b/lightning_examples/cifar10-baseline/baseline.py index 95a060be0..9abc859b9 100644 --- a/lightning_examples/cifar10-baseline/baseline.py +++ b/lightning_examples/cifar10-baseline/baseline.py @@ -5,60 +5,91 @@ # %% import os +import lightning as L import pandas as pd import seaborn as sn import torch import torch.nn as nn import torch.nn.functional as F import torchvision -from IPython.core.display import display -from pl_bolts.datamodules import CIFAR10DataModule -from pl_bolts.transforms.dataset_normalizations import cifar10_normalization -from pytorch_lightning import LightningModule, Trainer, seed_everything -from pytorch_lightning.callbacks import LearningRateMonitor -from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.loggers import CSVLogger +from IPython.display import display +from lightning.pytorch.callbacks import LearningRateMonitor +from lightning.pytorch.loggers import CSVLogger from torch.optim.lr_scheduler import OneCycleLR from torch.optim.swa_utils import AveragedModel, update_bn +from torch.utils.data import DataLoader, random_split from torchmetrics.functional import accuracy +from torchvision.datasets import CIFAR10 -seed_everything(7) +L.seed_everything(7) PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") BATCH_SIZE = 256 if torch.cuda.is_available() else 64 NUM_WORKERS = int(os.cpu_count() / 2) # 
%% [markdown] -# ### CIFAR10 Data Module +# ### CIFAR10 DataLoaders # -# Import the existing data module from `bolts` and modify the train and test transforms. # %% +cifar10_normalization = torchvision.transforms.Normalize( + mean=[x / 255.0 for x in [125.3, 123.0, 113.9]], + std=[x / 255.0 for x in [63.0, 62.1, 66.7]], +) + + +def split_dataset(dataset, val_split=0.2, train=True): + """Splits the dataset into train and validation set.""" + len_dataset = len(dataset) + splits = get_splits(len_dataset, val_split) + dataset_train, dataset_val = random_split(dataset, splits, generator=torch.Generator().manual_seed(42)) + + if train: + return dataset_train + return dataset_val + + +def get_splits(len_dataset, val_split): + """Computes split lengths for train and validation set.""" + if isinstance(val_split, int): + train_len = len_dataset - val_split + splits = [train_len, val_split] + elif isinstance(val_split, float): + val_len = int(val_split * len_dataset) + train_len = len_dataset - val_len + splits = [train_len, val_len] + else: + raise ValueError(f"Unsupported type {type(val_split)}") + + return splits + + train_transforms = torchvision.transforms.Compose( [ torchvision.transforms.RandomCrop(32, padding=4), torchvision.transforms.RandomHorizontalFlip(), torchvision.transforms.ToTensor(), - cifar10_normalization(), + cifar10_normalization, ] ) - test_transforms = torchvision.transforms.Compose( [ torchvision.transforms.ToTensor(), - cifar10_normalization(), + cifar10_normalization, ] ) -cifar10_dm = CIFAR10DataModule( - data_dir=PATH_DATASETS, - batch_size=BATCH_SIZE, - num_workers=NUM_WORKERS, - train_transforms=train_transforms, - test_transforms=test_transforms, - val_transforms=test_transforms, -) +dataset_train = CIFAR10(PATH_DATASETS, train=True, download=True, transform=train_transforms) +dataset_val = CIFAR10(PATH_DATASETS, train=True, download=True, transform=test_transforms) +dataset_train = split_dataset(dataset_train) +dataset_val = 
split_dataset(dataset_val, train=False) +dataset_test = CIFAR10(PATH_DATASETS, train=False, download=True, transform=test_transforms) + +train_dataloader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS) +val_dataloader = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS) +test_dataloader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS) + # %% [markdown] # ### Resnet @@ -76,14 +107,14 @@ def create_model(): # %% [markdown] # ### Lightning Module -# Check out the [`configure_optimizers`](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#configure-optimizers) +# Check out the [`configure_optimizers`](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#configure-optimizers) # method to use custom Learning Rate schedulers. The OneCycleLR with SGD will get you to around 92-93% accuracy # in 20-30 epochs and 93-94% accuracy in 40-50 epochs. 
Feel free to experiment with different # LR schedules from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate # %% -class LitResnet(LightningModule): +class LitResnet(L.LightningModule): def __init__(self, lr=0.05): super().__init__() @@ -106,7 +137,7 @@ def evaluate(self, batch, stage=None): logits = self(x) loss = F.nll_loss(logits, y) preds = torch.argmax(logits, dim=1) - acc = accuracy(preds, y) + acc = accuracy(preds, y, task="multiclass", num_classes=10) if stage: self.log(f"{stage}_loss", loss, prog_bar=True) @@ -141,16 +172,16 @@ def configure_optimizers(self): # %% model = LitResnet(lr=0.05) -trainer = Trainer( +trainer = L.Trainer( max_epochs=30, accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, logger=CSVLogger(save_dir="logs/"), - callbacks=[LearningRateMonitor(logging_interval="step"), TQDMProgressBar(refresh_rate=10)], + callbacks=[LearningRateMonitor(logging_interval="step")], ) -trainer.fit(model, cifar10_dm) -trainer.test(model, datamodule=cifar10_dm) +trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader) +trainer.test(model, test_dataloader) # %% @@ -181,7 +212,7 @@ def forward(self, x): out = self.swa_model(x) return F.log_softmax(out, dim=1) - def training_epoch_end(self, training_step_outputs): + def on_train_epoch_end(self): self.swa_model.update_parameters(self.model) def validation_step(self, batch, batch_idx, stage=None): @@ -189,7 +220,7 @@ def validation_step(self, batch, batch_idx, stage=None): logits = F.log_softmax(self.model(x), dim=1) loss = F.nll_loss(logits, y) preds = torch.argmax(logits, dim=1) - acc = accuracy(preds, y) + acc = accuracy(preds, y, task="multiclass", num_classes=10) self.log("val_loss", loss, prog_bar=True) self.log("val_acc", acc, prog_bar=True) @@ -204,18 +235,16 @@ def on_train_end(self): # %% swa_model = SWAResnet(model.model, lr=0.01) -swa_model.datamodule = cifar10_dm -swa_trainer = Trainer( +swa_trainer = 
L.Trainer( max_epochs=20, accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs - callbacks=[TQDMProgressBar(refresh_rate=20)], + devices=1, logger=CSVLogger(save_dir="logs/"), ) -swa_trainer.fit(swa_model, cifar10_dm) -swa_trainer.test(swa_model, datamodule=cifar10_dm) +swa_trainer.fit(swa_model, train_dataloader, val_dataloaders=val_dataloader) +swa_trainer.test(swa_model, test_dataloader) # %% diff --git a/lightning_examples/datamodules/.meta.yml b/lightning_examples/datamodules/.meta.yml index 11cb43b60..5c8fc0b98 100644 --- a/lightning_examples/datamodules/.meta.yml +++ b/lightning_examples/datamodules/.meta.yml @@ -1,14 +1,14 @@ title: PyTorch Lightning DataModules author: PL team created: 2020-12-21 -updated: 2021-06-07 +updated: 2023-03-15 license: CC BY-SA -build: 3 +build: 0 description: This notebook will walk you through how to start using Datamodules. With the release of `pytorch-lightning` version 0.9.0, we have included a new class called `LightningDataModule` to help you decouple data related hooks from your `LightningModule`. The most up-to-date documentation on datamodules can be found - [here](https://pytorch-lightning.readthedocs.io/en/stable/data/datamodule.html). + [here](https://lightning.ai/docs/pytorch/stable/data/datamodule.html). 
requirements: - torchvision accelerator: diff --git a/lightning_examples/datamodules/datamodules.py b/lightning_examples/datamodules/datamodules.py index 2dfc79f81..dd5b655b2 100644 --- a/lightning_examples/datamodules/datamodules.py +++ b/lightning_examples/datamodules/datamodules.py @@ -6,10 +6,9 @@ # %% import os +import lightning as L import torch import torch.nn.functional as F -from pytorch_lightning import LightningDataModule, LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar from torch import nn from torch.utils.data import DataLoader, random_split from torchmetrics.functional import accuracy @@ -34,7 +33,7 @@ # %% -class LitMNIST(LightningModule): +class LitMNIST(L.LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): super().__init__() @@ -69,7 +68,7 @@ def forward(self, x): x = self.model(x) return F.log_softmax(x, dim=1) - def training_step(self, batch, batch_idx): + def training_step(self, batch): x, y = batch logits = self(x) loss = F.nll_loss(logits, y) @@ -80,7 +79,7 @@ def validation_step(self, batch, batch_idx): logits = self(x) loss = F.nll_loss(logits, y) preds = torch.argmax(logits, dim=1) - acc = accuracy(preds, y) + acc = accuracy(preds, y, task="multiclass", num_classes=10) self.log("val_loss", loss, prog_bar=True) self.log("val_acc", acc, prog_bar=True) @@ -122,11 +121,10 @@ def test_dataloader(self): # %% model = LitMNIST() -trainer = Trainer( +trainer = L.Trainer( max_epochs=2, accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs - callbacks=[TQDMProgressBar(refresh_rate=20)], + devices=1, ) trainer.fit(model) @@ -163,7 +161,7 @@ def test_dataloader(self): # %% -class MNISTDataModule(LightningDataModule): +class MNISTDataModule(L.LightningDataModule): def __init__(self, data_dir: str = PATH_DATASETS): super().__init__() self.data_dir = data_dir @@ -211,7 +209,7 @@ def test_dataloader(self): # %% -class 
LitModel(LightningModule): +class LitModel(L.LightningModule): def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4): super().__init__() @@ -238,7 +236,7 @@ def forward(self, x): x = self.model(x) return F.log_softmax(x, dim=1) - def training_step(self, batch, batch_idx): + def training_step(self, batch): x, y = batch logits = self(x) loss = F.nll_loss(logits, y) @@ -249,7 +247,7 @@ def validation_step(self, batch, batch_idx): logits = self(x) loss = F.nll_loss(logits, y) preds = torch.argmax(logits, dim=1) - acc = accuracy(preds, y) + acc = accuracy(preds, y, task="multiclass", num_classes=10) self.log("val_loss", loss, prog_bar=True) self.log("val_acc", acc, prog_bar=True) @@ -269,11 +267,10 @@ def configure_optimizers(self): # Init model from datamodule's attributes model = LitModel(*dm.dims, dm.num_classes) # Init trainer -trainer = Trainer( +trainer = L.Trainer( max_epochs=3, - callbacks=[TQDMProgressBar(refresh_rate=20)], accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, ) # Pass the datamodule as arg to trainer.fit to override model hooks :) trainer.fit(model, dm) @@ -285,7 +282,7 @@ def configure_optimizers(self): # %% -class CIFAR10DataModule(LightningDataModule): +class CIFAR10DataModule(L.LightningDataModule): def __init__(self, data_dir: str = "./"): super().__init__() self.data_dir = data_dir @@ -334,11 +331,9 @@ def test_dataloader(self): # %% dm = CIFAR10DataModule() model = LitModel(*dm.dims, dm.num_classes, hidden_size=256) -tqdm_progress_bar = TQDMProgressBar(refresh_rate=20) -trainer = Trainer( +trainer = L.Trainer( max_epochs=5, accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs - callbacks=[tqdm_progress_bar], + devices=1, ) trainer.fit(model, dm) diff --git a/lightning_examples/finetuning-scheduler/.meta.yml b/lightning_examples/finetuning-scheduler/.meta.yml index 28ccfe005..62641551d 100644 
--- a/lightning_examples/finetuning-scheduler/.meta.yml +++ b/lightning_examples/finetuning-scheduler/.meta.yml @@ -15,5 +15,7 @@ description: | and foundation model weights. The required dependencies are installed via the finetuning-scheduler ``[examples]`` extra. requirements: - finetuning-scheduler[examples]>=0.4.0 + - datasets<2.8.0 # todo: AttributeError: module 'datasets.arrow_dataset' has no attribute 'Batch' + - lightning>=2.0.0rc0 accelerator: - GPU diff --git a/lightning_examples/finetuning-scheduler/finetuning-scheduler.py b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py index 1d1a77d78..956897de4 100644 --- a/lightning_examples/finetuning-scheduler/finetuning-scheduler.py +++ b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py @@ -34,7 +34,7 @@ # criteria (a multi-phase extension of ``EarlyStopping`` packaged with FinetuningScheduler), user-specified epoch transitions or a composition of the two (the default mode). # A [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) training session completes when the # final phase of the schedule has its stopping criteria met. See -# the [early stopping documentation](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.EarlyStopping.html) for more details on that callback's configuration. +# the [early stopping documentation](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.callbacks.EarlyStopping.html) for more details on that callback's configuration. 
# # ![FinetuningScheduler explicit loss animation](fts_explicit_loss_anim.gif){height="272px" width="376px"} @@ -118,7 +118,7 @@ # ## Resuming Scheduled Fine-Tuning Training Sessions # # Resumption of scheduled fine-tuning training is identical to the continuation of -# [other training sessions](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html) with the caveat that the provided checkpoint must have been saved by a [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) session. +# [other training sessions](https://lightning.ai/docs/pytorch/stable/common/trainer.html) with the caveat that the provided checkpoint must have been saved by a [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) session. # [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) uses [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) (an extension of ``ModelCheckpoint``) to maintain schedule state with special metadata. 
# # @@ -138,7 +138,7 @@ # trainer.fit(..., ckpt_path="some/path/to/my_kth_best_checkpoint.ckpt") # ``` # -# Note that similar to the behavior of [ModelCheckpoint](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html), (specifically [this PR](https://github.com/Lightning-AI/lightning/pull/12045)), +# Note that similar to the behavior of [ModelCheckpoint](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html), (specifically [this PR](https://github.com/Lightning-AI/lightning/pull/12045)), # when resuming training with a different [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) ``dirpath`` from the provided # checkpoint, the new training session's checkpoint state will be re-initialized at the resumption depth with the provided checkpoint being set as the best checkpoint. diff --git a/lightning_examples/mnist-hello-world/.meta.yml b/lightning_examples/mnist-hello-world/.meta.yml index e08fb4263..ae9f221c8 100644 --- a/lightning_examples/mnist-hello-world/.meta.yml +++ b/lightning_examples/mnist-hello-world/.meta.yml @@ -1,9 +1,9 @@ -title: Introduction to Pytorch Lightning +title: Introduction to PyTorch Lightning author: PL team created: 2020-12-21 -updated: 2021-06-16 +updated: 2023-05-15 license: CC BY-SA -build: 4 +build: 0 tags: - Image description: In this notebook, we'll go over the basics of lightning by preparing @@ -13,6 +13,7 @@ requirements: - torchmetrics >=0.11.0 - pandas - seaborn + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/lightning_examples/mnist-hello-world/hello-world.py b/lightning_examples/mnist-hello-world/hello-world.py index 35f59afe2..cd6c6eff5 100644 --- a/lightning_examples/mnist-hello-world/hello-world.py +++ b/lightning_examples/mnist-hello-world/hello-world.py @@ -1,13 +1,12 @@ # %% import os +import lightning as L 
import pandas as pd import seaborn as sn import torch -from IPython.core.display import display -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.loggers import CSVLogger +from IPython.display import display +from lightning.pytorch.loggers import CSVLogger from torch import nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split @@ -27,7 +26,7 @@ # %% -class MNISTModel(LightningModule): +class MNISTModel(L.LightningModule): def __init__(self): super().__init__() self.l1 = torch.nn.Linear(28 * 28, 10) @@ -60,11 +59,10 @@ def configure_optimizers(self): train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE) # Initialize a trainer -trainer = Trainer( +trainer = L.Trainer( accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, max_epochs=3, - callbacks=[TQDMProgressBar(refresh_rate=20)], ) # Train the model ⚡ @@ -84,22 +82,22 @@ def configure_optimizers(self): # # ### Note what the following built-in functions are doing: # -# 1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#prepare-data) 💾 +# 1. [prepare_data()](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#prepare-data) 💾 # - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there. # - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`) # -# 2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#setup) ⚙️ +# 2. [setup(stage)](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#setup) ⚙️ # - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). 
# - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'. # - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals). # - **Note this runs across all GPUs and it *is* safe to make state assignments here** # -# 3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.hooks.DataHooks.html#pytorch_lightning.core.hooks.DataHooks.train_dataloader) ♻️ +# 3. [x_dataloader()](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.hooks.DataHooks.html#lightning.pytorch.core.hooks.DataHooks.train_dataloader) ♻️ # - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()` # %% -class LitMNIST(LightningModule): +class LitMNIST(L.LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): super().__init__() @@ -201,11 +199,10 @@ def test_dataloader(self): # %% model = LitMNIST() -trainer = Trainer( +trainer = L.Trainer( accelerator="auto", - devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + devices=1, max_epochs=3, - callbacks=[TQDMProgressBar(refresh_rate=20)], logger=CSVLogger(save_dir="logs/"), ) trainer.fit(model) diff --git a/lightning_examples/mnist-tpu-training/.meta.yml b/lightning_examples/mnist-tpu-training/.meta.yml index bba3e2bfa..7c8236269 100644 --- a/lightning_examples/mnist-tpu-training/.meta.yml +++ b/lightning_examples/mnist-tpu-training/.meta.yml @@ -1,15 +1,16 @@ title: TPU training with PyTorch Lightning author: PL team created: 2020-12-21 -updated: 2021-06-25 +updated: 2023-05-15 license: CC BY-SA -build: 1 +build: 0 tags: - Image description: In this notebook, we'll train a model on TPUs.
Updating one Trainer flag is all you need for that. The most up to documentation related to TPU training can be found - [here](https://pytorch-lightning.readthedocs.io/en/stable/accelerators/tpu.html). + [here](https://lightning.ai/docs/pytorch/stable/accelerators/tpu.html). requirements: - torchvision + - lightning>=2.0.0rc0 accelerator: - TPU diff --git a/lightning_examples/mnist-tpu-training/mnist-tpu.py b/lightning_examples/mnist-tpu-training/mnist-tpu.py index d2e3b3e5f..f0d74271d 100644 --- a/lightning_examples/mnist-tpu-training/mnist-tpu.py +++ b/lightning_examples/mnist-tpu-training/mnist-tpu.py @@ -4,11 +4,11 @@ # %% # ! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl +import lightning as L + # %% import torch import torch.nn.functional as F -from pytorch_lightning import LightningDataModule, LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar from torch import nn from torch.utils.data import DataLoader, random_split from torchmetrics.functional import accuracy @@ -23,11 +23,11 @@ # ### Defining The `MNISTDataModule` # # Below we define `MNISTDataModule`. You can learn more about datamodules -# in [docs](https://pytorch-lightning.readthedocs.io/en/stable/data/datamodule.html). +# in [docs](https://lightning.ai/docs/pytorch/stable/data/datamodule.html). 
# %% -class MNISTDataModule(LightningDataModule): +class MNISTDataModule(L.LightningDataModule): def __init__(self, data_dir: str = "./"): super().__init__() self.data_dir = data_dir @@ -68,7 +68,7 @@ def test_dataloader(self): # %% -class LitModel(LightningModule): +class LitModel(L.LightningModule): def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4): super().__init__() @@ -130,9 +130,8 @@ def configure_optimizers(self): # Init model from datamodule's attributes model = LitModel(*dm.size(), dm.num_classes) # Init trainer -trainer = Trainer( +trainer = L.Trainer( max_epochs=3, - callbacks=[TQDMProgressBar(refresh_rate=20)], accelerator="tpu", devices=[5], ) @@ -148,11 +147,10 @@ def configure_optimizers(self): # Init model from datamodule's attributes model = LitModel(*dm.dims, dm.num_classes) # Init trainer -trainer = Trainer( +trainer = L.Trainer( max_epochs=3, accelerator="tpu", devices=1, - callbacks=[TQDMProgressBar(refresh_rate=20)], ) # Train trainer.fit(model, dm) @@ -167,9 +165,8 @@ def configure_optimizers(self): # Init model from datamodule's attributes model = LitModel(*dm.dims, dm.num_classes) # Init trainer -trainer = Trainer( +trainer = L.Trainer( max_epochs=3, - callbacks=[TQDMProgressBar(refresh_rate=20)], accelerator="tpu", devices=8, ) diff --git a/lightning_examples/reinforce-learning-DQN/.meta.yml b/lightning_examples/reinforce-learning-DQN/.meta.yml index 0baed6d82..abf466a6d 100644 --- a/lightning_examples/reinforce-learning-DQN/.meta.yml +++ b/lightning_examples/reinforce-learning-DQN/.meta.yml @@ -17,6 +17,7 @@ requirements: - pygame - pandas - seaborn + - lightning>=2.0.0rc0 accelerator: - CPU - GPU diff --git a/lightning_examples/text-transformers/.meta.yml b/lightning_examples/text-transformers/.meta.yml index 9dec0c632..f34c5fe85 100644 --- a/lightning_examples/text-transformers/.meta.yml +++ b/lightning_examples/text-transformers/.meta.yml @@ -16,5 +16,6 @@ requirements: - scipy - scikit-learn 
- torchtext>=0.9 + - lightning>=2.0.0rc0 accelerator: - GPU diff --git a/lightning_examples/warp-drive/.meta.yml b/lightning_examples/warp-drive/.meta.yml index ce6b67470..adf93f01b 100644 --- a/lightning_examples/warp-drive/.meta.yml +++ b/lightning_examples/warp-drive/.meta.yml @@ -25,5 +25,6 @@ requirements: - torch==1.10.* - torchvision==0.11.* - torchtext==0.11.* + - lightning>=2.0.0rc0 accelerator: - GPU diff --git a/lightning_examples/warp-drive/multi_agent_rl.py b/lightning_examples/warp-drive/multi_agent_rl.py index 4fa4cf71f..33efbe266 100644 --- a/lightning_examples/warp-drive/multi_agent_rl.py +++ b/lightning_examples/warp-drive/multi_agent_rl.py @@ -9,8 +9,8 @@ # This tutorial provides a demonstration of a multi-agent Reinforcement Learning (RL) training loop with [WarpDrive](https://github.com/salesforce/warp-drive). WarpDrive is a flexible, lightweight, and easy-to-use RL framework that implements end-to-end deep multi-agent RL on a GPU (Graphics Processing Unit). Using the extreme parallelization capability of GPUs, it enables [orders-of-magnitude faster RL](https://arxiv.org/abs/2108.13976) compared to common implementations that blend CPU simulations and GPU models. WarpDrive is extremely efficient as it runs simulations across multiple agents and multiple environment replicas all in parallel and completely eliminates the back-and-forth data copying between the CPU and the GPU during every step. As such, WarpDrive # - Can simulate 1000s of agents in each environment and thousands of environments in parallel, harnessing the extreme parallelism capability of GPUs. # - Eliminates communication between CPU and GPU, and also within the GPU, as read and write operations occur in-place. -# - Is fully compatible with Pytorch, a highly flexible and very fast deep learning framework. -# - Implements parallel action sampling on CUDA C, which is ~3x faster than using Pytorch’s sampling methods. 
+# - Is fully compatible with PyTorch, a highly flexible and very fast deep learning framework. +# - Implements parallel action sampling on CUDA C, which is ~3x faster than using PyTorch’s sampling methods. # - Allows for large-scale distributed training on multiple GPUs. # # Below is an overview of WarpDrive’s layout of computational and data structures on a single GPU. @@ -22,9 +22,9 @@ # # We invite everyone to **contribute to WarpDrive**, including adding new multi-agent environments, proposing new features and reporting issues on our open source [repository](https://github.com/salesforce/warp-drive). # -# We have integrated WarpDrive with the [Pytorch Lightning](https://www.pytorchlightning.ai/) framework, which greatly reduces the trainer boilerplate code, and improves training modularity and flexibility. It abstracts away most of the engineering pieces of code, so users can focus on research and building models, and iterate on experiments really fast. Pytorch Lightning also provides support for easily running the model on any hardware, performing distributed training, model checkpointing, performance profiling, logging and visualization. +# We have integrated WarpDrive with the [PyTorch Lightning](https://www.lightning.ai/) framework, which greatly reduces the trainer boilerplate code, and improves training modularity and flexibility. It abstracts away most of the engineering pieces of code, so users can focus on research and building models, and iterate on experiments really fast. PyTorch Lightning also provides support for easily running the model on any hardware, performing distributed training, model checkpointing, performance profiling, logging and visualization. 
# -# Below, we demonstrate how to use WarpDrive and PytorchLightning together to train a game of [Tag](https://github.com/salesforce/warp-drive/blob/master/example_envs/tag_continuous/tag_continuous.py) where multiple *tagger* agents are trying to run after and tag multiple other *runner* agents. Here's a sample depiction of the game of Tag with $100$ runners and $5$ taggers. +# Below, we demonstrate how to use WarpDrive and PyTorch Lightning together to train a game of [Tag](https://github.com/salesforce/warp-drive/blob/master/example_envs/tag_continuous/tag_continuous.py) where multiple *tagger* agents are trying to run after and tag multiple other *runner* agents. Here's a sample depiction of the game of Tag with $100$ runners and $5$ taggers. # ![](https://blog.salesforceairesearch.com/content/images/2021/08/same_speed_50fps-1.gif) # %% [markdown] @@ -213,7 +213,7 @@ log_freq=log_freq, ) -# Instantiate the PytorchLightning trainer with the callbacks. +# Instantiate the PyTorch Lightning trainer with the callbacks. # Also, set the number of gpus to 1, since this notebook uses just a single GPU. num_gpus = 1 num_episodes = run_config["trainer"]["num_episodes"]