Lightning-AI · Borda · Mar 24, 2023 · Jan 25, 2023 · Jan 25, 2023 · Jan 25, 2023
@@ -604,7 +604,7 @@ def __init__(self, size, std=0.1):
         """
         Inputs:
             size - Number of data points we want to generate
-            std - Standard deviation of the noise (see generate_continuous_xor function)
+            std - Standard deviation of the noise (see generate_continuous_xor function).
         """
         super().__init__()
         self.size = size

@@ -213,6 +213,7 @@ def get_grads(act_fn, x):
     Args:
         act_fn: An object of the class "ActivationFunction" with an implemented forward pass.
         x: 1D input tensor.
+
     Returns:
         A tensor with the same size of x containing the gradients of act_fn at x.
     """
@@ -282,7 +283,7 @@ def __init__(self, act_fn, input_size=784, num_classes=10, hidden_sizes=[512, 25
             act_fn: Object of the activation function that should be used as non-linearity in the network.
             input_size: Size of the input images in pixels
             num_classes: Number of classes we want to predict
-            hidden_sizes: A list of integers specifying the hidden layer sizes in the NN
+            hidden_sizes: A list of integers specifying the hidden layer sizes in the NN.
         """
         super().__init__()
 
@@ -432,7 +433,7 @@ def visualize_gradients(net, color="C0"):
     """
     Args:
         net: Object of class BaseNetwork
-        color: Color in which we want to visualize the histogram (for easier separation of activation functions)
+        color: Color in which we want to visualize the histogram (for easier separation of activation functions).
     """
     net.eval()
     small_loader = data.DataLoader(train_set, batch_size=256, shuffle=False)

@@ -156,7 +156,7 @@ def __init__(self, act_fn, input_size=784, num_classes=10, hidden_sizes=[512, 25
             act_fn: Object of the activation function that should be used as non-linearity in the network.
             input_size: Size of the input images in pixels
             num_classes: Number of classes we want to predict
-            hidden_sizes: A list of integers specifying the hidden layer sizes in the NN
+            hidden_sizes: A list of integers specifying the hidden layer sizes in the NN.
         """
         super().__init__()
 
@@ -258,7 +258,7 @@ def visualize_gradients(model, color="C0", print_variance=False):
     """
     Args:
         model: Object of class BaseNetwork
-        color: Color in which we want to visualize the histogram (for easier separation of activation functions)
+        color: Color in which we want to visualize the histogram (for easier separation of activation functions).
     """
     model.eval()
     small_loader = data.DataLoader(train_set, batch_size=1024, shuffle=False)

@@ -421,7 +421,7 @@ def __init__(self, c_in, c_red: dict, c_out: dict, act_fn):
             c_in - Number of input feature maps from the previous layers
             c_red - Dictionary with keys "3x3" and "5x5" specifying the output of the dimensionality reducing 1x1 convolutions
             c_out - Dictionary with keys "1x1", "3x3", "5x5", and "max"
-            act_fn - Activation class constructor (e.g. nn.ReLU)
+            act_fn - Activation class constructor (e.g. nn.ReLU).
         """
         super().__init__()
 
@@ -670,7 +670,7 @@ def __init__(self, c_in, act_fn, subsample=False, c_out=-1):
             c_in - Number of input features
             act_fn - Activation class constructor (e.g. nn.ReLU)
             subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width
-            c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in
+            c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in.
         """
         super().__init__()
         if not subsample:
@@ -715,7 +715,7 @@ def __init__(self, c_in, act_fn, subsample=False, c_out=-1):
             c_in - Number of input features
             act_fn - Activation class constructor (e.g. nn.ReLU)
             subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width
-            c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in
+            c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in.
         """
         super().__init__()
         if not subsample:
@@ -785,7 +785,7 @@ def __init__(
             num_blocks - List with the number of ResNet blocks to use. The first block of each group uses downsampling, except the first.
             c_hidden - List with the hidden dimensionalities in the different blocks. Usually multiplied by 2 the deeper we go.
             act_fn_name - Name of the activation function to use, looked up in "act_fn_by_name"
-            block_name - Name of the ResNet block, looked up in "resnet_blocks_by_name"
+            block_name - Name of the ResNet block, looked up in "resnet_blocks_by_name".
         """
         super().__init__()
         assert block_name in resnet_blocks_by_name
@@ -953,7 +953,7 @@ def __init__(self, c_in, bn_size, growth_rate, act_fn):
             c_in - Number of input channels
             bn_size - Bottleneck size (factor of growth rate) for the output of the 1x1 convolution. Typically between 2 and 4.
             growth_rate - Number of output channels of the 3x3 convolution
-            act_fn - Activation class constructor (e.g. nn.ReLU)
+            act_fn - Activation class constructor (e.g. nn.ReLU).
         """
         super().__init__()
         self.net = nn.Sequential(
@@ -985,7 +985,7 @@ def __init__(self, c_in, num_layers, bn_size, growth_rate, act_fn):
             num_layers - Number of dense layers to apply in the block
             bn_size - Bottleneck size to use in the dense layers
             growth_rate - Growth rate to use in the dense layers
-            act_fn - Activation function to use in the dense layers
+            act_fn - Activation function to use in the dense layers.
         """
         super().__init__()
         layers = []

@@ -468,7 +468,7 @@ def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
             input_dim: Dimensionality of the input
             num_heads: Number of heads to use in the attention block
             dim_feedforward: Dimensionality of the hidden layer in the MLP
-            dropout: Dropout probability to use in the dropout layers
+            dropout: Dropout probability to use in the dropout layers.
         """
         super().__init__()
 
@@ -573,7 +573,7 @@ def get_attention_maps(self, x, mask=None):
 class PositionalEncoding(nn.Module):
     def __init__(self, d_model, max_len=5000):
         """
-        Args
+        Args:
             d_model: Hidden dimensionality of the input.
             max_len: Maximum length of a sequence to expect.
         """
@@ -769,7 +769,7 @@ def __init__(
             warmup: Number of warmup steps. Usually between 50 and 500
             max_iters: Number of maximum iterations the model is trained for. This is needed for the CosineWarmup scheduler
             dropout: Dropout to apply inside the model
-            input_dropout: Dropout to apply on the input features
+            input_dropout: Dropout to apply on the input features.
         """
         super().__init__()
         self.save_hyperparameters()

@@ -168,7 +168,7 @@ def forward(self, node_feats, adj_matrix):
             adj_matrix: Batch of adjacency matrices of the graph. If there is an edge from i to j,
                          adj_matrix[b,i,j]=1 else 0. Supports directed edges by non-symmetric matrices.
                          Assumes to already have added the identity connections.
-                         Shape: [batch_size, num_nodes, num_nodes]
+                         Shape: [batch_size, num_nodes, num_nodes].
         """
         # Num neighbours = number of incoming edges
         num_neighbours = adj_matrix.sum(dim=-1, keepdims=True)
@@ -322,7 +322,7 @@ def forward(self, node_feats, adj_matrix, print_attn_probs=False):
             node_feats: Input features of the node. Shape: [batch_size, c_in]
             adj_matrix: Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
             print_attn_probs: If True, the attention weights are printed during the forward pass
-                               (for debugging purposes)
+                               (for debugging purposes).
         """
         batch_size, num_nodes = node_feats.size(0), node_feats.size(1)
 
@@ -505,7 +505,7 @@ def __init__(
             num_layers: Number of "hidden" graph layers
             layer_name: String of the graph layer to use
             dp_rate: Dropout rate to apply throughout the network
-            kwargs: Additional arguments for the graph layer (e.g. number of heads for GAT)
+            kwargs: Additional arguments for the graph layer (e.g. number of heads for GAT).
         """
         super().__init__()
         gnn_layer = gnn_layer_by_name[layer_name]
@@ -526,7 +526,7 @@ def forward(self, x, edge_index):
         """
         Args:
             x: Input features per node
-            edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
+            edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation).
         """
         for layer in self.layers:
             # For graph layers, we need to add the "edge_index" tensor as additional input
@@ -555,7 +555,7 @@ def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
             c_hidden: Dimension of hidden features
             c_out: Dimension of the output features. Usually number of classes in classification
             num_layers: Number of hidden layers
-            dp_rate: Dropout rate to apply throughout the network
+            dp_rate: Dropout rate to apply throughout the network.
         """
         super().__init__()
         layers = []
@@ -569,7 +569,7 @@ def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
     def forward(self, x, *args, **kwargs):
         """
         Args:
-            x: Input features per node
+            x: Input features per node.
         """
         return self.layers(x)
 
@@ -849,7 +849,7 @@ def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):
             c_hidden: Dimension of hidden features
             c_out: Dimension of output features (usually number of classes)
             dp_rate_linear: Dropout rate before the linear layer (usually much higher than inside the GNN)
-            kwargs: Additional arguments for the GNNModel object
+            kwargs: Additional arguments for the GNNModel object.
         """
         super().__init__()
         self.GNN = GNNModel(c_in=c_in, c_hidden=c_hidden, c_out=c_hidden, **kwargs)  # Not our prediction output yet!
@@ -860,7 +860,7 @@ def forward(self, x, edge_index, batch_idx):
         Args:
             x: Input features per node
             edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
-            batch_idx: Index of batch element for each node
+            batch_idx: Index of batch element for each node.
         """
         x = self.GNN(x, edge_index)
         x = geom_nn.global_mean_pool(x, batch_idx)  # Average pooling

@@ -340,7 +340,7 @@ def __init__(self, model, img_shape, sample_size, max_len=8192):
             model: Neural network to use for modeling E_theta
             img_shape: Shape of the images to model
             sample_size: Batch size of the samples
-            max_len: Maximum number of data points to keep in the buffer
+            max_len: Maximum number of data points to keep in the buffer.
         """
         super().__init__()
         self.model = model

@@ -136,7 +136,7 @@ def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim:
            num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            latent_dim : Dimensionality of latent representation z
-           act_fn : Activation function used throughout the encoder network
+           act_fn : Activation function used throughout the encoder network.
         """
         super().__init__()
         c_hid = base_channel_size
@@ -195,7 +195,7 @@ def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim:
            num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
            base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
            latent_dim : Dimensionality of latent representation z
-           act_fn : Activation function used throughout the decoder network
+           act_fn : Activation function used throughout the decoder network.
         """
         super().__init__()
         c_hid = base_channel_size
@@ -263,7 +263,7 @@ def forward(self, x):
         return x_hat
 
     def _get_reconstruction_loss(self, batch):
-        """Given a batch of images, this function returns the reconstruction loss (MSE in our case)"""
+        """Given a batch of images, this function returns the reconstruction loss (MSE in our case)."""
         x, _ = batch  # We do not need the labels
         x_hat = self.forward(x)
         loss = F.mse_loss(x, x_hat, reduction="none")

@@ -263,7 +263,7 @@ def __init__(self, flows, import_samples=8):
         """
         Args:
             flows: A list of flows (each a nn.Module) that should be applied on the images.
-            import_samples: Number of importance samples to use during testing (see explanation below). Can be changed at any time
+            import_samples: Number of importance samples to use during testing (see explanation below). Can be changed at any time.
         """
         super().__init__()
         self.flows = nn.ModuleList(flows)
@@ -404,7 +404,7 @@ def __init__(self, alpha=1e-5, quants=256):
         Args:
             alpha: small constant that is used to scale the original input.
                     Prevents dealing with values very close to 0 and 1 when inverting the sigmoid
-            quants: Number of possible discrete values (usually 256 for 8-bit image)
+            quants: Number of possible discrete values (usually 256 for 8-bit image).
         """
         super().__init__()
         self.alpha = alpha
@@ -590,7 +590,7 @@ def __init__(self, var_flows, alpha=1e-5):
         """
         Args:
             var_flows: A list of flow transformations to use for modeling q(u|x)
-            alpha: Small constant, see Dequantization for details
+            alpha: Small constant, see Dequantization for details.
         """
         super().__init__(alpha=alpha)
         self.flows = nn.ModuleList(var_flows)
@@ -679,7 +679,7 @@ def forward(self, z, ldj, reverse=False, orig_img=None):
                   The ldj of this layer will be added to this tensor.
             reverse: If True, we apply the inverse of the layer.
             orig_img (optional): Only needed in VarDeq. Allows external
-                                  input to condition the flow on (e.g. original image)
+                                  input to condition the flow on (e.g. original image).
         """
         # Apply network to masked input
         z_in = z * self.mask
@@ -802,6 +802,7 @@ def __init__(self, c_in):
         """This module applies layer norm across channels in an image.
 
         Has been shown to work well with ResNet connections.
+
         Args:
             c_in: Number of channels of the input
         """
@@ -821,7 +822,7 @@ def __init__(self, c_in, c_hidden):
         This module applies a two-layer convolutional ResNet block with input gate
         Args:
             c_in: Number of channels of the input
-            c_hidden: Number of hidden dimensions we want to model (usually similar to c_in)
+            c_hidden: Number of hidden dimensions we want to model (usually similar to c_in).
         """
         super().__init__()
         self.net = nn.Sequential(
@@ -1249,7 +1250,7 @@ def interpolate(model, img1, img2, num_steps=8):
     Args:
         model: object of ImageFlow class that represents the (trained) flow model
         img1, img2: Image tensors of shape [1, 28, 28]. Images between which should be interpolated.
-        num_steps: Number of interpolation steps. 8 interpolation steps mean 6 intermediate pictures besides img1 and img2
+        num_steps: Number of interpolation steps. 8 interpolation steps mean 6 intermediate pictures besides img1 and img2.
     """
     imgs = torch.stack([img1, img2], dim=0).to(model.device)
     z, _ = model.encode(imgs)
@@ -1322,7 +1323,7 @@ def visualize_dequant_distribution(model: ImageFlow, imgs: Tensor, title: str =
     """
     Args:
         model: The flow of which we want to visualize the dequantization distribution
-        imgs: Example training images of which we want to visualize the dequantization distribution
+        imgs: Example training images of which we want to visualize the dequantization distribution.
     """
     imgs = imgs.to(device)
     ldj = torch.zeros(imgs.shape[0], dtype=torch.float32).to(device)

@@ -215,7 +215,7 @@ def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0):
             hidden_dim - Dimensionality of hidden layer in feed-forward network
                          (usually 2-4x larger than embed_dim)
             num_heads - Number of heads to use in the Multi-Head Attention block
-            dropout - Amount of dropout to apply in the feed-forward network
+            dropout - Amount of dropout to apply in the feed-forward network.
         """
         super().__init__()
 
@@ -280,7 +280,7 @@ def __init__(
             patch_size - Number of pixels that the patches have per dimension
             num_patches - Maximum number of patches an image can have
             dropout - Amount of dropout to apply in the feed-forward network and
-                      on the input encoding
+                      on the input encoding.
         """
         super().__init__()
 

@@ -270,7 +270,7 @@ def __init__(self, dataset_targets, N_way, K_shot, include_query=False, shuffle=
                       iteration (for training)
             shuffle_once - If True, examples and classes are shuffled once in
                            the beginning, but kept constant across iterations
-                           (for validation)
+                           (for validation).
         """
         super().__init__()
         self.dataset_targets = dataset_targets
@@ -977,7 +977,7 @@ def __init__(self, dataset_targets, batch_size, N_way, K_shot, include_query=Fal
                             the implementation of sampling the same classes but
                             distinct examples for support and query set.
             shuffle - If True, examples and classes are newly shuffled in each
-                      iteration (for training)
+                      iteration (for training).
         """
         super().__init__()
         self.batch_sampler = FewShotBatchSampler(dataset_targets, N_way, K_shot, include_query, shuffle)

diff --git a/lightning_examples/finetuning-scheduler/finetuning-scheduler.py b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py
@@ -324,7 +324,8 @@ def _convert_to_features(self, example_batch: datasets.arrow_dataset.Batch) -> B
 # %%
 class RteBoolqModule(pl.LightningModule):
     """A ``LightningModule`` that can be used to fine-tune a foundational model on either the RTE or BoolQ
-    SuperGLUE tasks using Hugging Face implementations of a given model and the `SuperGLUE Hugging Face dataset."""
+    SuperGLUE tasks using Hugging Face implementations of a given model and the SuperGLUE Hugging Face dataset.
+    """
 
     def __init__(
         self,