diff --git a/.github/workflows/ci_test-acts.yml b/.github/workflows/ci_test-acts.yml
index 7f1d99aa5..205e26858 100644
--- a/.github/workflows/ci_test-acts.yml
+++ b/.github/workflows/ci_test-acts.yml
@@ -47,10 +47,9 @@ jobs:
       - name: Install requirements
         run: |
           pip --version
-          pip install -q -r .actions/requires.txt
-          pip install -q "pytest==6.*" coverage jupytext
+          pip install -q -r .actions/requires.txt -r _requirements/test.txt
           # this is needed to be able to run package version parsing test
-          pip install -q matplotlib -r _requirements/default.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+          pip install -q -r _requirements/default.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html

      - name: Prepare dummy inputs
        run: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c65cb648f..96b84495f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -60,7 +60,8 @@ repos:
           - mdformat-black
           - mdformat_frontmatter

-  - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.0.259
     hooks:
-      - id: flake8
+      - id: ruff
+        args: ["--fix"]
diff --git a/_requirements/default.txt b/_requirements/default.txt
index 42e73331b..0abab9137 100644
--- a/_requirements/default.txt
+++ b/_requirements/default.txt
@@ -1,4 +1,5 @@
 setuptools==67.4.0
+matplotlib>=3.0.0, <3.4.0
 ipython[notebook]>=8.0.0, <8.12.0
 torch>=1.8.1, <1.14.0
 pytorch-lightning>=1.4, <2.0.0
diff --git a/_requirements/test.txt b/_requirements/test.txt
new file mode 100644
index 000000000..e505fefe7
--- /dev/null
+++ b/_requirements/test.txt
@@ -0,0 +1,5 @@
+coverage>=5.0
+codecov>=2.1
+pytest>=6.0
+pytest-cov
+jupytext
diff --git a/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py b/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py
index c692d1402..aad5d7276 100644
--- a/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py
+++ b/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py
@@ -601,10 +601,11 @@ def forward(self, x):

 class XORDataset(data.Dataset):
     def __init__(self, size, std=0.1):
-        """
-        Inputs:
-            size - Number of data points we want to generate
-            std - Standard deviation of the noise (see generate_continuous_xor function)
+        """XORDataset.
+
+        Args:
+            size: Number of data points we want to generate
+            std: Standard deviation of the noise (see generate_continuous_xor function)
         """
         super().__init__()
         self.size = size
diff --git a/course_UvA-DL/02-activation-functions/Activation_Functions.py b/course_UvA-DL/02-activation-functions/Activation_Functions.py
index 1abd12699..db900dfb7 100644
--- a/course_UvA-DL/02-activation-functions/Activation_Functions.py
+++ b/course_UvA-DL/02-activation-functions/Activation_Functions.py
@@ -213,6 +213,7 @@ def get_grads(act_fn, x):
     Args:
         act_fn: An object of the class "ActivationFunction" with an implemented forward pass.
         x: 1D input tensor.
+
     Returns:
         A tensor with the same size of x containing the gradients of act_fn at x.
     """
@@ -277,7 +278,8 @@ def vis_act_fn(act_fn, ax, x):
 # %%
 class BaseNetwork(nn.Module):
     def __init__(self, act_fn, input_size=784, num_classes=10, hidden_sizes=[512, 256, 256, 128]):
-        """
+        """Base Network.
+
         Args:
             act_fn: Object of the activation function that should be used as non-linearity in the network.
             input_size: Size of the input images in pixels
@@ -429,7 +431,8 @@ def save_model(model, model_path, model_name):

 # %%
 def visualize_gradients(net, color="C0"):
-    """
+    """Visualize gradients.
+
     Args:
         net: Object of class BaseNetwork
         color: Color in which we want to visualize the histogram (for easier separation of activation functions)
diff --git a/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py b/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py
index 4fd6e47a0..ea8788ed5 100644
--- a/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py
+++ b/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py
@@ -151,7 +151,8 @@
 # %%
 class BaseNetwork(nn.Module):
     def __init__(self, act_fn, input_size=784, num_classes=10, hidden_sizes=[512, 256, 256, 128]):
-        """
+        """Base Network.
+
         Args:
             act_fn: Object of the activation function that should be used as non-linearity in the network.
             input_size: Size of the input images in pixels
diff --git a/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py b/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py
index ffee8ff48..5d5def356 100644
--- a/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py
+++ b/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py
@@ -217,12 +217,13 @@
 # %%
 class CIFARModule(L.LightningModule):
     def __init__(self, model_name, model_hparams, optimizer_name, optimizer_hparams):
-        """
-        Inputs:
-            model_name - Name of the model/CNN to run. Used for creating the model (see function below)
-            model_hparams - Hyperparameters for the model, as dictionary.
-            optimizer_name - Name of the optimizer to use. Currently supported: Adam, SGD
-            optimizer_hparams - Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
+        """CIFARModule.
+
+        Args:
+            model_name: Name of the model/CNN to run. Used for creating the model (see function below)
+            model_hparams: Hyperparameters for the model, as dictionary.
+            optimizer_name: Name of the optimizer to use. Currently supported: Adam, SGD
+            optimizer_hparams: Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
         """
         super().__init__()
         # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
@@ -337,10 +338,11 @@ def create_model(model_name, model_hparams):

 # %%
 def train_model(model_name, save_name=None, **kwargs):
-    """
-    Inputs:
-        model_name - Name of the model you want to run. Is used to look up the class in "model_dict"
-        save_name (optional) - If specified, this name will be used for creating the checkpoint and logging directory.
+    """Train model.
+
+    Args:
+        model_name: Name of the model you want to run. Is used to look up the class in "model_dict"
+        save_name (optional): If specified, this name will be used for creating the checkpoint and logging directory.
     """
     if save_name is None:
         save_name = model_name
@@ -417,12 +419,13 @@ def train_model(model_name, save_name=None, **kwargs):

 # %%
 class InceptionBlock(nn.Module):
     def __init__(self, c_in, c_red: dict, c_out: dict, act_fn):
-        """
-        Inputs:
-            c_in - Number of input feature maps from the previous layers
-            c_red - Dictionary with keys "3x3" and "5x5" specifying the output of the dimensionality reducing 1x1 convolutions
-            c_out - Dictionary with keys "1x1", "3x3", "5x5", and "max"
-            act_fn - Activation class constructor (e.g. nn.ReLU)
+        """InceptionBlock.
+
+        Args:
+            c_in: Number of input feature maps from the previous layers
+            c_red: Dictionary with keys "3x3" and "5x5" specifying the output of the dimensionality reducing 1x1 convolutions
+            c_out: Dictionary with keys "1x1", "3x3", "5x5", and "max"
+            act_fn: Activation class constructor (e.g. nn.ReLU)
         """
         super().__init__()
@@ -666,10 +669,11 @@ def forward(self, x):

 class ResNetBlock(nn.Module):
     def __init__(self, c_in, act_fn, subsample=False, c_out=-1):
-        """
-        Inputs:
-            c_in - Number of input features
-            act_fn - Activation class constructor (e.g. nn.ReLU)
+        """ResNetBlock.
+
+        Args:
+            c_in: Number of input features
+            act_fn: Activation class constructor (e.g. nn.ReLU)
             subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width
             c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in
         """
@@ -711,8 +715,9 @@ def forward(self, x):
 # %%
 class PreActResNetBlock(nn.Module):
     def __init__(self, c_in, act_fn, subsample=False, c_out=-1):
-        """
-        Inputs:
+        """PreAct ResNet Block.
+
+        Args:
             c_in - Number of input features
             act_fn - Activation class constructor (e.g. nn.ReLU)
             subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width
@@ -780,8 +785,9 @@ def __init__(
         block_name="ResNetBlock",
         **kwargs,
     ):
-        """
-        Inputs:
+        """ResNet.
+
+        Args:
             num_classes - Number of classification outputs (10 for CIFAR10)
             num_blocks - List with the number of ResNet blocks to use. The first block of each group uses downsampling, except the first.
             c_hidden - List with the hidden dimensionalities in the different blocks. Usually multiplied by 2 the deeper we go.
@@ -949,8 +955,9 @@ def forward(self, x):
 # %%
 class DenseLayer(nn.Module):
     def __init__(self, c_in, bn_size, growth_rate, act_fn):
-        """
-        Inputs:
+        """DenseLayer.
+
+        Args:
             c_in - Number of input channels
             bn_size - Bottleneck size (factor of growth rate) for the output of the 1x1 convolution. Typically between 2 and 4.
             growth_rate - Number of output channels of the 3x3 convolution
@@ -980,8 +987,9 @@ def forward(self, x):
 # %%
 class DenseBlock(nn.Module):
     def __init__(self, c_in, num_layers, bn_size, growth_rate, act_fn):
-        """
-        Inputs:
+        """Dense Block.
+
+        Args:
             c_in - Number of input channels
             num_layers - Number of dense layers to apply in the block
             bn_size - Bottleneck size to use in the dense layers
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py b/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py
index f74ed35b1..753b368db 100644
--- a/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py
+++ b/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py
@@ -463,7 +463,8 @@ def forward(self, x, mask=None, return_attention=False):
 # %%
 class EncoderBlock(nn.Module):
     def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
-        """
+        """EncoderBlock.
+
         Args:
             input_dim: Dimensionality of the input
             num_heads: Number of heads to use in the attention block
@@ -572,8 +573,9 @@ def get_attention_maps(self, x, mask=None):
 # %%
 class PositionalEncoding(nn.Module):
     def __init__(self, d_model, max_len=5000):
-        """
-        Args
+        """Positional Encoding.
+
+        Args:
             d_model: Hidden dimensionality of the input.
             max_len: Maximum length of a sequence to expect.
         """
@@ -758,7 +760,8 @@ def __init__(
         dropout=0.0,
         input_dropout=0.0,
     ):
-        """
+        """TransformerPredictor.
+
         Args:
             input_dim: Hidden dimensionality of the input
             model_dim: Hidden dimensionality to use inside the Transformer
diff --git a/course_UvA-DL/06-graph-neural-networks/GNN_overview.py b/course_UvA-DL/06-graph-neural-networks/GNN_overview.py
index 443f4421f..1e693573e 100644
--- a/course_UvA-DL/06-graph-neural-networks/GNN_overview.py
+++ b/course_UvA-DL/06-graph-neural-networks/GNN_overview.py
@@ -162,7 +162,8 @@ def __init__(self, c_in, c_out):
         self.projection = nn.Linear(c_in, c_out)

     def forward(self, node_feats, adj_matrix):
-        """
+        """Forward.
+
         Args:
             node_feats: Tensor with node features of shape [batch_size, num_nodes, c_in]
             adj_matrix: Batch of adjacency matrices of the graph. If there is an edge from i to j,
@@ -317,7 +318,8 @@ def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2):
         nn.init.xavier_uniform_(self.a.data, gain=1.414)

     def forward(self, node_feats, adj_matrix, print_attn_probs=False):
-        """
+        """Forward.
+
         Args:
             node_feats: Input features of the node. Shape: [batch_size, c_in]
             adj_matrix: Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
@@ -497,7 +499,8 @@ def __init__(
         dp_rate=0.1,
         **kwargs,
     ):
-        """
+        """GNNModel.
+
         Args:
             c_in: Dimension of input features
             c_hidden: Dimension of hidden features
@@ -523,7 +526,8 @@ def __init__(
         self.layers = nn.ModuleList(layers)

     def forward(self, x, edge_index):
-        """
+        """Forward.
+
         Args:
             x: Input features per node
             edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
@@ -549,7 +553,8 @@ def forward(self, x, edge_index):
 # %%
 class MLPModel(nn.Module):
     def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
-        """
+        """MLPModel.
+
         Args:
             c_in: Dimension of input features
             c_hidden: Dimension of hidden features
@@ -567,7 +572,8 @@ def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
         self.layers = nn.Sequential(*layers)

     def forward(self, x, *args, **kwargs):
-        """
+        """Forward.
+
         Args:
             x: Input features per node
         """
@@ -844,7 +850,8 @@ def print_results(result_dict):
 # %%
 class GraphGNNModel(nn.Module):
     def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):
-        """
+        """GraphGNNModel.
+
         Args:
             c_in: Dimension of input features
             c_hidden: Dimension of hidden features
@@ -857,7 +864,8 @@ def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):
         self.head = nn.Sequential(nn.Dropout(dp_rate_linear), nn.Linear(c_hidden, c_out))

     def forward(self, x, edge_index, batch_idx):
-        """
+        """Forward.
+
         Args:
             x: Input features per node
             edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py b/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
index 6cd07a40a..862653362 100644
--- a/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
+++ b/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
@@ -335,7 +335,8 @@ def forward(self, x):
 # %%
 class Sampler:
     def __init__(self, model, img_shape, sample_size, max_len=8192):
-        """
+        """Sampler.
+
         Args:
             model: Neural network to use for modeling E_theta
             img_shape: Shape of the images to model
diff --git a/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py b/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py
index da289b9e6..6d4fbf625 100644
--- a/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py
+++ b/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py
@@ -131,7 +131,8 @@ def get_train_images(num):
 # %%
 class Encoder(nn.Module):
     def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU):
-        """
+        """Encoder.
+
         Args:
             num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
             base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
@@ -190,7 +191,8 @@ def forward(self, x):
 # %%
 class Decoder(nn.Module):
     def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU):
-        """
+        """Decoder.
+
         Args:
             num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
             base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
@@ -263,7 +265,7 @@ def forward(self, x):
         return x_hat

     def _get_reconstruction_loss(self, batch):
-        """Given a batch of images, this function returns the reconstruction loss (MSE in our case)"""
+        """Given a batch of images, this function returns the reconstruction loss (MSE in our case)."""
         x, _ = batch  # We do not need the labels
         x_hat = self.forward(x)
         loss = F.mse_loss(x, x_hat, reduction="none")
diff --git a/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py b/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
index 28821f6e8..446229006 100644
--- a/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
+++ b/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
@@ -261,7 +261,8 @@ def show_imgs(imgs, title=None, row_size=4):
 # %%
 class ImageFlow(L.LightningModule):
     def __init__(self, flows, import_samples=8):
-        """
+        """ImageFlow.
+
         Args:
             flows: A list of flows (each a nn.Module) that should be applied on the images.
             import_samples: Number of importance samples to use during testing (see explanation below). Can be changed at any time
@@ -401,7 +402,8 @@ def test_step(self, batch, batch_idx):
 # %%
 class Dequantization(nn.Module):
     def __init__(self, alpha=1e-5, quants=256):
-        """
+        """Dequantization.
+
         Args:
             alpha: small constant that is used to scale the original input.
                 Prevents dealing with values very close to 0 and 1 when inverting the sigmoid
@@ -588,7 +590,8 @@ def visualize_dequantization(quants, prior=None):
 # %%
 class VariationalDequantization(Dequantization):
     def __init__(self, var_flows, alpha=1e-5):
-        """
+        """Variational Dequantization.
+
         Args:
             var_flows: A list of flow transformations to use for modeling q(u|x)
             alpha: Small constant, see Dequantization for details
@@ -673,14 +676,15 @@ def __init__(self, network, mask, c_in):
         self.register_buffer("mask", mask)

     def forward(self, z, ldj, reverse=False, orig_img=None):
-        """
+        """Forward.
+
         Args:
             z: Latent input to the flow
-            ldj: The current ldj of the previous flows.
-                The ldj of this layer will be added to this tensor.
+            ldj:
+                The current ldj of the previous flows. The ldj of this layer will be added to this tensor.
             reverse: If True, we apply the inverse of the layer.
-            orig_img (optional): Only needed in VarDeq. Allows external
-                input to condition the flow on (e.g. original image)
+            orig_img:
+                Only needed in VarDeq. Allows external input to condition the flow on (e.g. original image)
         """
         # Apply network to masked input
         z_in = z * self.mask
@@ -800,11 +804,11 @@ def forward(self, x):

 class LayerNormChannels(nn.Module):
     def __init__(self, c_in, eps=1e-5):
-        """
-        This module applies layer norm across channels in an image.
-        Inputs:
-            c_in - Number of channels of the input
-            eps - Small constant to stabilize std
+        """This module applies layer norm across channels in an image.
+
+        Args:
+            c_in: Number of channels of the input
+            eps: Small constant to stabilize std
         """
         super().__init__()
         self.gamma = nn.Parameter(torch.ones(1, c_in, 1, 1))
@@ -821,8 +825,8 @@ def forward(self, x):

 class GatedConv(nn.Module):
     def __init__(self, c_in, c_hidden):
-        """
-        This module applies a two-layer convolutional ResNet block with input gate
+        """This module applies a two-layer convolutional ResNet block with input gate.
+
         Args:
             c_in: Number of channels of the input
             c_hidden: Number of hidden dimensions we want to model (usually similar to c_in)
@@ -1251,7 +1255,8 @@ def print_num_params(model):
 # %%
 @torch.no_grad()
 def interpolate(model, img1, img2, num_steps=8):
-    """
+    """Interpolate.
+
     Args:
         model: object of ImageFlow class that represents the (trained) flow model
         img1, img2: Image tensors of shape [1, 28, 28]. Images between which should be interpolated.
@@ -1325,7 +1330,8 @@ def interpolate(model, img1, img2, num_steps=8):

 # %%
 def visualize_dequant_distribution(model: ImageFlow, imgs: Tensor, title: str = None):
-    """
+    """Visualize dequant distribution.
+
     Args:
         model: The flow of which we want to visualize the dequantization distribution
         imgs: Example training images of which we want to visualize the dequantization distribution
diff --git a/course_UvA-DL/11-vision-transformer/Vision_Transformer.py b/course_UvA-DL/11-vision-transformer/Vision_Transformer.py
index 9d8cef8c9..a7c419ad5 100644
--- a/course_UvA-DL/11-vision-transformer/Vision_Transformer.py
+++ b/course_UvA-DL/11-vision-transformer/Vision_Transformer.py
@@ -154,10 +154,10 @@
 # %%
 def img_to_patch(x, patch_size, flatten_channels=True):
     """
-    Inputs:
-        x - Tensor representing the image of shape [B, C, H, W]
-        patch_size - Number of pixels per dimension of the patches (integer)
-        flatten_channels - If True, the patches will be returned in a flattened format
+    Args:
+        x: Tensor representing the image of shape [B, C, H, W]
+        patch_size: Number of pixels per dimension of the patches (integer)
+        flatten_channels: If True, the patches will be returned in a flattened format
            as a feature vector instead of an image grid.
     """
     B, C, H, W = x.shape
@@ -209,13 +209,14 @@ def img_to_patch(x, patch_size, flatten_channels=True):
 # %%
 class AttentionBlock(nn.Module):
     def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0):
-        """
-        Inputs:
-            embed_dim - Dimensionality of input and attention feature vectors
-            hidden_dim - Dimensionality of hidden layer in feed-forward network
+        """Attention Block.
+
+        Args:
+            embed_dim: Dimensionality of input and attention feature vectors
+            hidden_dim: Dimensionality of hidden layer in feed-forward network
                (usually 2-4x larger than embed_dim)
-            num_heads - Number of heads to use in the Multi-Head Attention block
-            dropout - Amount of dropout to apply in the feed-forward network
+            num_heads: Number of heads to use in the Multi-Head Attention block
+            dropout: Amount of dropout to apply in the feed-forward network
         """
         super().__init__()
@@ -268,18 +269,19 @@ def __init__(
         num_patches,
         dropout=0.0,
     ):
-        """
-        Inputs:
-            embed_dim - Dimensionality of the input feature vectors to the Transformer
-            hidden_dim - Dimensionality of the hidden layer in the feed-forward networks
+        """Vision Transformer.
+
+        Args:
+            embed_dim: Dimensionality of the input feature vectors to the Transformer
+            hidden_dim: Dimensionality of the hidden layer in the feed-forward networks
                within the Transformer
-            num_channels - Number of channels of the input (3 for RGB)
-            num_heads - Number of heads to use in the Multi-Head Attention block
-            num_layers - Number of layers to use in the Transformer
-            num_classes - Number of classes to predict
-            patch_size - Number of pixels that the patches have per dimension
-            num_patches - Maximum number of patches an image can have
-            dropout - Amount of dropout to apply in the feed-forward network and
+            num_channels: Number of channels of the input (3 for RGB)
+            num_heads: Number of heads to use in the Multi-Head Attention block
+            num_layers: Number of layers to use in the Transformer
+            num_classes: Number of classes to predict
+            patch_size: Number of pixels that the patches have per dimension
+            num_patches: Maximum number of patches an image can have
+            dropout: Amount of dropout to apply in the feed-forward network and
                on the input encoding
         """
         super().__init__()
diff --git a/course_UvA-DL/12-meta-learning/Meta_Learning.py b/course_UvA-DL/12-meta-learning/Meta_Learning.py
index 5fdd66ad2..bf87dd801 100644
--- a/course_UvA-DL/12-meta-learning/Meta_Learning.py
+++ b/course_UvA-DL/12-meta-learning/Meta_Learning.py
@@ -153,10 +153,10 @@ class ImageDataset(data.Dataset):
     def __init__(self, imgs, targets, img_transform=None):
         """
-        Inputs:
-            imgs - Numpy array of shape [N,32,32,3] containing all images.
-            targets - PyTorch array of shape [N] containing all labels.
-            img_transform - A torchvision transformation that should be applied
+        Args:
+            imgs: Numpy array of shape [N,32,32,3] containing all images.
+            targets: PyTorch array of shape [N] containing all labels.
+            img_transform: A torchvision transformation that should be applied
                 to the images before returning. If none, no transformation
                 is applied.
         """
@@ -257,18 +257,19 @@ def dataset_from_labels(imgs, targets, class_set, **kwargs):
 # %%
 class FewShotBatchSampler:
     def __init__(self, dataset_targets, N_way, K_shot, include_query=False, shuffle=True, shuffle_once=False):
-        """
-        Inputs:
-            dataset_targets - PyTorch tensor of the labels of the data elements.
-            N_way - Number of classes to sample per batch.
-            K_shot - Number of examples to sample per class in the batch.
-            include_query - If True, returns batch of size N_way*K_shot*2, which
+        """FewShot Batch Sampler.
+
+        Args:
+            dataset_targets: PyTorch tensor of the labels of the data elements.
+            N_way: Number of classes to sample per batch.
+            K_shot: Number of examples to sample per class in the batch.
+            include_query: If True, returns batch of size N_way*K_shot*2, which
                 can be split into support and query set. Simplifies the
                 implementation of sampling the same classes but distinct
                 examples for support and query set.
-            shuffle - If True, examples and classes are newly shuffled in each
+            shuffle: If True, examples and classes are newly shuffled in each
                 iteration (for training)
-            shuffle_once - If True, examples and classes are shuffled once in
+            shuffle_once: If True, examples and classes are shuffled once in
                 the beginning, but kept constant across iterations
                 (for validation)
         """
@@ -477,10 +478,11 @@ def get_convnet(output_size):
 # %%
 class ProtoNet(L.LightningModule):
     def __init__(self, proto_dim, lr):
-        """Inputs.
-
-        proto_dim - Dimensionality of prototype feature space
-        lr - Learning rate of Adam optimizer
+        """ProtoNet.
+
+        Args:
+            proto_dim: Dimensionality of prototype feature space
+            lr: Learning rate of Adam optimizer
         """
         super().__init__()
         self.save_hyperparameters()
@@ -628,15 +630,16 @@ def train_model(model_class, train_loader, val_loader, **kwargs):
 # %%
 @torch.no_grad()
 def test_proto_net(model, dataset, data_feats=None, k_shot=4):
-    """Inputs.
-
-    model - Pretrained ProtoNet model
-    dataset - The dataset on which the test should be performed.
-              Should be instance of ImageDataset
-    data_feats - The encoded features of all images in the dataset.
-                 If None, they will be newly calculated, and returned
-                 for later usage.
-    k_shot - Number of examples per class in the support set.
+    """Test proto net.
+
+    Args:
+        model: Pretrained ProtoNet model
+        dataset: The dataset on which the test should be performed.
+            Should be instance of ImageDataset
+        data_feats: The encoded features of all images in the dataset.
+            If None, they will be newly calculated, and returned
+            for later usage.
+        k_shot: Number of examples per class in the support set.
     """
     model = model.to(device)
     model.eval()
@@ -847,13 +850,14 @@ def plot_few_shot(acc_dict, name, color=None, ax=None):
 # %%
 class ProtoMAML(L.LightningModule):
     def __init__(self, proto_dim, lr, lr_inner, lr_output, num_inner_steps):
-        """Inputs.
-
-        proto_dim - Dimensionality of prototype feature space
-        lr - Learning rate of the outer loop Adam optimizer
-        lr_inner - Learning rate of the inner loop SGD optimizer
-        lr_output - Learning rate for the output layer in the inner loop
-        num_inner_steps - Number of inner loop updates to perform
+        """ProtoMAML.
+
+        Args:
+            proto_dim: Dimensionality of prototype feature space
+            lr: Learning rate of the outer loop Adam optimizer
+            lr_inner: Learning rate of the inner loop SGD optimizer
+            lr_output: Learning rate for the output layer in the inner loop
+            num_inner_steps: Number of inner loop updates to perform
         """
         super().__init__()
         self.save_hyperparameters()
@@ -967,17 +971,18 @@ def validation_step(self, batch, batch_idx):
 # %%
 class TaskBatchSampler:
     def __init__(self, dataset_targets, batch_size, N_way, K_shot, include_query=False, shuffle=True):
-        """
-        Inputs:
-            dataset_targets - PyTorch tensor of the labels of the data elements.
-            batch_size - Number of tasks to aggregate in a batch
-            N_way - Number of classes to sample per batch.
-            K_shot - Number of examples to sample per class in the batch.
-            include_query - If True, returns batch of size N_way*K_shot*2, which
+        """Task Batch Sampler.
+
+        Args:
+            dataset_targets: PyTorch tensor of the labels of the data elements.
+            batch_size: Number of tasks to aggregate in a batch
+            N_way: Number of classes to sample per batch.
+            K_shot: Number of examples to sample per class in the batch.
+            include_query: If True, returns batch of size N_way*K_shot*2, which
                 can be split into support and query set. Simplifies the
                 implementation of sampling the same classes but distinct
                 examples for support and query set.
-            shuffle - If True, examples and classes are newly shuffled in each
+            shuffle: If True, examples and classes are newly shuffled in each
                 iteration (for training)
         """
         super().__init__()
diff --git a/lightning_examples/reinforce-learning-DQN/.meta.yml b/lightning_examples/reinforce-learning-DQN/.meta.yml
index abf466a6d..ac693b1b3 100644
--- a/lightning_examples/reinforce-learning-DQN/.meta.yml
+++ b/lightning_examples/reinforce-learning-DQN/.meta.yml
@@ -13,7 +13,7 @@ description: |
   2. Handle unsupervised learning by using an IterableDataset where the dataset itself is constantly updated during training
   3. Each training step has the agent taking an action in the environment and storing the experience in the IterableDataset
 requirements:
-  - gym
+  - gym <0.24
   - pygame
   - pandas
   - seaborn
diff --git a/lightning_examples/reinforce-learning-DQN/dqn.py b/lightning_examples/reinforce-learning-DQN/dqn.py
index 357f2035f..16a431ad6 100644
--- a/lightning_examples/reinforce-learning-DQN/dqn.py
+++ b/lightning_examples/reinforce-learning-DQN/dqn.py
@@ -21,10 +21,9 @@
 # %%
 class DQN(nn.Module):
-    """Simple MLP network."""
-
     def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128):
-        """
+        """Simple MLP network.
+
         Args:
             obs_size: observation/state size of the environment
             n_actions: number of discrete actions available in the environment
@@ -113,10 +112,9 @@ def __iter__(self) -> Iterator[Tuple]:
 # %%
 class Agent:
-    """Base Agent class handeling the interaction with the environment."""
-
     def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None:
-        """
+        """Base Agent class handling the interaction with the environment.
+
         Args:
             env: training environment
             replay_buffer: replay buffer storing experiences
@@ -172,10 +170,13 @@ def play_step(
         Returns:
             reward, done
         """
-
         action = self.get_action(net, epsilon, device)  # do step in the environment
+        # In deprecated versions of gym (as pinned here), env.step() returns four values:
+        # obs, reward, done, info = env.step(action)
+        # The latest versions of gym return an additional `truncated` flag:
+        # obs, reward, terminated, truncated, info = env.step(action)
         new_state, reward, done, _ = self.env.step(action)

         exp = Experience(self.state, action, reward, done, new_state)
@@ -194,8 +195,6 @@

 # %%
 class DQNLightning(LightningModule):
-    """Basic DQN Model."""
-
     def __init__(
         self,
         batch_size: int = 16,
@@ -211,7 +210,8 @@
         episode_length: int = 200,
         warm_start_steps: int = 1000,
     ) -> None:
-        """
+        """Basic DQN Model.
+
         Args:
             batch_size: size of the batches
             lr: learning rate
diff --git a/lightning_examples/text-transformers/text-transformers.py b/lightning_examples/text-transformers/text-transformers.py
index a514a0666..a570f82eb 100644
--- a/lightning_examples/text-transformers/text-transformers.py
+++ b/lightning_examples/text-transformers/text-transformers.py
@@ -219,7 +219,7 @@ def on_validation_epoch_end(self):
         self.outputs.clear()

     def configure_optimizers(self):
-        """Prepare optimizer and schedule (linear warmup and decay)"""
+        """Prepare optimizer and schedule (linear warmup and decay)."""
         model = self.model
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
diff --git a/pyproject.toml b/pyproject.toml
index 8e49741c9..d867478cd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,43 @@
+[metadata]
+license_file = "LICENSE"
+description-file = "README.md"
+
+
+[tool.check-manifest]
+ignore = [
+    "*.yml",
+    ".github",
+    ".github/*"
+]
+
+
+[tool.pytest.ini_options]
+norecursedirs = [
+    ".git",
+    ".github",
+    "dist",
+    "build",
+    "docs",
+]
+addopts = [
+    "--strict-markers",
+    "--doctest-modules",
+    "--color=yes",
+    "--disable-pytest-warnings",
+]
+filterwarnings = [
+    "error::FutureWarning",
+]
+xfail_strict = true
+junit_duration_report = "call"
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "pass",
+]
+
+
 [tool.black]
 # https://github.com/psf/black
 line-length = 120
@@ -8,5 +48,52 @@ skip_glob = []
 profile = "black"
 line_length = 120

-[tool.autopep8]
-ignore = ["E731"]
+
+[tool.ruff]
+line-length = 120
+# Enable Pyflakes `E` and `F` codes by default.
+select = [
+    "E", "W",  # see: https://pypi.org/project/pycodestyle
+    "F",  # see: https://pypi.org/project/pyflakes
+#    "D",  # see: https://pypi.org/project/pydocstyle
+#    "N",  # see: https://pypi.org/project/pep8-naming
+]
+#extend-select = [
+#    "C4",  # see: https://pypi.org/project/flake8-comprehensions
+#    "PT",  # see: https://pypi.org/project/flake8-pytest-style
+#    "RET",  # see: https://pypi.org/project/flake8-return
+#    "SIM",  # see: https://pypi.org/project/flake8-simplify
+#]
+ignore = [
+    "E731",  # Do not assign a lambda expression, use a def
+    # TODO: we shall format all long comments, as they come from text cells
+    "E501",  # Line too long
+]
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".eggs",
+    ".git",
+    ".ruff_cache",
+    "__pypackages__",
+    "_build",
+    "build",
+    "dist",
+    "docs"
+]
+ignore-init-module-imports = true
+
+[tool.ruff.per-file-ignores]
+"setup.py" = ["D100", "SIM115"]
+"__about__.py" = ["D100"]
+"__init__.py" = ["D100"]
+
+[tool.ruff.pydocstyle]
+# Use Google-style docstrings.
+convention = "google"
+
+[tool.ruff.pycodestyle]
+ignore-overlong-task-comments = true
+
+[tool.ruff.mccabe]
+# Unlike Flake8, default to a complexity level of 10.
+max-complexity = 10
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index a7ed4be2b..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,49 +0,0 @@
-[tool:pytest]
-norecursedirs =
-    .git
-    .github
-    dist
-    build
-addopts =
-    --strict
-    --doctest-modules
-    --color=yes
-
-[coverage:report]
-exclude_lines =
-    pragma: no-cover
-    pass
-
-
-[flake8]
-max-line-length = 120
-exclude =
-    *.egg
-    build
-    temp
-select = E,W,F
-doctests = True
-verbose = 2
-# https://pep8.readthedocs.io/en/latest/intro.html#error-codes
-format = pylint
-# see: https://www.flake8rules.com/
-ignore =
-    # line too long
-    E501
-    # whitespace before ':'
-    E203
-
-
-# setup.cfg or tox.ini
-[check-manifest]
-ignore =
-    *.yml
-    .github
-    .github/*
-
-
-[metadata]
-license_file = LICENSE
-description-file = README.md
-# long_description = file:README.md
-# long_description_content_type = text/markdown
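
Note on the gym comment added to dqn.py above: it contrasts the two env.step() signatures, while the example itself relies on the four-value form, which is why this patch pins "gym <0.24" in .meta.yml. For running against newer gym releases, a minimal version-agnostic sketch could look as follows (the helper name step_env is hypothetical and not part of this patch):

    def step_env(env, action):
        # Unpack env.step() results across old and new gym APIs.
        result = env.step(action)
        if len(result) == 5:
            # newer gym: terminated and truncated are reported separately
            obs, reward, terminated, truncated, info = result
            done = terminated or truncated
        else:
            # older gym (as pinned here): a single done flag
            obs, reward, done, info = result
        return obs, reward, done, info

Agent.play_step() could then call step_env(self.env, action) instead of unpacking self.env.step(action) directly.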