diff --git a/169. AdamW Optimizer step.py b/169. AdamW Optimizer step.py new file mode 100644 index 00000000..6b94044b --- /dev/null +++ b/169. AdamW Optimizer step.py @@ -0,0 +1,35 @@ +import numpy as np + +def adamw_update(w, g, m, v, t, lr, beta1, beta2, epsilon, weight_decay): + """ + Perform one AdamW optimizer step. + Args: + w: parameter vector (np.ndarray) + g: gradient vector (np.ndarray) + m: first moment vector (np.ndarray) + v: second moment vector (np.ndarray) + t: integer, current time step + lr: float, learning rate + beta1: float, beta1 parameter + beta2: float, beta2 parameter + epsilon: float, small constant + weight_decay: float, weight decay coefficient + Returns: + w_new, m_new, v_new + """ + + # Apply weight decay (decoupled from gradient) + w = w - lr * weight_decay * w + + # Update biased first and second moments + m = beta1 * m + (1 - beta1) * g + v = beta2 * v + (1 - beta2) * (g ** 2) + + # Compute bias-corrected estimates + m_hat = m / (1 - beta1 ** t) + v_hat = v / (1 - beta2 ** t) + + # Update parameters + w = w - lr * m_hat / (np.sqrt(v_hat) + epsilon) + + return w, m, v diff --git a/build/186.json b/build/186.json new file mode 100644 index 00000000..1054782a --- /dev/null +++ b/build/186.json @@ -0,0 +1,46 @@ +{ + "id": "186", + "title": "Gaussian Process for Regression", + "difficulty": "medium", + "category": "Machine Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/Coder1010ayush", + "name": "Ayush" + } + ], + "description": "## Problem\n\nProblem Statement: The task is to implement a GaussianProcessRegression class, which is a Gaussian process model for regression problems.", + "learn_section": "# **Gaussian Processes (GP): From-Scratch Regression Example**\n\n## **1. What’s a Gaussian Process?**\n\nA **Gaussian Process** defines a distribution over functions $f(\\cdot)$.\nFor any finite set of inputs $X = \\{x_i\\}_{i=1}^n$, the function values $f(X)$ follow a multivariate normal:\n\n$$\nf(X) \\sim \\mathcal{N}\\big(0,\\; K(X,X)\\big)\n$$\n\nwhere $K$ is a **kernel** (covariance) function encoding similarity between inputs.\nWith noisy targets $y = f(X) + \\varepsilon,\\ \\varepsilon \\sim \\mathcal{N}(0,\\sigma_n^2 I)$,\nGP regression yields a closed-form posterior predictive mean and variance at new points $X_*$.\n\n---\n\n## **2. The Implementation at a Glance**\n\nThe provided code builds a minimal yet complete GP regression stack:\n\n* **Kernels implemented**\n\n * Radial Basis Function (RBF / Squared Exponential)\n * Matérn ($\\nu = 0.5, 1.5, 2.5$, or general $\\nu$)\n * Periodic\n * Linear\n * Rational Quadratic\n\n* **Core GP classes**\n\n * `_GaussianProcessBase`: kernel selection & covariance matrix computation\n * `GaussianProcessRegression`:\n\n * `fit`: builds $K$, performs a **Cholesky decomposition**, solves for $\\alpha$\n * `predict`: returns posterior mean & variance\n * `log_marginal_likelihood`: computes GP evidence\n * `optimize_hyperparameters`: basic optimizer (for RBF hyperparams)\n\n---\n\n## **3. 
Kernel Cheat-Sheet**\n\nLet $x, x' \\in \\mathbb{R}^d$ and $r = \\lVert x - x' \\rVert$.\n\n* **RBF (SE):**\n $$\n k_{\\text{RBF}}(x,x') = \\sigma^2 \\exp\\!\\left(-\\tfrac{1}{2}\\tfrac{r^2}{\\ell^2}\\right)\n $$\n\n* **Matérn ($\\nu = 1.5$):**\n $$\n k(x,x') = \\Big(1 + \\tfrac{\\sqrt{3}\\,r}{\\ell}\\Big)\\exp\\!\\Big(-\\tfrac{\\sqrt{3}\\,r}{\\ell}\\Big)\n $$\n\n* **Periodic:**\n $$\n k(x,x') = \\sigma^2 \\exp\\!\\left(-\\tfrac{2}{\\ell^2}\\sin^2\\!\\Big(\\tfrac{\\pi r}{p}\\Big)\\right)\n $$\n\n* **Linear:**\n $$\n k(x,x') = \\sigma_b^2 + \\sigma_v^2\\,x^\\top x'\n $$\n\n* **Rational Quadratic:**\n $$\n k(x,x') = \\sigma^2\\Big(1 + \\tfrac{r^2}{2\\alpha \\ell^2}\\Big)^{-\\alpha}\n $$\n\n---\n\n## **4. GP Regression Mechanics**\n\n### **Training**\n\n1. Build covariance:\n $$\n K = K(X,X) + \\sigma_n^2 I\n $$\n\n2. Cholesky factorization:\n $$\n K = L L^\\top\n $$\n\n3. Solve for $\\alpha$:\n $$\n L L^\\top \\alpha = y\n $$\n\n### **Prediction**\n\nAt new inputs $X_*$:\n\n* $K_* = K(X, X_*)$, $K_{**} = K(X_*, X_*)$\n\n* **Mean:**\n $$\n \\mu_* = K_*^\\top \\alpha\n $$\n\n* **Covariance:**\n $$\n \\Sigma_* = K_{**} - V^\\top V, \\quad V = L^{-1} K_*\n $$\n\n### **Model Selection**\n\n* **Log Marginal Likelihood (LML):**\n $$\n \\log p(y \\mid X) = -\\tfrac{1}{2} y^\\top \\alpha - \\sum\\nolimits_i \\log L_{ii} - \\tfrac{n}{2}\\log(2\\pi)\n $$\n\n---\n\n## **5. Worked Example (Linear Kernel)**\n\n```python\nimport numpy as np\ngp = GaussianProcessRegression(kernel='linear',\n kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0},\n noise=1e-8)\n\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9]) # y = 2x + 1\ngp.fit(X_train, y_train)\n\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\") # -> 7.0000\n```\n\n---\n\n## **6. When to Use GP Regression**\n\n* **Small-to-medium datasets** where uncertainty estimates are valuable\n* Cases requiring **predictive intervals** (not just point predictions)\n* **Nonparametric modeling** with kernel priors\n* Automatic hyperparameter tuning via **marginal likelihood**\n\n---\n\n## **7. 
Practical Tips**\n\n* Always add **jitter** $10^{-6}$ to the diagonal for numerical stability\n* **Standardize inputs/outputs** before training\n* Be aware: Exact GP has complexity **$\\mathcal{O}(n^3)$** in time and **$\\mathcal{O}(n^2)$** in memory\n* Choose kernels to match problem structure:\n\n * **RBF:** smooth functions\n * **Matérn:** rougher functions\n * **Periodic:** seasonal/cyclical data\n * **Linear:** global linear trends", + "starter_code": "import math # ---------------------------------------- utf-8 encoding ---------------------------------\n\n# This file contains Gaussian Process implementation.\nimport numpy as np\nimport math\n\n\ndef matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5):\n pass\n\n\ndef rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0):\n pass\n\n\ndef periodic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0\n):\n pass\n\n\ndef linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0):\n pass\n\n\ndef rational_quadratic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0\n):\n pass\n\n\n# --- BASE CLASS -------------------------------------------------------------\n\n\nclass _GaussianProcessBase:\n def __init__(self, kernel=\"rbf\", noise=1e-5, kernel_params=None):\n pass\n\n def _select_kernel(self, x1, x2):\n \"\"\"Selects and computes the kernel value for two single data points.\"\"\"\n pass\n\n def _compute_covariance(self, X1, X2):\n \"\"\"\n Computes the covariance matrix between two sets of points.\n This method fixes the vectorization bug from the original code.\n \"\"\"\n pass\n\n\n# --- REGRESSION MODEL -------------------------------------------------------\nclass GaussianProcessRegression(_GaussianProcessBase):\n def fit(self, X, y):\n pass\n\n def predict(self, X_test, return_std=False):\n pass\n\n def log_marginal_likelihood(self):\n pass\n\n def optimize_hyperparameters(self):\n pass", + "solution": "# ---------------------------------------- utf-8 encoding ---------------------------------\n# This file contains Gaussian Process implementation.\nimport numpy as np\nimport math\nfrom scipy.spatial.distance import euclidean\nfrom scipy.special import kv as bessel_kv\nfrom scipy.special import gamma\nfrom scipy.linalg import cholesky, solve_triangular\nfrom scipy.optimize import minimize\nfrom scipy.special import expit, softmax\n\n\n# --- KERNEL FUNCTIONS --------------------------------------------------------\ndef matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5):\n d = euclidean(x, x_prime)\n if d == 0:\n return 1.0 # Covariance with self is 1 before scaling\n if nu == 0.5:\n return np.exp(-d / length_scale)\n elif nu == 1.5:\n return (1 + np.sqrt(3) * d / length_scale) * np.exp(\n -np.sqrt(3) * d / length_scale\n )\n elif nu == 2.5:\n return (\n 1 + np.sqrt(5) * d / length_scale + 5 * d**2 / (3 * length_scale**2)\n ) * np.exp(-np.sqrt(5) * d / length_scale)\n else:\n factor = (2 ** (1 - nu)) / gamma(nu)\n scaled_d = np.sqrt(2 * nu) * d / length_scale\n return factor * (scaled_d**nu) * bessel_kv(nu, scaled_d)\n\n\ndef rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0):\n # This is a squared exponential kernel\n\n # Calculate the squared euclidean distance\n sq_norm = np.linalg.norm(x - x_prime) ** 2\n\n # Correctly implement the formula\n return sigma**2 * np.exp(-sq_norm / (2 * length_scale**2))\n\n\ndef periodic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, 
length_scale=1.0, period=1.0\n):\n return sigma**2 * np.exp(\n -2 * np.sin(np.pi * np.linalg.norm(x - x_prime) / period) ** 2 / length_scale**2\n )\n\n\ndef linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0):\n return sigma_b**2 + sigma_v**2 * np.dot(x, x_prime)\n\n\ndef rational_quadratic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0\n):\n return sigma**2 * (\n 1 + np.linalg.norm(x - x_prime) ** 2 / (2 * alpha * length_scale**2)\n ) ** (-alpha)\n\n\n# --- BASE CLASS -------------------------------------------------------------\n\n\nclass _GaussianProcessBase:\n def __init__(self, kernel=\"rbf\", noise=1e-5, kernel_params=None):\n self.kernel_name = kernel\n self.noise = noise\n self.kernel_params = kernel_params if kernel_params else {}\n self.X_train = None\n self.y_train = None\n self.K = None\n\n def _select_kernel(self, x1, x2):\n \"\"\"Selects and computes the kernel value for two single data points.\"\"\"\n if self.kernel_name == \"rbf\":\n return rbf_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"matern\":\n return matern_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"periodic\":\n return periodic_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"linear\":\n return linear_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"rational_quadratic\":\n return rational_quadratic_kernel(x1, x2, **self.kernel_params)\n else:\n raise ValueError(\n \"Unsupported kernel. Choose from ['rbf', 'matern', 'periodic', 'linear', 'rational_quadratic'].\"\n )\n\n def _compute_covariance(self, X1, X2):\n \"\"\"\n Computes the covariance matrix between two sets of points.\n This method fixes the vectorization bug from the original code.\n \"\"\"\n # Ensuring X1 and X2 are 2D arrays\n X1 = np.atleast_2d(X1)\n X2 = np.atleast_2d(X2)\n\n n1, _ = X1.shape\n n2, _ = X2.shape\n K = np.zeros((n1, n2))\n for i in range(n1):\n for j in range(n2):\n K[i, j] = self._select_kernel(X1[i], X2[j])\n return K\n\n\n# --- REGRESSION MODEL -------------------------------------------------------\nclass GaussianProcessRegression(_GaussianProcessBase):\n def fit(self, X, y):\n self.X_train = np.asarray(X)\n self.y_train = np.asarray(y)\n self.K = self._compute_covariance(\n self.X_train, self.X_train\n ) + self.noise * np.eye(len(self.X_train))\n\n # Compute Cholesky decomposition for stable inversion\n self.L = cholesky(self.K, lower=True)\n # alpha = K_inv * y\n self.alpha = solve_triangular(\n self.L.T, solve_triangular(self.L, self.y_train, lower=True)\n )\n\n def predict(self, X_test, return_std=False):\n X_test = np.atleast_2d(X_test)\n K_s = self._compute_covariance(self.X_train, X_test)\n K_ss = self._compute_covariance(X_test, X_test)\n\n # Compute predictive mean\n mu = K_s.T @ self.alpha\n\n # Compute predictive variance\n v = solve_triangular(self.L, K_s, lower=True)\n cov = K_ss - v.T @ v\n\n if return_std:\n return mu, np.sqrt(np.diag(cov))\n return mu\n\n def log_marginal_likelihood(self):\n return (\n -0.5 * (self.y_train.T @ self.alpha)\n - np.sum(np.log(np.diag(self.L)))\n - len(self.X_train) / 2 * np.log(2 * np.pi)\n )\n\n def optimize_hyperparameters(self):\n # NOTE: This is a simplified optimizer for 'rbf' kernel's params.\n def objective(params):\n self.kernel_params = {\n \"length_scale\": np.exp(params[0]),\n \"sigma\": np.exp(params[1]),\n }\n self.fit(self.X_train, self.y_train)\n return -self.log_marginal_likelihood()\n\n init_params = np.log(\n [\n 
self.kernel_params.get(\"length_scale\", 1.0),\n self.kernel_params.get(\"sigma\", 1.0),\n ]\n )\n res = minimize(\n objective, init_params, method=\"L-BFGS-B\", bounds=[(-5, 5), (-5, 5)]\n )\n\n self.kernel_params = {\n \"length_scale\": np.exp(res.x[0]),\n \"sigma\": np.exp(res.x[1]),\n }\n # Re-fit with optimal hyperparameters\n self.fit(self.X_train, self.y_train)\n\n\nif __name__ == \"__main__\":\n gp = GaussianProcessRegression(\n kernel=\"linear\", kernel_params={\"sigma_b\": 0.0, \"sigma_v\": 1.0}, noise=1e-8\n )\n X_train = np.array([[1], [2], [4]])\n y_train = np.array([3, 5, 9])\n gp.fit(X_train, y_train)\n X_test = np.array([[3.0]])\n mu = gp.predict(X_test)", + "example": { + "input": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "output": "7.0000", + "reasoning": "A Gaussian Process with a linear kernel is trained on perfectly linear data that follows the function y = 2x + 1. When asked to predict the value at x=3, the model perfectly interpolates the linear function it has learned, resulting in a prediction of 2*3 + 1 = 7. The near-zero noise ensures the prediction is exact." + }, + "test_cases": [ + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "0.2814" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=0.2814, std=0.7734" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0]])\ny_train = np.array([1.0, 3.0, 1.5])\ngp.fit(X_train, y_train)\nX_test = np.array([[2.5]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=3.0000, std=0.0001" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.1, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "7.0000" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.5}, noise=1e-8)\nX_train = np.array([[1, 2], [3, 4], [5, 1]])\ny_train = np.sum(X_train, axis=1)\ngp.fit(X_train, y_train)\nX_test = np.array([[2, 3]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "5.5553" + } + ] +} \ No newline at end of file diff --git a/build/188.json b/build/188.json new file mode 100644 index 00000000..716c0d0a --- /dev/null +++ b/build/188.json @@ -0,0 +1,78 @@ +{ + "id": "188", + "title": "Log-Softmax & Cross-Entropy", + "difficulty": "medium", + "category": "Deep Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://www.linkedin.com/in/preetham-a-k-18b97931b/", + "name": "Preetham AK" + } + ], + "pytorch_difficulty": "medium", + "description": "# Log-Softmax and Cross-Entropy Loss Implementation\n\nImplement a numerically stable log-softmax function and use it to compute the cross-entropy loss from scratch in PyTorch.\n\nYou are **not allowed** to use `torch.nn.functional.log_softmax` or `torch.nn.functional.cross_entropy`. You may only use basic PyTorch tensor operations.\n\nYour function should support:\n- 1D or 2D input tensors\n- Batch computation\n- Numerical stability using the log-sum-exp trick", + "learn_section": "# Learning Goals\n\n1. Understand how **softmax** converts logits to probabilities.\n2. Learn why **log-softmax** is preferred for numerical stability.\n3. Implement **cross-entropy loss** manually.\n4. Apply batch computations in PyTorch.\n5. Understand the connection between logits, probabilities, and loss functions.", + "starter_code": "import torch\nfrom typing import List\n\ndef log_softmax_stable(x: torch.Tensor) -> torch.Tensor:\n \"\"\"\n Compute log-softmax of x in a numerically stable way.\n Args:\n x: torch.Tensor of shape (N,) or (N, C)\n Returns:\n log-softmax tensor of same shape\n \"\"\"\n # Your code here\n pass\n\ndef cross_entropy_loss(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:\n \"\"\"\n Compute cross-entropy loss given true labels and predicted logits.\n Args:\n y_true: torch.Tensor of shape (N,)\n y_pred: torch.Tensor of shape (N, C)\n Returns:\n scalar mean loss\n \"\"\"\n # Your code here\n pass", + "solution": "import torch\n\ndef log_softmax_stable(x: torch.Tensor) -> torch.Tensor:\n x_max = x.max(dim=-1, keepdim=True).values\n log_probs = x - x_max - torch.log(torch.exp(x - x_max).sum(dim=-1, keepdim=True))\n return log_probs\n\ndef cross_entropy_loss(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:\n log_probs = log_softmax_stable(y_pred)\n loss = -log_probs[range(len(y_true)), y_true].mean()\n return loss", + "example": { + "input": { + "y_true": [ + 0, + 1 + ], + "y_pred": [ + [ + 2.0, + 1.0, + 0.1 + ], + [ + 0.5, + 2.5, + 0.3 + ] + ] + }, + "output": { + "log_softmax": [ + [ + -0.4170, + -1.4170, + -2.3170 + ], + [ + -2.2200, + -0.2200, + -2.4200 + ] + ], + "cross_entropy_loss": 0.3185 + }, + "reasoning": "Compute log-softmax using the numerical stability trick (subtract max of each row before exponentiating), then compute the mean negative log probability of the correct classes to get the cross-entropy loss." + }, + "test_cases": [ + { + "test": "import torch\nfrom starter_code import log_softmax_stable, cross_entropy_loss\n\ny_true = torch.tensor([0,1])\ny_pred = torch.tensor([[2.0,1.0,0.1],[0.5,2.5,0.3]])\n\n# Test log-softmax\nexpected_log_softmax = torch.tensor([[-0.4170,-1.4170,-2.3170],[-2.2200,-0.2200,-2.4200]])\nassert torch.allclose(log_softmax_stable(y_pred), expected_log_softmax, atol=1e-3)\n\n# Test cross-entropy loss\nexpected_loss = 0.3185\nassert abs(cross_entropy_loss(y_true, y_pred).item() - expected_loss) < 1e-3", + "expected_output": "pass" + } + ], + "tinygrad_starter_code": "def your_function(...):\n pass", + "tinygrad_solution": "def your_function(...):\n ...", + "tinygrad_test_cases": [ + { + "test": "print(your_function(...))", + "expected_output": "..." 
+ } + ], + "pytorch_starter_code": "def your_function(...):\n pass", + "pytorch_solution": "def your_function(...):\n ...", + "pytorch_test_cases": [ + { + "test": "print(your_function(...))", + "expected_output": "..." + } + ] +} \ No newline at end of file diff --git a/questions/188_logsoftmax_crossentropy/description.md b/questions/188_logsoftmax_crossentropy/description.md new file mode 100644 index 00000000..f1f2cdfd --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/description.md @@ -0,0 +1,10 @@ +# Log-Softmax and Cross-Entropy Loss Implementation + +Implement a numerically stable log-softmax function and use it to compute the cross-entropy loss from scratch in PyTorch. + +You are **not allowed** to use `torch.nn.functional.log_softmax` or `torch.nn.functional.cross_entropy`. You may only use basic PyTorch tensor operations. + +Your function should support: +- 1D or 2D input tensors +- Batch computation +- Numerical stability using the log-sum-exp trick diff --git a/questions/188_logsoftmax_crossentropy/example.json b/questions/188_logsoftmax_crossentropy/example.json new file mode 100644 index 00000000..030eb326 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/example.json @@ -0,0 +1,13 @@ +{ + "input": { + "y_true": [0, 1], + "y_pred": [[2.0, 1.0, 0.1], + [0.5, 2.5, 0.3]] + }, + "output": { + "log_softmax": [[-0.5514, -1.5514, -2.4514], + [-2.2808, -0.2808, -2.4808]], + "cross_entropy_loss": 0.4161 + }, + "reasoning": "Compute log-softmax using the numerical stability trick (subtract max of each row before exponentiating), then compute the mean negative log probability of the correct classes to get the cross-entropy loss." +} diff --git a/questions/188_logsoftmax_crossentropy/learn.md b/questions/188_logsoftmax_crossentropy/learn.md new file mode 100644 index 00000000..448a709c --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/learn.md @@ -0,0 +1,7 @@ +# Learning Goals + +1. Understand how **softmax** converts logits to probabilities. +2. Learn why **log-softmax** is preferred for numerical stability. +3. Implement **cross-entropy loss** manually. +4. Apply batch computations in PyTorch. +5. Understand the connection between logits, probabilities, and loss functions. diff --git a/questions/188_logsoftmax_crossentropy/meta.json b/questions/188_logsoftmax_crossentropy/meta.json new file mode 100644 index 00000000..ea79c99c --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/meta.json @@ -0,0 +1,16 @@ +{ + "id": "188", + "title": "Log-Softmax & Cross-Entropy", + "difficulty": "medium", + "category": "Deep Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://www.linkedin.com/in/preetham-a-k-18b97931b/", + "name": "Preetham AK" + } + ], + "pytorch_difficulty": "medium" +} diff --git a/questions/188_logsoftmax_crossentropy/pytorch/solution.py b/questions/188_logsoftmax_crossentropy/pytorch/solution.py new file mode 100644 index 00000000..9b74bcbd --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/pytorch/solution.py @@ -0,0 +1,2 @@ +def your_function(...): + ... 
diff --git a/questions/188_logsoftmax_crossentropy/pytorch/starter_code.py b/questions/188_logsoftmax_crossentropy/pytorch/starter_code.py new file mode 100644 index 00000000..d3e5beb5 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/pytorch/starter_code.py @@ -0,0 +1,2 @@ +def your_function(...): + pass diff --git a/questions/188_logsoftmax_crossentropy/pytorch/tests.json b/questions/188_logsoftmax_crossentropy/pytorch/tests.json new file mode 100644 index 00000000..e4e4b180 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/pytorch/tests.json @@ -0,0 +1,6 @@ +[ + { + "test": "print(your_function(...))", + "expected_output": "..." + } +] diff --git a/questions/188_logsoftmax_crossentropy/solution.py b/questions/188_logsoftmax_crossentropy/solution.py new file mode 100644 index 00000000..b7e5a2c5 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/solution.py @@ -0,0 +1,11 @@ +import torch + +def log_softmax_stable(x: torch.Tensor) -> torch.Tensor: + x_max = x.max(dim=-1, keepdim=True).values + log_probs = x - x_max - torch.log(torch.exp(x - x_max).sum(dim=-1, keepdim=True)) + return log_probs + +def cross_entropy_loss(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor: + log_probs = log_softmax_stable(y_pred) + loss = -log_probs[range(len(y_true)), y_true].mean() + return loss \ No newline at end of file diff --git a/questions/188_logsoftmax_crossentropy/starter_code.py b/questions/188_logsoftmax_crossentropy/starter_code.py new file mode 100644 index 00000000..c263d66d --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/starter_code.py @@ -0,0 +1,25 @@ +import torch +from typing import List + +def log_softmax_stable(x: torch.Tensor) -> torch.Tensor: + """ + Compute log-softmax of x in a numerically stable way. + Args: + x: torch.Tensor of shape (N,) or (N, C) + Returns: + log-softmax tensor of same shape + """ + # Your code here + pass + +def cross_entropy_loss(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor: + """ + Compute cross-entropy loss given true labels and predicted logits. + Args: + y_true: torch.Tensor of shape (N,) + y_pred: torch.Tensor of shape (N, C) + Returns: + scalar mean loss + """ + # Your code here + pass diff --git a/questions/188_logsoftmax_crossentropy/tests.json b/questions/188_logsoftmax_crossentropy/tests.json new file mode 100644 index 00000000..b941f461 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/tests.json @@ -0,0 +1,6 @@ +[ + { + "test": "import torch\nfrom starter_code import log_softmax_stable, cross_entropy_loss\n\ny_true = torch.tensor([0,1])\ny_pred = torch.tensor([[2.0,1.0,0.1],[0.5,2.5,0.3]])\n\n# Test log-softmax\nexpected_log_softmax = torch.tensor([[-0.4170,-1.4170,-2.3170],[-2.2200,-0.2200,-2.4200]])\nassert torch.allclose(log_softmax_stable(y_pred), expected_log_softmax, atol=1e-3)\n\n# Test cross-entropy loss\nexpected_loss = 0.3185\nassert abs(cross_entropy_loss(y_true, y_pred).item() - expected_loss) < 1e-3", + "expected_output": "pass" + } +] diff --git a/questions/188_logsoftmax_crossentropy/tinygrad/solution.py b/questions/188_logsoftmax_crossentropy/tinygrad/solution.py new file mode 100644 index 00000000..9b74bcbd --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/tinygrad/solution.py @@ -0,0 +1,2 @@ +def your_function(...): + ... 
diff --git a/questions/188_logsoftmax_crossentropy/tinygrad/starter_code.py b/questions/188_logsoftmax_crossentropy/tinygrad/starter_code.py new file mode 100644 index 00000000..d3e5beb5 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/tinygrad/starter_code.py @@ -0,0 +1,2 @@ +def your_function(...): + pass diff --git a/questions/188_logsoftmax_crossentropy/tinygrad/tests.json b/questions/188_logsoftmax_crossentropy/tinygrad/tests.json new file mode 100644 index 00000000..e4e4b180 --- /dev/null +++ b/questions/188_logsoftmax_crossentropy/tinygrad/tests.json @@ -0,0 +1,6 @@ +[ + { + "test": "print(your_function(...))", + "expected_output": "..." + } +]
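A minimal usage sketch for `adamw_update` from `169. AdamW Optimizer step.py`: the toy parameter and gradient values below are illustrative only (not part of the diff), and the function is assumed to be defined in the same script or imported into scope.

```python
import numpy as np

# Illustrative single AdamW step starting from zero moment state.
w = np.array([1.0, -0.5])      # parameters (toy values)
g = np.array([0.1, -0.2])      # gradient at w (toy values)
m = np.zeros_like(w)           # first-moment state
v = np.zeros_like(w)           # second-moment state

w, m, v = adamw_update(w, g, m, v, t=1, lr=1e-2, beta1=0.9, beta2=0.999,
                       epsilon=1e-8, weight_decay=0.01)

# On the first step m_hat / sqrt(v_hat) is approximately sign(g), so after the
# decoupled weight-decay shrink each parameter moves by roughly -lr * sign(g).
print(w, m, v)
```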
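A minimal sketch that reproduces the question 188 reference values from the example logits, assuming standard PyTorch; `torch.nn.functional` appears here only to cross-check the manual computation, while the graded solution still avoids it.

```python
import torch
import torch.nn.functional as F

y_true = torch.tensor([0, 1])
y_pred = torch.tensor([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]])

# Log-sum-exp trick: subtract each row's max before exponentiating.
x_max = y_pred.max(dim=-1, keepdim=True).values
log_probs = y_pred - x_max - torch.log(torch.exp(y_pred - x_max).sum(dim=-1, keepdim=True))
loss = -log_probs[range(len(y_true)), y_true].mean()

print(log_probs)    # approx. [[-0.4170, -1.4170, -2.3170], [-2.2200, -0.2200, -2.4200]]
print(loss.item())  # approx. 0.3185

# Cross-check against PyTorch's built-ins.
print(torch.allclose(log_probs, F.log_softmax(y_pred, dim=-1)))  # True
print(torch.allclose(loss, F.cross_entropy(y_pred, y_true)))     # True
```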