added new question on mlops

Open-Deep-ML · moe18 · Oct 23, 2025 · Sep 15, 2025 · Oct 7, 2025 · Oct 7, 2025
commit 784bf192bb1a659c0b5ec023b721f0ee40059c64
diff --git a/build/184.json b/build/184.json
@@ -12,8 +12,6 @@
       "name": "Jeet Mukherjee"
     }
   ],
-  "tinygrad_difficulty": "medium",
-  "pytorch_difficulty": "medium",
   "description": "## Problem\n\nImplement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.\n\nGiven a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:\n\n1. Extracts rows from the raw CSV text.\n2. Transforms data by:\n\t- Filtering only rows where `event_type == \"purchase\"`.\n\t- Converting `value` to float and dropping invalid rows.\n\t- Aggregating total purchase `value` per `user_id`.\n3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.\n\nAssume small inputs (no external libs), handle extra whitespace, and ignore blank lines.",
   "learn_section": "## Solution Explanation\n\nThis task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.\n\n### ETL breakdown\n- Extract: parse raw CSV text, ignore blanks, and split into header and rows.\n- Transform:\n\t- Filter only relevant records (event_type == \"purchase\").\n\t- Cast `value` to float; discard invalid rows to maintain data quality.\n\t- Aggregate total purchase value per user to create compact features.\n- Load: return a deterministic, sorted list of `(user_id, total_value)`.\n\n### Why this design?\n- Input sanitation prevents runtime errors and poor-quality features.\n- Aggregation compresses event-level logs into user-level features commonly used in models.\n- Sorting produces stable, testable outputs.\n\n### Complexity\n- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U).\n\n### Extensions\n- Add schema validation and logging.\n- Write outputs to files or databases.\n- Schedule ETL runs and add monitoring for drift and freshness.",
   "starter_code": "# Implement your function below.\n\ndef run_etl(csv_text: str) -> list[tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline over CSV text with header user_id,event_type,value.\n\n\tReturns a sorted list of (user_id, total_value) for event_type == \"purchase\".\n\t\"\"\"\n\t# TODO: implement extract, transform, and load steps\n\traise NotImplementedError",
@@ -36,37 +34,5 @@
       "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))",
       "expected_output": "[('u1', 3.0)]"
     }
-  ],
-  "tinygrad_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError",
-  "tinygrad_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])",
-  "tinygrad_test_cases": [
-    {
-      "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))",
-      "expected_output": "[('u1', 15.0), ('u2', 3.5)]"
-    },
-    {
-      "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))",
-      "expected_output": "[]"
-    },
-    {
-      "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))",
-      "expected_output": "[('u1', 3.0)]"
-    }
-  ],
-  "pytorch_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError",
-  "pytorch_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])",
-  "pytorch_test_cases": [
-    {
-      "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))",
-      "expected_output": "[('u1', 15.0), ('u2', 3.5)]"
-    },
-    {
-      "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))",
-      "expected_output": "[]"
-    },
-    {
-      "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))",
-      "expected_output": "[('u1', 3.0)]"
-    }
   ]
 }
diff --git a/build/185.json b/build/185.json
@@ -0,0 +1,38 @@
+{
+  "id": "185",
+  "title": "Basic Data Drift Check: Mean and Variance Thresholds",
+  "difficulty": "easy",
+  "category": "MLOps",
+  "video": "",
+  "likes": "0",
+  "dislikes": "0",
+  "contributor": [
+    {
+      "profile_link": "https://github.com/Jeet009",
+      "name": "Jeet Mukherjee"
+    }
+  ],
+  "description": "## Problem\n\nImplement a basic data drift check comparing two numeric datasets (reference vs. current).\n\nWrite a function `check_drift(ref, cur, mean_threshold, var_threshold)` that:\n\n- Accepts two lists of numbers `ref` and `cur`.\n- Computes the absolute difference in means and variances.\n- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold:\n\t- `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold`\n\t- `var_drift  = abs(var(ref)  - var(cur))  > var_threshold`\n\nAssume population variance (divide by N). Handle empty inputs by returning `(False, False)`.",
+  "learn_section": "## Solution Explanation\n\nWe compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds.\n\n### Definitions\n- Mean: \\( \\mu = \\frac{1}{N}\\sum_i x_i \\)\n- Population variance: \\( \\sigma^2 = \\frac{1}{N}\\sum_i (x_i - \\mu)^2 \\)\n\n### Drift rules\n- Mean drift if \\(|\\mu_{ref} - \\mu_{cur}| > \\tau_{mean}\\)\n- Variance drift if \\(|\\sigma^2_{ref} - \\sigma^2_{cur}| > \\tau_{var}\\)\n\n### Edge cases\n- If either sample is empty, return `(False, False)` to avoid false alarms.\n- Population vs. sample variance: we use population here to match many monitoring setups. Either is fine if used consistently.\n\n### Complexity\n- O(N + M) to compute stats; O(1) extra space.",
+  "starter_code": "from typing import List, Tuple\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\t\"\"\"Return (mean_drift, var_drift) comparing ref vs cur with given thresholds.\n\n\tUse population variance.\n\t\"\"\"\n\t# TODO: handle empty inputs; compute means and variances; compare with thresholds\n\traise NotImplementedError",
+  "solution": "from typing import List, Tuple\n\n\ndef _mean(xs: List[float]) -> float:\n\treturn sum(xs) / len(xs) if xs else 0.0\n\n\ndef _var(xs: List[float]) -> float:\n\tif not xs:\n\t\treturn 0.0\n\tm = _mean(xs)\n\treturn sum((x - m) * (x - m) for x in xs) / len(xs)\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\tif not ref or not cur:\n\t\treturn (False, False)\n\tmean_ref = _mean(ref)\n\tmean_cur = _mean(cur)\n\tvar_ref = _var(ref)\n\tvar_cur = _var(cur)\n\tmean_drift = abs(mean_ref - mean_cur) > mean_threshold\n\tvar_drift = abs(var_ref - var_cur) > var_threshold\n\treturn (mean_drift, var_drift)",
+  "example": {
+    "input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)",
+    "output": "(True, True)",
+    "reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1."
+  },
+  "test_cases": [
+    {
+      "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))",
+      "expected_output": "(True, True)"
+    },
+    {
+      "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))",
+      "expected_output": "(False, False)"
+    },
+    {
+      "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))",
+      "expected_output": "(False, False)"
+    }
+  ]
+}
diff --git a/questions/185_data-drift-basic/description.md b/questions/185_data-drift-basic/description.md
@@ -0,0 +1,13 @@
+## Problem
+
+Implement a basic data drift check comparing two numeric datasets (reference vs. current).
+
+Write a function `check_drift(ref, cur, mean_threshold, var_threshold)` that:
+
+- Accepts two lists of numbers `ref` and `cur`.
+- Computes the absolute difference in means and variances.
+- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold:
+	- `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold`
+	- `var_drift  = abs(var(ref)  - var(cur))  > var_threshold`
+
+Assume population variance (divide by N). If either input list is empty, return `(False, False)`.
diff --git a/questions/185_data-drift-basic/example.json b/questions/185_data-drift-basic/example.json
@@ -0,0 +1,5 @@
+{
+  "input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)",
+  "output": "(True, True)",
+  "reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1."
+}
diff --git a/questions/185_data-drift-basic/learn.md b/questions/185_data-drift-basic/learn.md
@@ -0,0 +1,18 @@
+## Solution Explanation
+
+We compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds.
+
+### Definitions
+- Mean: \( \mu = \frac{1}{N}\sum_i x_i \)
+- Population variance: \( \sigma^2 = \frac{1}{N}\sum_i (x_i - \mu)^2 \)
+
+### Drift rules
+- Mean drift if \(|\mu_{ref} - \mu_{cur}| > \tau_{mean}\)
+- Variance drift if \(|\sigma^2_{ref} - \sigma^2_{cur}| > \tau_{var}\)
+
+### Edge cases
+- If either sample is empty, return `(False, False)` to avoid false alarms.
+- Population vs. sample variance: we use population here to match many monitoring setups. Either is fine if used consistently.
+
+### Complexity
+- O(N + M) time, where N and M are the lengths of `ref` and `cur`; O(1) extra space.
diff --git a/questions/185_data-drift-basic/meta.json b/questions/185_data-drift-basic/meta.json
@@ -0,0 +1,12 @@
+{
+  "id": "185",
+  "title": "Basic Data Drift Check: Mean and Variance Thresholds",
+  "difficulty": "easy",
+  "category": "MLOps",
+  "video": "",
+  "likes": "0",
+  "dislikes": "0",
+  "contributor": [
+    { "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" }
+  ]
+}
diff --git a/questions/185_data-drift-basic/solution.py b/questions/185_data-drift-basic/solution.py
@@ -0,0 +1,24 @@
+from typing import List, Tuple
+
+
+def _mean(xs: List[float]) -> float:  # arithmetic mean of xs
+	return sum(xs) / len(xs) if xs else 0.0  # empty list -> 0.0 instead of dividing by zero
+
+
+def _var(xs: List[float]) -> float:  # population variance of xs
+	if not xs:
+		return 0.0  # empty list -> 0.0, same fallback as _mean
+	m = _mean(xs)
+	return sum((x - m) * (x - m) for x in xs) / len(xs)  # divide by N (population), not N - 1
+
+
+def check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:  # returns (mean_drift, var_drift)
+	if not ref or not cur:
+		return (False, False)  # either sample empty: report no drift
+	mean_ref = _mean(ref)
+	mean_cur = _mean(cur)
+	var_ref = _var(ref)
+	var_cur = _var(cur)
+	mean_drift = abs(mean_ref - mean_cur) > mean_threshold  # strict '>': a gap equal to the threshold is not drift
+	var_drift = abs(var_ref - var_cur) > var_threshold  # same strict comparison, on population variances
+	return (mean_drift, var_drift)
diff --git a/questions/185_data-drift-basic/starter_code.py b/questions/185_data-drift-basic/starter_code.py
@@ -0,0 +1,10 @@
+from typing import List, Tuple
+
+
+def check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:
+	"""Return (mean_drift, var_drift) comparing ref vs cur with given thresholds.
+
+	Use population variance.
+	"""
+	# TODO: return (False, False) if either input is empty; compute means and population variances; compare with thresholds using strict '>'
+	raise NotImplementedError
diff --git a/questions/185_data-drift-basic/tests.json b/questions/185_data-drift-basic/tests.json
@@ -0,0 +1,5 @@
+[
+  { "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))", "expected_output": "(True, True)" },
+  { "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))", "expected_output": "(False, False)" },
+  { "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))", "expected_output": "(False, False)" }
+]