Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added new question on mlops
  • Loading branch information
Jeet009 committed Oct 7, 2025
commit 784bf192bb1a659c0b5ec023b721f0ee40059c64
34 changes: 0 additions & 34 deletions build/184.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
"name": "Jeet Mukherjee"
}
],
"tinygrad_difficulty": "medium",
"pytorch_difficulty": "medium",
"description": "## Problem\n\nImplement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.\n\nGiven a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:\n\n1. Extracts rows from the raw CSV text.\n2. Transforms data by:\n\t- Filtering only rows where `event_type == \"purchase\"`.\n\t- Converting `value` to float and dropping invalid rows.\n\t- Aggregating total purchase `value` per `user_id`.\n3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.\n\nAssume small inputs (no external libs), handle extra whitespace, and ignore blank lines.",
"learn_section": "## Solution Explanation\n\nThis task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.\n\n### ETL breakdown\n- Extract: parse raw CSV text, ignore blanks, and split into header and rows.\n- Transform:\n\t- Filter only relevant records (event_type == \"purchase\").\n\t- Cast `value` to float; discard invalid rows to maintain data quality.\n\t- Aggregate total purchase value per user to create compact features.\n- Load: return a deterministic, sorted list of `(user_id, total_value)`.\n\n### Why this design?\n- Input sanitation prevents runtime errors and poor-quality features.\n- Aggregation compresses event-level logs into user-level features commonly used in models.\n- Sorting produces stable, testable outputs.\n\n### Complexity\n- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U).\n\n### Extensions\n- Add schema validation and logging.\n- Write outputs to files or databases.\n- Schedule ETL runs and add monitoring for drift and freshness.",
"starter_code": "# Implement your function below.\n\ndef run_etl(csv_text: str) -> list[tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline over CSV text with header user_id,event_type,value.\n\n\tReturns a sorted list of (user_id, total_value) for event_type == \"purchase\".\n\t\"\"\"\n\t# TODO: implement extract, transform, and load steps\n\traise NotImplementedError",
Expand All @@ -36,37 +34,5 @@
"test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))",
"expected_output": "[('u1', 3.0)]"
}
],
"tinygrad_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError",
"tinygrad_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])",
"tinygrad_test_cases": [
{
"test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))",
"expected_output": "[('u1', 15.0), ('u2', 3.5)]"
},
{
"test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))",
"expected_output": "[]"
},
{
"test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))",
"expected_output": "[('u1', 3.0)]"
}
],
"pytorch_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError",
"pytorch_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])",
"pytorch_test_cases": [
{
"test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))",
"expected_output": "[('u1', 15.0), ('u2', 3.5)]"
},
{
"test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))",
"expected_output": "[]"
},
{
"test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))",
"expected_output": "[('u1', 3.0)]"
}
]
}
38 changes: 38 additions & 0 deletions build/185.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"id": "185",
"title": "Basic Data Drift Check: Mean and Variance Thresholds",
"difficulty": "easy",
"category": "MLOps",
"video": "",
"likes": "0",
"dislikes": "0",
"contributor": [
{
"profile_link": "https://github.com/Jeet009",
"name": "Jeet Mukherjee"
}
],
"description": "## Problem\n\nImplement a basic data drift check comparing two numeric datasets (reference vs. current).\n\nWrite a function `check_drift(ref, cur, mean_threshold, var_threshold)` that:\n\n- Accepts two lists of numbers `ref` and `cur`.\n- Computes the absolute difference in means and variances.\n- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold:\n\t- `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold`\n\t- `var_drift = abs(var(ref) - var(cur)) > var_threshold`\n\nAssume population variance (divide by N). Handle empty inputs by returning `(False, False)`.",
"learn_section": "## Solution Explanation\n\nWe compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds.\n\n### Definitions\n- Mean: \\( \\mu = \\frac{1}{N}\\sum_i x_i \\)\n- Population variance: \\( \\sigma^2 = \\frac{1}{N}\\sum_i (x_i - \\mu)^2 \\)\n\n### Drift rules\n- Mean drift if \\(|\\mu_{ref} - \\mu_{cur}| > \\tau_{mean}\\)\n- Variance drift if \\(|\\sigma^2_{ref} - \\sigma^2_{cur}| > \\tau_{var}\\)\n\n### Edge cases\n- If either sample is empty, return `(False, False)` to avoid false alarms.\n- Population vs. sample variance: we use population here to match many monitoring setups. Either is fine if used consistently.\n\n### Complexity\n- O(N + M) to compute stats; O(1) extra space.",
"starter_code": "from typing import List, Tuple\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\t\"\"\"Return (mean_drift, var_drift) comparing ref vs cur with given thresholds.\n\n\tUse population variance.\n\t\"\"\"\n\t# TODO: handle empty inputs; compute means and variances; compare with thresholds\n\traise NotImplementedError",
"solution": "from typing import List, Tuple\n\n\ndef _mean(xs: List[float]) -> float:\n\treturn sum(xs) / len(xs) if xs else 0.0\n\n\ndef _var(xs: List[float]) -> float:\n\tif not xs:\n\t\treturn 0.0\n\tm = _mean(xs)\n\treturn sum((x - m) * (x - m) for x in xs) / len(xs)\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\tif not ref or not cur:\n\t\treturn (False, False)\n\tmean_ref = _mean(ref)\n\tmean_cur = _mean(cur)\n\tvar_ref = _var(ref)\n\tvar_cur = _var(cur)\n\tmean_drift = abs(mean_ref - mean_cur) > mean_threshold\n\tvar_drift = abs(var_ref - var_cur) > var_threshold\n\treturn (mean_drift, var_drift)",
"example": {
"input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)",
"output": "(True, True)",
"reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1."
},
"test_cases": [
{
"test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))",
"expected_output": "(True, True)"
},
{
"test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))",
"expected_output": "(False, False)"
},
{
"test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))",
"expected_output": "(False, False)"
}
]
}
13 changes: 13 additions & 0 deletions questions/185_data-drift-basic/description.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Problem

Implement a basic data drift check comparing two numeric datasets (reference vs. current).

Write a function `check_drift(ref, cur, mean_threshold, var_threshold)` that:

- Accepts two lists of numbers `ref` and `cur`.
- Computes the absolute difference in means and variances.
- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold:
- `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold`
- `var_drift = abs(var(ref) - var(cur)) > var_threshold`

Assume population variance (divide by N). Handle empty inputs by returning `(False, False)`.
5 changes: 5 additions & 0 deletions questions/185_data-drift-basic/example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)",
"output": "(True, True)",
"reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1."
}
18 changes: 18 additions & 0 deletions questions/185_data-drift-basic/learn.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
## Solution Explanation

We compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds.

### Definitions
- Mean: \( \mu = \frac{1}{N}\sum_i x_i \)
- Population variance: \( \sigma^2 = \frac{1}{N}\sum_i (x_i - \mu)^2 \)

### Drift rules
- Mean drift if \(|\mu_{ref} - \mu_{cur}| > \tau_{mean}\)
- Variance drift if \(|\sigma^2_{ref} - \sigma^2_{cur}| > \tau_{var}\)

### Edge cases
- If either sample is empty, return `(False, False)` to avoid false alarms.
- Population vs. sample variance: we use population here to match many monitoring setups. Either is fine if used consistently.

### Complexity
- O(N + M) to compute stats; O(1) extra space.
12 changes: 12 additions & 0 deletions questions/185_data-drift-basic/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"id": "185",
"title": "Basic Data Drift Check: Mean and Variance Thresholds",
"difficulty": "easy",
"category": "MLOps",
"video": "",
"likes": "0",
"dislikes": "0",
"contributor": [
{ "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" }
]
}
24 changes: 24 additions & 0 deletions questions/185_data-drift-basic/solution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import List, Tuple


def _mean(xs: List[float]) -> float:
return sum(xs) / len(xs) if xs else 0.0


def _var(xs: List[float]) -> float:
	"""Return the population variance of *xs* (divide by N); 0.0 if empty."""
	n = len(xs)
	if n == 0:
		return 0.0
	# Mean computed inline; deviations squared via multiplication to stay
	# bit-for-bit identical to the original implementation.
	mu = sum(xs) / n
	squared_devs = ((x - mu) * (x - mu) for x in xs)
	return sum(squared_devs) / n


def check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:
	"""Compare reference vs. current numeric samples for drift.

	Returns (mean_drift, var_drift), each True when the absolute difference
	in means / population variances exceeds the corresponding threshold.
	Empty input on either side yields (False, False) to avoid false alarms.
	"""
	if not ref or not cur:
		return (False, False)
	delta_mean = abs(_mean(ref) - _mean(cur))
	delta_var = abs(_var(ref) - _var(cur))
	return (delta_mean > mean_threshold, delta_var > var_threshold)
10 changes: 10 additions & 0 deletions questions/185_data-drift-basic/starter_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import List, Tuple


def check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:
	"""Compare ref vs cur and return (mean_drift, var_drift) booleans.

	Drift is flagged when the absolute difference in means (resp. population
	variances) exceeds mean_threshold (resp. var_threshold).
	"""
	# TODO: return (False, False) for empty input, then compute means and
	# population variances and compare their absolute differences to the thresholds
	raise NotImplementedError
5 changes: 5 additions & 0 deletions questions/185_data-drift-basic/tests.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{ "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))", "expected_output": "(True, True)" },
{ "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))", "expected_output": "(False, False)" },
{ "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))", "expected_output": "(False, False)" }
]