diff --git a/questions/186_one-sample-z-test-hypothesis-testing/description.md b/questions/186_one-sample-z-test-hypothesis-testing/description.md new file mode 100644 index 00000000..754111c7 --- /dev/null +++ b/questions/186_one-sample-z-test-hypothesis-testing/description.md @@ -0,0 +1,18 @@ +Implement a function to perform a one-sample Z-test for a population mean when the population standard deviation is known. Your function must support both one-tailed and two-tailed alternatives. + +Implement a function with the signature: +- one_sample_z_test(sample_mean, population_mean, population_std, n, alternative="two-sided") + +Where: +- sample_mean: The observed sample mean (float) +- population_mean: The hypothesized population mean under H0 (float) +- population_std: The known population standard deviation (float > 0) +- n: Sample size (int > 0) +- alternative: One of {"two-sided", "greater", "less"} + +Return a dictionary with: +- "z": the computed Z statistic rounded to 4 decimals +- "p_value": the corresponding p-value rounded to 4 decimals + +Use the standard normal distribution for the p-value. Handle invalid inputs minimally by assuming valid types and values. + diff --git a/questions/186_one-sample-z-test-hypothesis-testing/example.json b/questions/186_one-sample-z-test-hypothesis-testing/example.json new file mode 100644 index 00000000..d6f0d1a2 --- /dev/null +++ b/questions/186_one-sample-z-test-hypothesis-testing/example.json @@ -0,0 +1,6 @@ +{ + "input": "sample_mean=103.0, population_mean=100.0, population_std=15.0, n=36, alternative='greater'", + "output": "{'z': 1.2, 'p_value': 0.1151}", + "reasoning": "Standard error = 15/sqrt(36)=2.5. Z=(103-100)/2.5=1.2. For a 'greater' test, p=1-CDF(1.2)=0.1151." 
+} + diff --git a/questions/186_one-sample-z-test-hypothesis-testing/learn.md b/questions/186_one-sample-z-test-hypothesis-testing/learn.md new file mode 100644 index 00000000..d2dfddc8 --- /dev/null +++ b/questions/186_one-sample-z-test-hypothesis-testing/learn.md @@ -0,0 +1,20 @@ +A one-sample Z-test assesses whether the mean of a population differs from a hypothesized value when the population standard deviation is known. It is appropriate for large samples (by CLT) or when normality is assumed and the population standard deviation is known. + +Test statistic: +- z = (x̄ − μ0) / (σ / √n) + - x̄: sample mean + - μ0: hypothesized mean under H0 + - σ: known population standard deviation + - n: sample size + +P-value computation uses the standard normal distribution: +- Two-sided (H1: μ ≠ μ0): p = 2 · min(Φ(z), 1 − Φ(z)) +- Right-tailed (H1: μ > μ0): p = 1 − Φ(z) +- Left-tailed (H1: μ < μ0): p = Φ(z) + +Decision at level α: +- Reject H0 if p ≤ α; otherwise, fail to reject H0. + +Notes: +- If σ is unknown, use a one-sample t-test with the sample standard deviation instead. 
+ diff --git a/questions/187_mlops-etl-pipeline/meta.json b/questions/186_one-sample-z-test-hypothesis-testing/meta.json similarity index 54% rename from questions/187_mlops-etl-pipeline/meta.json rename to questions/186_one-sample-z-test-hypothesis-testing/meta.json index d31f85bb..1ea1e7f5 100644 --- a/questions/187_mlops-etl-pipeline/meta.json +++ b/questions/186_one-sample-z-test-hypothesis-testing/meta.json @@ -1,8 +1,8 @@ { - "id": "187", - "title": "Build a Simple ETL Pipeline (MLOps)", - "difficulty": "medium", - "category": "MLOps", + "id": "186", + "title": "One-Sample Z-Test for Mean (One and Two-Tailed)", + "difficulty": "easy", + "category": "Statistics", "video": "", "likes": "0", "dislikes": "0", @@ -10,3 +10,4 @@ { "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" } ] } + diff --git a/questions/186_one-sample-z-test-hypothesis-testing/solution.py b/questions/186_one-sample-z-test-hypothesis-testing/solution.py new file mode 100644 index 00000000..7019e23e --- /dev/null +++ b/questions/186_one-sample-z-test-hypothesis-testing/solution.py @@ -0,0 +1,40 @@ +from math import erf, sqrt + +def _standard_normal_cdf(x): + return 0.5 * (1.0 + erf(x / sqrt(2.0))) + +def one_sample_z_test(sample_mean, population_mean, population_std, n, alternative="two-sided"): + """ + Perform a one-sample Z-test for a population mean with known population std. 
+ + Parameters + ---------- + sample_mean : float + population_mean : float + population_std : float + n : int + alternative : str + One of {"two-sided", "greater", "less"} + + Returns + ------- + dict with keys: + - "z": Z-statistic rounded to 4 decimals + - "p_value": p-value rounded to 4 decimals + """ + standard_error = population_std / sqrt(n) + z = (sample_mean - population_mean) / standard_error + cdf = _standard_normal_cdf(z) + + if alternative == "two-sided": + p = 2.0 * min(cdf, 1.0 - cdf) + elif alternative == "greater": + p = 1.0 - cdf + elif alternative == "less": + p = cdf + else: + # Fallback to two-sided if unexpected input + p = 2.0 * min(cdf, 1.0 - cdf) + + return {"z": round(z, 4), "p_value": round(p, 4)} + diff --git a/questions/186_one-sample-z-test-hypothesis-testing/starter_code.py b/questions/186_one-sample-z-test-hypothesis-testing/starter_code.py new file mode 100644 index 00000000..66dd39e1 --- /dev/null +++ b/questions/186_one-sample-z-test-hypothesis-testing/starter_code.py @@ -0,0 +1,33 @@ +from math import erf, sqrt + +def _standard_normal_cdf(x): + return 0.5 * (1.0 + erf(x / sqrt(2.0))) + +def one_sample_z_test(sample_mean, population_mean, population_std, n, alternative="two-sided"): + """ + Perform a one-sample Z-test for a population mean with known population std. + + Parameters + ---------- + sample_mean : float + population_mean : float + population_std : float + n : int + alternative : str + One of {"two-sided", "greater", "less"} + + Returns + ------- + dict with keys: + - "z": Z-statistic rounded to 4 decimals + - "p_value": p-value rounded to 4 decimals + """ + # TODO: Implement the Z statistic and p-value computation + # z = (sample_mean - population_mean) / (population_std / sqrt(n)) + # Use _standard_normal_cdf for CDF of standard normal. 
+ # For alternative: + # - "two-sided": p = 2 * min(P(Z<=z), P(Z>=z)) = 2 * min(cdf(z), 1-cdf(z)) + # - "greater": p = 1 - cdf(z) + # - "less": p = cdf(z) + return {"z": 0.0, "p_value": 1.0} + diff --git a/questions/186_one-sample-z-test-hypothesis-testing/tests.json b/questions/186_one-sample-z-test-hypothesis-testing/tests.json new file mode 100644 index 00000000..b1eac6b4 --- /dev/null +++ b/questions/186_one-sample-z-test-hypothesis-testing/tests.json @@ -0,0 +1,27 @@ +[ + { + "test": "one_sample_z_test(103.0, 100.0, 15.0, 36, alternative='two-sided')", + "expected_output": "{'z': 1.2, 'p_value': 0.2301}" + }, + { + "test": "one_sample_z_test(103.0, 100.0, 15.0, 36, alternative='greater')", + "expected_output": "{'z': 1.2, 'p_value': 0.1151}" + }, + { + "test": "one_sample_z_test(103.0, 100.0, 15.0, 36, alternative='less')", + "expected_output": "{'z': 1.2, 'p_value': 0.8849}" + }, + { + "test": "one_sample_z_test(97.0, 100.0, 10.0, 25, alternative='two-sided')", + "expected_output": "{'z': -1.5, 'p_value': 0.1336}" + }, + { + "test": "one_sample_z_test(97.0, 100.0, 10.0, 25, alternative='less')", + "expected_output": "{'z': -1.5, 'p_value': 0.0668}" + }, + { + "test": "one_sample_z_test(97.0, 100.0, 10.0, 25, alternative='greater')", + "expected_output": "{'z': -1.5, 'p_value': 0.9332}" + } +] + diff --git a/questions/187_mlops-etl-pipeline/description.md b/questions/187_mlops-etl-pipeline/description.md deleted file mode 100644 index 5adfa2df..00000000 --- a/questions/187_mlops-etl-pipeline/description.md +++ /dev/null @@ -1,14 +0,0 @@ -## Problem - -Implement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation. - -Given a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that: - -1. Extracts rows from the raw CSV text. -2. Transforms data by: - - Filtering only rows where `event_type == "purchase"`. 
- - Converting `value` to float and dropping invalid rows. - - Aggregating total purchase `value` per `user_id`. -3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending. - -Assume small inputs (no external libs), handle extra whitespace, and ignore blank lines. diff --git a/questions/187_mlops-etl-pipeline/example.json b/questions/187_mlops-etl-pipeline/example.json deleted file mode 100644 index 84952417..00000000 --- a/questions/187_mlops-etl-pipeline/example.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")", - "output": "[('u1', 15.0), ('u2', 3.5)]", - "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id." -} diff --git a/questions/187_mlops-etl-pipeline/learn.md b/questions/187_mlops-etl-pipeline/learn.md deleted file mode 100644 index d523e6a1..00000000 --- a/questions/187_mlops-etl-pipeline/learn.md +++ /dev/null @@ -1,24 +0,0 @@ -## Solution Explanation - -This task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling. - -### ETL breakdown -- Extract: parse raw CSV text, ignore blanks, and split into header and rows. -- Transform: - - Filter only relevant records (event_type == "purchase"). - - Cast `value` to float; discard invalid rows to maintain data quality. - - Aggregate total purchase value per user to create compact features. -- Load: return a deterministic, sorted list of `(user_id, total_value)`. - -### Why this design? -- Input sanitation prevents runtime errors and poor-quality features. -- Aggregation compresses event-level logs into user-level features commonly used in models. -- Sorting produces stable, testable outputs. - -### Complexity -- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U). 
- -### Extensions -- Add schema validation and logging. -- Write outputs to files or databases. -- Schedule ETL runs and add monitoring for drift and freshness. diff --git a/questions/187_mlops-etl-pipeline/solution.py b/questions/187_mlops-etl-pipeline/solution.py deleted file mode 100644 index 19b9a275..00000000 --- a/questions/187_mlops-etl-pipeline/solution.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import List, Tuple - - -def run_etl(csv_text: str) -> List[Tuple[str, float]]: - """Reference ETL implementation. - - - Extract: parse CSV text, skip header, strip whitespace, ignore blanks - - Transform: keep event_type == "purchase"; parse value as float; aggregate per user - - Load: return sorted list of (user_id, total_value) by user_id asc - """ - lines = [line.strip() for line in csv_text.splitlines() if line.strip()] - if not lines: - return [] - # header - header = lines[0] - rows = lines[1:] - - # indices from header (allow varying order and case) - headers = [h.strip().lower() for h in header.split(",")] - try: - idx_user = headers.index("user_id") - idx_event = headers.index("event_type") - idx_value = headers.index("value") - except ValueError: - # header missing required columns - return [] - - aggregates: dict[str, float] = {} - for row in rows: - parts = [c.strip() for c in row.split(",")] - if len(parts) <= max(idx_user, idx_event, idx_value): - continue - user_id = parts[idx_user] - event_type = parts[idx_event].lower() - if event_type != "purchase": - continue - try: - value = float(parts[idx_value]) - except ValueError: - continue - aggregates[user_id] = aggregates.get(user_id, 0.0) + value - - return sorted(aggregates.items(), key=lambda kv: kv[0]) diff --git a/questions/187_mlops-etl-pipeline/starter_code.py b/questions/187_mlops-etl-pipeline/starter_code.py deleted file mode 100644 index 65002026..00000000 --- a/questions/187_mlops-etl-pipeline/starter_code.py +++ /dev/null @@ -1,9 +0,0 @@ -# Implement your function below. 
- -def run_etl(csv_text: str) -> list[tuple[str, float]]: - """Run a simple ETL pipeline over CSV text with header user_id,event_type,value. - - Returns a sorted list of (user_id, total_value) for event_type == "purchase". - """ - # TODO: implement extract, transform, and load steps - raise NotImplementedError diff --git a/questions/187_mlops-etl-pipeline/tests.json b/questions/187_mlops-etl-pipeline/tests.json deleted file mode 100644 index 781c1b28..00000000 --- a/questions/187_mlops-etl-pipeline/tests.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "test": "print(run_etl('user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n'))", - "expected_output": "[('u1', 15.0), ('u2', 3.5)]" - }, - { - "test": "print(run_etl('user_id,event_type,value'))", - "expected_output": "[]" - }, - { - "test": "print(run_etl('value,event_type,user_id\\n 1.0, purchase, u1\\n 2.0, purchase, u1\\n'))", - "expected_output": "[('u1', 3.0)]" - } -]