Uploaded capstone project
Ömer faruk Çelik authored and Ömer faruk Çelik committed Feb 8, 2025
commit b744108dae7a17f64e6c6ec5b657bffdfd44a84b
Binary file modified .DS_Store
Binary file not shown.
Binary file added week2/.DS_Store
Binary file not shown.
Binary file added week2/capstone/.DS_Store
Binary file not shown.
Binary file added week2/capstone/backup/.DS_Store
Binary file not shown.
6,810 changes: 6,810 additions & 0 deletions week2/capstone/backup/a/main.ipynb

Large diffs are not rendered by default.

7,867 changes: 7,867 additions & 0 deletions week2/capstone/backup/main.ipynb

Large diffs are not rendered by default.

2,363 changes: 2,363 additions & 0 deletions week2/capstone/main.ipynb

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions week2/capstone/main.py
@@ -0,0 +1,77 @@
# main.py
import os
from src.data_loader import load_data
from src.preprocessing import preprocess
from src.models import cross_validate_lgb, train_final_lgb
from src.evaluation import print_all_metrics
from src.optuna_tuning import run_optuna
from src.feature_importance import plot_feature_importance

def main():
    # Define paths for the data
    train_path = os.path.join("data", "train.csv")
    test_path = os.path.join("data", "new_test.csv")

    # Load the data
    train_df, test_df = load_data(train_path, test_path)

    # Preprocess the data (fit the multi-label binarizer and encoder on the
    # training set, then reuse them on the test set)
    processed_train, mlb, encoder = preprocess(train_df, train=True)
    processed_test, _, _ = preprocess(test_df, train=False, mlb=mlb, encoder=encoder)

    # Split features and target
    x_train = processed_train.drop(columns=["churn"])
    y_train = processed_train["churn"]
    x_test = processed_test.drop(columns=["churn"])
    y_test = processed_test["churn"]

    # Initial LightGBM parameters
    lgb_params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "n_estimators": 309,
        "learning_rate": 0.01716029728096218,
        "num_leaves": 22,
        "max_depth": 6,
        "min_child_samples": 10,
        "subsample": 0.737440646376928,
        "colsample_bytree": 0.9862597807809604,
        "reg_alpha": 8.985515005642144e-07,
        "reg_lambda": 1.00016444438518084528,
        "random_state": 42,
        "verbose": -1,
        "n_jobs": -1,
    }

    # Cross-validation
    # print("Starting cross-validation...")
    # models, metrics = cross_validate_lgb(x_train, y_train, lgb_params)

    # Train the final model on the full training set
    print("Training final model on full training data...")
    final_model = train_final_lgb(x_train, y_train, lgb_params)

    # Evaluate the final model on the test set
    y_pred = final_model.predict(x_test)
    y_pred_proba = final_model.predict_proba(x_test)[:, 1]
    print("Evaluation on test set:")
    print_all_metrics(y_test, y_pred, y_pred_proba)

    # Hyperparameter tuning with Optuna
    # print("Starting hyperparameter tuning with Optuna...")
    # best_params = run_optuna(x_train, y_train, n_trials=50)

    # Optionally retrain with the best hyperparameters and evaluate again
    # print("Retraining model with best hyperparameters...")
    # final_model_best = train_final_lgb(x_train, y_train, best_params)
    # y_pred_best = final_model_best.predict(x_test)
    # y_pred_proba_best = final_model_best.predict_proba(x_test)[:, 1]
    # print("Evaluation on test set with tuned hyperparameters:")
    # print_all_metrics(y_test, y_pred_best, y_pred_proba_best)

    # Plot feature importance
    print("Plotting feature importance...")
    plot_feature_importance(final_model, x_train.columns)

if __name__ == "__main__":
    main()
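Note: main.py assumes it is run from week2/capstone/ with data/train.csv and data/new_test.csv in place, plus a src/ package providing data_loader, preprocessing, models, evaluation, optuna_tuning, and feature_importance. src/preprocessing.py (the preprocess and undersample_data helpers) is not rendered in this diff, so its behavior is inferred from the call sites.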
11 changes: 11 additions & 0 deletions week2/capstone/modules/concaneate.py
@@ -0,0 +1,11 @@
import pandas as pd

# Read the CSV files into DataFrames
test_df = pd.read_csv("data/test.csv")
val_df = pd.read_csv("data/val.csv")

# Concatenate the DataFrames vertically (row-wise)
combined_df = pd.concat([test_df, val_df], axis=0, ignore_index=True)

# Save the combined DataFrame where main.py expects to read it
combined_df.to_csv("data/new_test.csv", index=False)
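The combined file is what main.py consumes as its held-out test set (data/new_test.csv): the original test and validation splits merged into a single evaluation set.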
13 changes: 13 additions & 0 deletions week2/capstone/modules/create_csv_file.py
@@ -0,0 +1,13 @@
import glob
import pandas as pd

all_files = glob.glob("capstone.*.jsonl")
df_list = []
for f in all_files:
    df_temp = pd.read_json(f, lines=True)
    df_list.append(df_temp)
    print(f"f:{f}, df_temp.shape:{df_temp.shape}")

df = pd.concat(df_list, ignore_index=True)

df.to_csv("data/capstone.csv", index=False)
11 changes: 11 additions & 0 deletions week2/capstone/src/data_loader.py
@@ -0,0 +1,11 @@
# src/data_loader.py
import pandas as pd

def load_data(train_path, test_path):
    """Load the train and test datasets from CSV files."""
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test
28 changes: 28 additions & 0 deletions week2/capstone/src/evaluation.py
@@ -0,0 +1,28 @@
# src/evaluation.py
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def print_all_metrics(y_true, y_pred, y_pred_proba):
    """Print evaluation metrics and display a confusion matrix."""
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_proba)
    cm = confusion_matrix(y_true, y_pred)

    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)
    print("ROC AUC:", auc)
    print("Confusion Matrix:\n", cm)

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
19 changes: 19 additions & 0 deletions week2/capstone/src/feature_importance.py
@@ -0,0 +1,19 @@
# src/feature_importance.py
import matplotlib.pyplot as plt
import pandas as pd

def plot_feature_importance(model, feature_names, top_n=20):
    """Plot the top_n most important features of a fitted tree model."""
    importances = model.feature_importances_
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values(by='importance', ascending=False)

    top_features = importance_df.head(top_n)
    plt.figure(figsize=(8, 6))
    plt.barh(top_features['feature'], top_features['importance'], color='skyblue')
    plt.xlabel("Feature Importance")
    plt.title("Top Feature Importances")
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()
63 changes: 63 additions & 0 deletions week2/capstone/src/models.py
@@ -0,0 +1,63 @@
# src/models.py
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from src.preprocessing import undersample_data

def cross_validate_lgb(x_train, y_train, params, n_splits=5, majority_multiplier=5, random_state=42):
    """Perform stratified k-fold cross-validation with LightGBM."""
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    models = []
    metrics = []

    for fold, (train_index, val_index) in enumerate(skf.split(x_train, y_train), 1):
        X_train_fold = x_train.iloc[train_index]
        y_train_fold = y_train.iloc[train_index]
        X_val_fold = x_train.iloc[val_index]
        y_val_fold = y_train.iloc[val_index]

        # Undersample only the training fold; the validation fold keeps its
        # original class balance so the metrics stay honest
        X_train_res, y_train_res = undersample_data(X_train_fold, y_train_fold, majority_multiplier, random_state)

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train_res, y_train_res)
        models.append(model)

        y_val_pred = model.predict(X_val_fold)
        y_val_proba = model.predict_proba(X_val_fold)[:, 1]

        acc = accuracy_score(y_val_fold, y_val_pred)
        prec = precision_score(y_val_fold, y_val_pred)
        rec = recall_score(y_val_fold, y_val_pred)
        f1 = f1_score(y_val_fold, y_val_pred)
        roc_auc = roc_auc_score(y_val_fold, y_val_proba)
        cm = confusion_matrix(y_val_fold, y_val_pred)

        print(f"Fold {fold}:")
        print(f"  Accuracy:  {acc:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall:    {rec:.4f}")
        print(f"  F1 Score:  {f1:.4f}")
        print(f"  ROC AUC:   {roc_auc:.4f}")
        print(f"  Confusion Matrix:\n{cm}\n")

        metrics.append({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "roc_auc": roc_auc,
            "conf_matrix": cm
        })
    return models, metrics

def train_final_lgb(x_train, y_train, params):
    """Train a final LightGBM model on the full training set."""
    model = lgb.LGBMClassifier(**params)
    model.fit(x_train, y_train)
    return model
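models.py and optuna_tuning.py both import undersample_data from src.preprocessing, which this commit does not render. A minimal sketch of the assumed behavior (keep every minority-class row, cap the majority class at majority_multiplier times the minority count), purely for reference:

def undersample_data(X, y, majority_multiplier=5, random_state=42):
    # Hypothetical sketch: src/preprocessing.py is not shown in this diff
    counts = y.value_counts()
    minority_label, majority_label = counts.idxmin(), counts.idxmax()
    # Keep at most majority_multiplier * n_minority rows of the majority class
    n_keep = min(counts[majority_label], majority_multiplier * counts[minority_label])
    majority_idx = y[y == majority_label].sample(n=n_keep, random_state=random_state).index
    keep_idx = y[y == minority_label].index.union(majority_idx)
    return X.loc[keep_idx], y.loc[keep_idx]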
55 changes: 55 additions & 0 deletions week2/capstone/src/optuna_tuning.py
@@ -0,0 +1,55 @@
# src/optuna_tuning.py
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
from src.preprocessing import undersample_data

def objective(trial, x_train, y_train, majority_multiplier=5, random_state=42):
    """Optuna objective: mean F1 score across stratified folds."""
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1e-1, log=True),
        "random_state": random_state,
        "verbose": -1,
        "n_jobs": -1
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    f1_scores = []

    for train_idx, val_idx in skf.split(x_train, y_train):
        X_train_fold = x_train.iloc[train_idx]
        y_train_fold = y_train.iloc[train_idx]
        X_val_fold = x_train.iloc[val_idx]
        y_val_fold = y_train.iloc[val_idx]

        # Same scheme as cross_validate_lgb: resample the training fold only
        X_train_res, y_train_res = undersample_data(X_train_fold, y_train_fold, majority_multiplier, random_state)
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train_res, y_train_res)

        y_val_pred = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, y_val_pred))

    return np.mean(f1_scores)

def run_optuna(x_train, y_train, n_trials=50):
    """Run an Optuna study and return the best trial's sampled parameters."""
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, x_train, y_train), n_trials=n_trials)

    print("\nBest trial:")
    best_trial = study.best_trial
    print(f"  F1 Score: {best_trial.value:.4f}")
    print("  Best hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")
    return best_trial.params
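One caveat for the commented-out retraining path in main.py: study.best_trial.params holds only the sampled hyperparameters, not the fixed keys set inside objective(). A hedged usage sketch that merges them back in before retraining:

# Hypothetical usage; x_train / y_train as prepared in main.py
from src.models import train_final_lgb
from src.optuna_tuning import run_optuna

best_params = run_optuna(x_train, y_train, n_trials=50)
best_params.update({"objective": "binary", "boosting_type": "gbdt",
                    "random_state": 42, "verbose": -1, "n_jobs": -1})
final_model_best = train_final_lgb(x_train, y_train, best_params)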