Uploaded capstone project
Ömer faruk Çelik authored and Ömer faruk Çelik committed Feb 8, 2025
commit b744108dae7a17f64e6c6ec5b657bffdfd44a84b
Binary file modified .DS_Store
Binary file not shown.
Binary file added week2/.DS_Store
Binary file not shown.
Binary file added week2/capstone/.DS_Store
Binary file not shown.
Binary file added week2/capstone/backup/.DS_Store
Binary file not shown.
6,810 changes: 6,810 additions & 0 deletions week2/capstone/backup/a/main.ipynb

Large diffs are not rendered by default.

7,867 changes: 7,867 additions & 0 deletions week2/capstone/backup/main.ipynb

Large diffs are not rendered by default.

2,363 changes: 2,363 additions & 0 deletions week2/capstone/main.ipynb

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions week2/capstone/main.py
@@ -0,0 +1,77 @@
# main.py
import os
from src.data_loader import load_data
from src.preprocessing import preprocess
from src.models import cross_validate_lgb, train_final_lgb
from src.evaluation import print_all_metrics
from src.optuna_tuning import run_optuna
from src.feature_importance import plot_feature_importance

def main():
    # Define paths for the data
    train_path = os.path.join("data", "train.csv")
    test_path = os.path.join("data", "new_test.csv")

    # Load the data
    train_df, test_df = load_data(train_path, test_path)

    # Preprocess the data (fit the multi-label binarizer and encoder on the
    # training set, then reuse them on the test set)
    processed_train, mlb, encoder = preprocess(train_df, train=True)
    processed_test, _, _ = preprocess(test_df, train=False, mlb=mlb, encoder=encoder)

    # Split features and target
    x_train = processed_train.drop(columns=["churn"])
    y_train = processed_train["churn"]
    x_test = processed_test.drop(columns=["churn"])
    y_test = processed_test["churn"]

    # Initial LightGBM parameters
    lgb_params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "n_estimators": 309,
        "learning_rate": 0.01716029728096218,
        "num_leaves": 22,
        "max_depth": 6,
        "min_child_samples": 10,
        "subsample": 0.737440646376928,
        "colsample_bytree": 0.9862597807809604,
        "reg_alpha": 8.985515005642144e-07,
        "reg_lambda": 1.00016444438518084528,
        "random_state": 42,
        "verbose": -1,
        "n_jobs": -1,
    }

    # Cross-validation
    # print("Starting cross-validation...")
    # models, metrics = cross_validate_lgb(x_train, y_train, lgb_params)

    # Train the final model on the full training set
    print("Training final model on full training data...")
    final_model = train_final_lgb(x_train, y_train, lgb_params)

    # Evaluate the final model on the test set
    y_pred = final_model.predict(x_test)
    y_pred_proba = final_model.predict_proba(x_test)[:, 1]
    print("Evaluation on test set:")
    print_all_metrics(y_test, y_pred, y_pred_proba)

    # Hyperparameter tuning with Optuna
    # print("Starting hyperparameter tuning with Optuna...")
    # best_params = run_optuna(x_train, y_train, n_trials=50)

    # Optionally retrain with the best hyperparameters and evaluate again
    # print("Retraining model with best hyperparameters...")
    # final_model_best = train_final_lgb(x_train, y_train, best_params)
    # y_pred_best = final_model_best.predict(x_test)
    # y_pred_proba_best = final_model_best.predict_proba(x_test)[:, 1]
    # print("Evaluation on test set with tuned hyperparameters:")
    # print_all_metrics(y_test, y_pred_best, y_pred_proba_best)

    # Plot feature importance
    print("Plotting feature importance...")
    plot_feature_importance(final_model, x_train.columns)

if __name__ == "__main__":
    main()
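Note: main.py assumes it is run from week2/capstone/ with data/train.csv and data/new_test.csv in place, plus a src/ package providing data_loader, preprocessing, models, evaluation, optuna_tuning, and feature_importance. src/preprocessing.py (the preprocess and undersample_data helpers) is not rendered in this diff, so its behavior is inferred from the call sites.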
11 changes: 11 additions & 0 deletions week2/capstone/modules/concaneate.py
@@ -0,0 +1,11 @@
import pandas as pd

# Read the CSV files into DataFrames
test_df = pd.read_csv("data/test.csv")
val_df = pd.read_csv("data/val.csv")

# Concatenate the DataFrames vertically (row-wise)
combined_df = pd.concat([test_df, val_df], axis=0, ignore_index=True)

# Save the combined DataFrame where main.py expects to read it
combined_df.to_csv("data/new_test.csv", index=False)
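The combined file is what main.py consumes as its held-out test set (data/new_test.csv): the original test and validation splits merged into a single evaluation set.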
13 changes: 13 additions & 0 deletions week2/capstone/modules/create_csv_file.py
@@ -0,0 +1,13 @@
import glob
import pandas as pd

all_files = glob.glob("capstone.*.jsonl")
df_list = []
for f in all_files:
    df_temp = pd.read_json(f, lines=True)
    df_list.append(df_temp)
    print(f"f:{f}, df_temp.shape:{df_temp.shape}")

df = pd.concat(df_list, ignore_index=True)

df.to_csv("data/capstone.csv", index=False)
11 changes: 11 additions & 0 deletions week2/capstone/src/data_loader.py
@@ -0,0 +1,11 @@
# src/data_loader.py
import pandas as pd

def load_data(train_path, test_path):
    """Load the train and test datasets from CSV files."""
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test
28 changes: 28 additions & 0 deletions week2/capstone/src/evaluation.py
@@ -0,0 +1,28 @@
# src/evaluation.py
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def print_all_metrics(y_true, y_pred, y_pred_proba):
    """Print evaluation metrics and display a confusion matrix."""
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_proba)
    cm = confusion_matrix(y_true, y_pred)

    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)
    print("ROC AUC:", auc)
    print("Confusion Matrix:\n", cm)

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
19 changes: 19 additions & 0 deletions week2/capstone/src/feature_importance.py
@@ -0,0 +1,19 @@
# src/feature_importance.py
import matplotlib.pyplot as plt
import pandas as pd

def plot_feature_importance(model, feature_names, top_n=20):
    """Plot the top_n most important features of a fitted tree model."""
    importances = model.feature_importances_
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values(by='importance', ascending=False)

    top_features = importance_df.head(top_n)
    plt.figure(figsize=(8, 6))
    plt.barh(top_features['feature'], top_features['importance'], color='skyblue')
    plt.xlabel("Feature Importance")
    plt.title("Top Feature Importances")
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()
63 changes: 63 additions & 0 deletions week2/capstone/src/models.py
@@ -0,0 +1,63 @@
# src/models.py
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from src.preprocessing import undersample_data

def cross_validate_lgb(x_train, y_train, params, n_splits=5, majority_multiplier=5, random_state=42):
    """Perform stratified k-fold cross-validation with LightGBM."""
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    models = []
    metrics = []

    for fold, (train_index, val_index) in enumerate(skf.split(x_train, y_train), 1):
        X_train_fold = x_train.iloc[train_index]
        y_train_fold = y_train.iloc[train_index]
        X_val_fold = x_train.iloc[val_index]
        y_val_fold = y_train.iloc[val_index]

        # Undersample only the training fold; the validation fold keeps its
        # original class balance so the metrics stay honest
        X_train_res, y_train_res = undersample_data(X_train_fold, y_train_fold, majority_multiplier, random_state)

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train_res, y_train_res)
        models.append(model)

        y_val_pred = model.predict(X_val_fold)
        y_val_proba = model.predict_proba(X_val_fold)[:, 1]

        acc = accuracy_score(y_val_fold, y_val_pred)
        prec = precision_score(y_val_fold, y_val_pred)
        rec = recall_score(y_val_fold, y_val_pred)
        f1 = f1_score(y_val_fold, y_val_pred)
        roc_auc = roc_auc_score(y_val_fold, y_val_proba)
        cm = confusion_matrix(y_val_fold, y_val_pred)

        print(f"Fold {fold}:")
        print(f"  Accuracy:  {acc:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall:    {rec:.4f}")
        print(f"  F1 Score:  {f1:.4f}")
        print(f"  ROC AUC:   {roc_auc:.4f}")
        print(f"  Confusion Matrix:\n{cm}\n")

        metrics.append({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "roc_auc": roc_auc,
            "conf_matrix": cm
        })
    return models, metrics

def train_final_lgb(x_train, y_train, params):
    """Train a final LightGBM model on the full training set."""
    model = lgb.LGBMClassifier(**params)
    model.fit(x_train, y_train)
    return model
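models.py and optuna_tuning.py both import undersample_data from src.preprocessing, which this commit does not render. A minimal sketch of the assumed behavior (keep every minority-class row, cap the majority class at majority_multiplier times the minority count), purely for reference:

def undersample_data(X, y, majority_multiplier=5, random_state=42):
    # Hypothetical sketch: src/preprocessing.py is not shown in this diff
    counts = y.value_counts()
    minority_label, majority_label = counts.idxmin(), counts.idxmax()
    # Keep at most majority_multiplier * n_minority rows of the majority class
    n_keep = min(counts[majority_label], majority_multiplier * counts[minority_label])
    majority_idx = y[y == majority_label].sample(n=n_keep, random_state=random_state).index
    keep_idx = y[y == minority_label].index.union(majority_idx)
    return X.loc[keep_idx], y.loc[keep_idx]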
55 changes: 55 additions & 0 deletions week2/capstone/src/optuna_tuning.py
@@ -0,0 +1,55 @@
# src/optuna_tuning.py
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
from src.preprocessing import undersample_data

def objective(trial, x_train, y_train, majority_multiplier=5, random_state=42):
    """Optuna objective: mean F1 score across stratified folds."""
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1e-1, log=True),
        "random_state": random_state,
        "verbose": -1,
        "n_jobs": -1
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    f1_scores = []

    for train_idx, val_idx in skf.split(x_train, y_train):
        X_train_fold = x_train.iloc[train_idx]
        y_train_fold = y_train.iloc[train_idx]
        X_val_fold = x_train.iloc[val_idx]
        y_val_fold = y_train.iloc[val_idx]

        # Same scheme as cross_validate_lgb: resample the training fold only
        X_train_res, y_train_res = undersample_data(X_train_fold, y_train_fold, majority_multiplier, random_state)
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train_res, y_train_res)

        y_val_pred = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, y_val_pred))

    return np.mean(f1_scores)

def run_optuna(x_train, y_train, n_trials=50):
    """Run an Optuna study and return the best trial's sampled parameters."""
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, x_train, y_train), n_trials=n_trials)

    print("\nBest trial:")
    best_trial = study.best_trial
    print(f"  F1 Score: {best_trial.value:.4f}")
    print("  Best hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")
    return best_trial.params
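One caveat for the commented-out retraining path in main.py: study.best_trial.params holds only the sampled hyperparameters, not the fixed keys set inside objective(). A hedged usage sketch that merges them back in before retraining:

# Hypothetical usage; x_train / y_train as prepared in main.py
from src.models import train_final_lgb
from src.optuna_tuning import run_optuna

best_params = run_optuna(x_train, y_train, n_trials=50)
best_params.update({"objective": "binary", "boosting_type": "gbdt",
                    "random_state": 42, "verbose": -1, "n_jobs": -1})
final_model_best = train_final_lgb(x_train, y_train, best_params)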