capstone
Kutay4 committed Feb 8, 2025
commit 103ae059194d936ae24d7436e92a89bbb5d25293
Binary file added week2/capstone/best_model.pkl
Binary file not shown.
727 changes: 727 additions & 0 deletions week2/capstone/capstone.ipynb

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions week2/capstone/capstone_queries.sql
@@ -0,0 +1,92 @@
create table if not exists churn_data as
select *
from read_json(
    'capstone_data/capstone.*.jsonl',
    columns = {id: 'varchar',
               age: 'int',
               tenure: 'int',
               service_type: 'varchar',
               avg_call_duration: 'float',
               data_usage: 'float',
               roaming_usage: 'float',
               monthly_charge: 'float',
               overdue_payments: 'int',
               auto_payment: 'bool',
               avg_top_up_count: 'float',
               call_drops: 'int',
               customer_support_calls: 'int',
               satisfaction_score: 'float',
               apps: 'varchar',
               churn: 'bool'
              }
);

CREATE OR REPLACE TABLE churn_data as (SELECT a.*,
b.izlego, b.ritimgo, b.cuzdanx, b.hizlipazar, b.konusalim,
b.prepaid, b.postpaid, b.broadband
FROM churn_data a
LEFT JOIN (
SELECT
id,
apps,
service_type,
CASE WHEN apps LIKE '%İzleGo%' THEN 1 ELSE 0 END AS izlego,
CASE WHEN apps LIKE '%RitimGo%' THEN 1 ELSE 0 END AS ritimgo,
CASE WHEN apps LIKE '%CüzdanX%' THEN 1 ELSE 0 END AS cuzdanx,
CASE WHEN apps LIKE '%HızlıPazar%' THEN 1 ELSE 0 END AS hizlipazar,
CASE WHEN apps LIKE '%Konuşalım%' THEN 1 ELSE 0 END AS konusalim,
CASE WHEN service_type = 'Prepaid' THEN 1 ELSE 0 END AS prepaid,
CASE WHEN service_type = 'Postpaid' THEN 1 ELSE 0 END AS postpaid,
CASE WHEN service_type = 'Broadband' THEN 1 ELSE 0 END AS broadband
FROM churn_data
) b
USING(id));

-- null-count queries for each column
SELECT
'id' AS column_name, COUNT(*) - COUNT(id) AS null_count FROM churn_data
UNION ALL
SELECT
'age', COUNT(*) - COUNT(age) FROM churn_data
UNION ALL
SELECT
'tenure', COUNT(*) - COUNT(tenure) FROM churn_data
UNION ALL
SELECT
'service_type', COUNT(*) - COUNT(service_type) FROM churn_data
UNION ALL
SELECT
'avg_call_duration', COUNT(*) - COUNT(avg_call_duration) FROM churn_data
UNION ALL
SELECT
'data_usage', COUNT(*) - COUNT(data_usage) FROM churn_data
UNION ALL
SELECT
'roaming_usage', COUNT(*) - COUNT(roaming_usage) FROM churn_data
UNION ALL
SELECT
'monthly_charge', COUNT(*) - COUNT(monthly_charge) FROM churn_data
UNION ALL
SELECT
'overdue_payments', COUNT(*) - COUNT(overdue_payments) FROM churn_data
UNION ALL
SELECT
'auto_payment', COUNT(*) - COUNT(auto_payment) FROM churn_data
UNION ALL
SELECT
'avg_top_up_count', COUNT(*) - COUNT(avg_top_up_count) FROM churn_data
UNION ALL
SELECT
'call_drops', COUNT(*) - COUNT(call_drops) FROM churn_data
UNION ALL
SELECT
'customer_support_calls', COUNT(*) - COUNT(customer_support_calls) FROM churn_data
UNION ALL
SELECT
'satisfaction_score', COUNT(*) - COUNT(satisfaction_score) FROM churn_data
UNION ALL
SELECT
'apps', COUNT(*) - COUNT(apps) FROM churn_data
UNION ALL
SELECT
'churn', COUNT(*) - COUNT(churn) FROM churn_data;
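The per-column null audit above can also be driven from Python through DuckDB's client API, which pairs naturally with the notebook in the next file, since it opens the same capstone.db database. A minimal sketch, assuming the table was materialized into that file (the loop over DESCRIBE output is an illustrative addition, not part of the repository):

import duckdb

# Connect to the database file the notebook below also opens.
conn = duckdb.connect(database="capstone.db")

# Enumerate the columns of churn_data and count NULLs in each one,
# reproducing the UNION ALL chain above without repeating it per column.
columns = [row[0] for row in conn.execute("DESCRIBE churn_data").fetchall()]
for col in columns:
    null_count = conn.execute(
        f'SELECT COUNT(*) - COUNT("{col}") FROM churn_data'
    ).fetchone()[0]
    print(col, null_count)

conn.close()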
118 changes: 118 additions & 0 deletions week2/capstone/file_handling.ipynb
@@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import duckdb\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"pd.set_option('future.no_silent_downcasting', True)\n",
"\n",
"conn = duckdb.connect(database=\"capstone.db\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = conn.execute(\"SELECT * FROM churn_data\").fetchdf()\n",
"df[[\"auto_payment\", \"churn\"]] = df[[\"auto_payment\", \"churn\"]].apply(lambda x: x.map({True: 1, False: 0}))\n",
"X, y = df.drop(columns=[\"churn\", \"apps\", \"service_type\"]), df[\"churn\"] \n",
"\n",
"X.loc[X['broadband'] == 1, ['roaming_usage', 'call_drops', 'avg_call_duration']] = X.loc[X['broadband'] == 1, ['roaming_usage', 'call_drops', 'avg_call_duration']].fillna(0)\n",
"X.loc[(X['broadband'] == 0) & (X['postpaid'] == 0), 'auto_payment'] = X.loc[(X['broadband'] == 0) & (X['postpaid'] == 0), 'auto_payment'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def fill_missing_values(df_train, df_test, condition_column, columns):\n",
"\n",
" for condition in condition_column:\n",
" \n",
" df_train_filtered = df_train[df_train[condition] == 1]\n",
" df_test_filtered = df_test[df_test[condition] == 1]\n",
"\n",
" for col in columns:\n",
" min_val, max_val = df_train_filtered[col].min(), df_train_filtered[col].max()\n",
"\n",
" \n",
" missing_train_idx = df_train_filtered[df_train_filtered[col].isnull()].index\n",
" missing_test_idx = df_test_filtered[df_test_filtered[col].isnull()].index\n",
"\n",
" train_random_values = np.random.uniform(min_val, max_val, size=len(missing_train_idx)).astype(np.float32)\n",
" test_random_values = np.random.uniform(min_val, max_val, size=len(missing_test_idx)).astype(np.float32)\n",
"\n",
" df_train.loc[missing_train_idx, col] = train_random_values\n",
" df_test.loc[missing_test_idx, col] = test_random_values\n",
"\n",
" return df_train, df_test\n",
"\n",
"X_train, X_test = fill_missing_values(X_train, X_test, ['broadband','prepaid','postpaid'], ['avg_call_duration', 'data_usage', 'monthly_charge'])\n",
"\n",
"X_train.drop(columns=['prepaid'], inplace=True) \n",
"X_test.drop(columns=['prepaid'], inplace=True) # dummy variable trap\n",
"\n",
"X_train.fillna({'auto_payment': 0}, inplace=True)\n",
"X_test.fillna({'auto_payment': 0}, inplace=True)\n",
"\n",
"X_train.fillna({'tenure': round(X_train['tenure'].mean())}, inplace=True)\n",
"X_test.fillna({'tenure': round(X_train['tenure'].mean())}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"train_data = pd.concat([X_train, y_train], axis=1)\n",
"test_data = pd.concat([X_test, y_test], axis=1)\n",
"\n",
"train_data.to_parquet(\"train_data.parquet\", engine='pyarrow')\n",
"test_data.to_parquet(\"test_data.parquet\", engine='pyarrow')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py310",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
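A quick way to verify the exported splits is to read the two Parquet files back and compare row counts, churn rates, and any remaining missing values. A minimal sketch under the notebook's own file names and 80/20 stratified split (the checks themselves are illustrative additions):

import pandas as pd

# Load the files written at the end of the notebook.
train_data = pd.read_parquet("train_data.parquet", engine="pyarrow")
test_data = pd.read_parquet("test_data.parquet", engine="pyarrow")

# Roughly an 80/20 row split is expected from test_size=0.2.
print(train_data.shape, test_data.shape)

# Stratified splitting should keep the churn rate nearly identical in both sets.
print(train_data["churn"].mean(), test_data["churn"].mean())

# Columns that were not imputed in the notebook may still contain NULLs.
print(train_data.isna().sum())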
102 changes: 102 additions & 0 deletions you_do_1/SQL_Queries.sql
@@ -60,5 +60,107 @@ GROUP BY a.Title, a.MovieId
ORDER BY AdjustedScore DESC
LIMIT 30;

WITH UserMap AS (
    SELECT UserId, ROW_NUMBER() OVER (ORDER BY UserId) - 1 AS user_index
    FROM (SELECT DISTINCT UserId FROM rating) AS unique_users
),
MovieMap AS (
    SELECT MovieId, ROW_NUMBER() OVER (ORDER BY MovieId) - 1 AS movie_index
    FROM (SELECT DISTINCT MovieId FROM rating) AS unique_movies
)
SELECT r.MovieId, m.movie_index, r.UserId, u.user_index
FROM rating r
JOIN UserMap u ON r.UserId = u.UserId
JOIN MovieMap m ON r.MovieId = m.MovieId;
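The ROW_NUMBER mappings convert the raw UserId and MovieId values into dense zero-based indices, which is the coordinate format a sparse matrix constructor expects. A minimal sketch of that use from Python, building a binary user-movie interaction matrix and assuming the rating table lives in a DuckDB file (the movies.db name is hypothetical):

import duckdb
from scipy.sparse import coo_matrix

conn = duckdb.connect(database="movies.db")  # hypothetical file name; use whichever database holds the rating table

# Same index-mapping query as above, returning only the zero-based coordinates.
pairs = conn.execute("""
    WITH UserMap AS (
        SELECT UserId, ROW_NUMBER() OVER (ORDER BY UserId) - 1 AS user_index
        FROM (SELECT DISTINCT UserId FROM rating) AS unique_users
    ),
    MovieMap AS (
        SELECT MovieId, ROW_NUMBER() OVER (ORDER BY MovieId) - 1 AS movie_index
        FROM (SELECT DISTINCT MovieId FROM rating) AS unique_movies
    )
    SELECT u.user_index, m.movie_index
    FROM rating r
    JOIN UserMap u ON r.UserId = u.UserId
    JOIN MovieMap m ON r.MovieId = m.MovieId
""").fetchdf()

# One row per user, one column per movie; a 1 marks an observed rating.
n_users = int(pairs["user_index"].max()) + 1
n_movies = int(pairs["movie_index"].max()) + 1
interactions = coo_matrix(
    ([1] * len(pairs), (pairs["user_index"], pairs["movie_index"])),
    shape=(n_users, n_movies),
)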

-- building the churn dataset
drop table if exists churn_data;

create table if not exists churn_data as
select *
from read_json(
    'capstone_data/capstone.*.jsonl',
    columns = {id: 'varchar',
               age: 'int',
               tenure: 'int',
               service_type: 'varchar',
               avg_call_duration: 'float',
               data_usage: 'float',
               roaming_usage: 'float',
               monthly_charge: 'float',
               overdue_payments: 'int',
               auto_payment: 'bool',
               avg_top_up_count: 'float',
               call_drops: 'int',
               customer_support_calls: 'int',
               satisfaction_score: 'float',
               apps: 'varchar',
               churn: 'bool'
              }
);

CREATE OR REPLACE TABLE churn_data as (SELECT a.*,
b.izlego, b.ritimgo, b.cuzdanx, b.hizlipazar, b.konusalim,
b.prepaid, b.postpaid, b.broadband
FROM churn_data a
LEFT JOIN (
SELECT
id,
apps,
service_type,
CASE WHEN apps LIKE '%İzleGo%' THEN 1 ELSE 0 END AS izlego,
CASE WHEN apps LIKE '%RitimGo%' THEN 1 ELSE 0 END AS ritimgo,
CASE WHEN apps LIKE '%CüzdanX%' THEN 1 ELSE 0 END AS cuzdanx,
CASE WHEN apps LIKE '%HızlıPazar%' THEN 1 ELSE 0 END AS hizlipazar,
CASE WHEN apps LIKE '%Konuşalım%' THEN 1 ELSE 0 END AS konusalim,
CASE WHEN service_type = 'Prepaid' THEN 1 ELSE 0 END AS prepaid,
CASE WHEN service_type = 'Postpaid' THEN 1 ELSE 0 END AS postpaid,
CASE WHEN service_type = 'Broadband' THEN 1 ELSE 0 END AS broadband
FROM churn_data
) b
USING(id));

-- null-count queries for each column
SELECT
'id' AS column_name, COUNT(*) - COUNT(id) AS null_count FROM churn_data
UNION ALL
SELECT
'age', COUNT(*) - COUNT(age) FROM churn_data
UNION ALL
SELECT
'tenure', COUNT(*) - COUNT(tenure) FROM churn_data
UNION ALL
SELECT
'service_type', COUNT(*) - COUNT(service_type) FROM churn_data
UNION ALL
SELECT
'avg_call_duration', COUNT(*) - COUNT(avg_call_duration) FROM churn_data
UNION ALL
SELECT
'data_usage', COUNT(*) - COUNT(data_usage) FROM churn_data
UNION ALL
SELECT
'roaming_usage', COUNT(*) - COUNT(roaming_usage) FROM churn_data
UNION ALL
SELECT
'monthly_charge', COUNT(*) - COUNT(monthly_charge) FROM churn_data
UNION ALL
SELECT
'overdue_payments', COUNT(*) - COUNT(overdue_payments) FROM churn_data
UNION ALL
SELECT
'auto_payment', COUNT(*) - COUNT(auto_payment) FROM churn_data
UNION ALL
SELECT
'avg_top_up_count', COUNT(*) - COUNT(avg_top_up_count) FROM churn_data
UNION ALL
SELECT
'call_drops', COUNT(*) - COUNT(call_drops) FROM churn_data
UNION ALL
SELECT
'customer_support_calls', COUNT(*) - COUNT(customer_support_calls) FROM churn_data
UNION ALL
SELECT
'satisfaction_score', COUNT(*) - COUNT(satisfaction_score) FROM churn_data
UNION ALL
SELECT
'apps', COUNT(*) - COUNT(apps) FROM churn_data
UNION ALL
SELECT
'churn', COUNT(*) - COUNT(churn) FROM churn_data;

