Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"sqltools.useNodeRuntime": true,
"sqltools.connections": [
{
"previewLimit": 50,
"driver": "SQLite",
"database": "${workspaceFolder:tt-bootcamp}/you_do_1/data/new_db.db",
"name": "connection 1"
}
]
}
Binary file added week2/capstone/best_model.pkl
Binary file not shown.
727 changes: 727 additions & 0 deletions week2/capstone/capstone.ipynb

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions week2/capstone/capstone_queries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
-- Load the raw churn dataset from the JSONL shards into a persistent table.
-- IF NOT EXISTS keeps the script idempotent on re-runs.
-- Fix: string literals use single quotes — double quotes are identifier
-- quoting in SQL and are rejected/deprecated by DuckDB for strings.
CREATE TABLE IF NOT EXISTS churn_data AS
SELECT *
FROM read_json(
    'capstone_data/capstone.*.jsonl',   -- glob over all shard files
    columns = {
        'id':                     'varchar',
        'age':                    'int',
        'tenure':                 'int',     -- months as customer (presumably; TODO confirm)
        'service_type':           'varchar', -- Prepaid / Postpaid / Broadband
        'avg_call_duration':      'float',
        'data_usage':             'float',
        'roaming_usage':          'float',
        'monthly_charge':         'float',
        'overdue_payments':       'int',
        'auto_payment':           'bool',
        'avg_top_up_count':       'float',
        'call_drops':             'int',
        'customer_support_calls': 'int',
        'satisfaction_score':     'float',
        'apps':                   'varchar', -- comma/list-style string of app names
        'churn':                  'bool'     -- target label
    }
);

-- One-hot encode the `apps` list and `service_type` as 0/1 feature columns.
-- Fix: the original computed these in a subquery over churn_data and then
-- LEFT JOINed the table back to itself USING(id) — an extra full scan and
-- join that also silently duplicates rows if `id` is ever non-unique.
-- The derived columns depend only on the current row, so compute them inline.
-- NOTE: when `apps` is NULL, LIKE yields NULL and CASE falls to ELSE 0,
-- matching the original behavior.
CREATE OR REPLACE TABLE churn_data AS
SELECT
    *,
    CASE WHEN apps LIKE '%İzleGo%'     THEN 1 ELSE 0 END AS izlego,
    CASE WHEN apps LIKE '%RitimGo%'    THEN 1 ELSE 0 END AS ritimgo,
    CASE WHEN apps LIKE '%CüzdanX%'    THEN 1 ELSE 0 END AS cuzdanx,
    CASE WHEN apps LIKE '%HızlıPazar%' THEN 1 ELSE 0 END AS hizlipazar,
    CASE WHEN apps LIKE '%Konuşalım%'  THEN 1 ELSE 0 END AS konusalim,
    CASE WHEN service_type = 'Prepaid'   THEN 1 ELSE 0 END AS prepaid,
    CASE WHEN service_type = 'Postpaid'  THEN 1 ELSE 0 END AS postpaid,
    CASE WHEN service_type = 'Broadband' THEN 1 ELSE 0 END AS broadband
FROM churn_data;

-- Missing-data report: one row per column with its NULL count.
-- COUNT(*) counts every row while COUNT(col) counts only non-NULL values,
-- so their difference is the number of missing entries in that column.
-- UNION ALL (not UNION) is deliberate: the rows are distinct by construction,
-- so no dedup pass is needed.
SELECT 'id' AS column_name, COUNT(*) - COUNT(id) AS null_count FROM churn_data
UNION ALL SELECT 'age', COUNT(*) - COUNT(age) FROM churn_data
UNION ALL SELECT 'tenure', COUNT(*) - COUNT(tenure) FROM churn_data
UNION ALL SELECT 'service_type', COUNT(*) - COUNT(service_type) FROM churn_data
UNION ALL SELECT 'avg_call_duration', COUNT(*) - COUNT(avg_call_duration) FROM churn_data
UNION ALL SELECT 'data_usage', COUNT(*) - COUNT(data_usage) FROM churn_data
UNION ALL SELECT 'roaming_usage', COUNT(*) - COUNT(roaming_usage) FROM churn_data
UNION ALL SELECT 'monthly_charge', COUNT(*) - COUNT(monthly_charge) FROM churn_data
UNION ALL SELECT 'overdue_payments', COUNT(*) - COUNT(overdue_payments) FROM churn_data
UNION ALL SELECT 'auto_payment', COUNT(*) - COUNT(auto_payment) FROM churn_data
UNION ALL SELECT 'avg_top_up_count', COUNT(*) - COUNT(avg_top_up_count) FROM churn_data
UNION ALL SELECT 'call_drops', COUNT(*) - COUNT(call_drops) FROM churn_data
UNION ALL SELECT 'customer_support_calls', COUNT(*) - COUNT(customer_support_calls) FROM churn_data
UNION ALL SELECT 'satisfaction_score', COUNT(*) - COUNT(satisfaction_score) FROM churn_data
UNION ALL SELECT 'apps', COUNT(*) - COUNT(apps) FROM churn_data
UNION ALL SELECT 'churn', COUNT(*) - COUNT(churn) FROM churn_data;
118 changes: 118 additions & 0 deletions week2/capstone/file_handling.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import duckdb\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"pd.set_option('future.no_silent_downcasting', True)\n",
"\n",
"conn = duckdb.connect(database=\"capstone.db\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = conn.execute(\"SELECT * FROM churn_data\").fetchdf()\n",
"df[[\"auto_payment\", \"churn\"]] = df[[\"auto_payment\", \"churn\"]].apply(lambda x: x.map({True: 1, False: 0}))\n",
"X, y = df.drop(columns=[\"churn\", \"apps\", \"service_type\"]), df[\"churn\"] \n",
"\n",
"X.loc[X['broadband'] == 1, ['roaming_usage', 'call_drops', 'avg_call_duration']] = X.loc[X['broadband'] == 1, ['roaming_usage', 'call_drops', 'avg_call_duration']].fillna(0)\n",
"X.loc[(X['broadband'] == 0) & (X['postpaid'] == 0), 'auto_payment'] = X.loc[(X['broadband'] == 0) & (X['postpaid'] == 0), 'auto_payment'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def fill_missing_values(df_train, df_test, condition_column, columns):\n",
"\n",
" for condition in condition_column:\n",
" \n",
" df_train_filtered = df_train[df_train[condition] == 1]\n",
" df_test_filtered = df_test[df_test[condition] == 1]\n",
"\n",
" for col in columns:\n",
" min_val, max_val = df_train_filtered[col].min(), df_train_filtered[col].max()\n",
"\n",
" \n",
" missing_train_idx = df_train_filtered[df_train_filtered[col].isnull()].index\n",
" missing_test_idx = df_test_filtered[df_test_filtered[col].isnull()].index\n",
"\n",
" train_random_values = np.random.uniform(min_val, max_val, size=len(missing_train_idx)).astype(np.float32)\n",
" test_random_values = np.random.uniform(min_val, max_val, size=len(missing_test_idx)).astype(np.float32)\n",
"\n",
" df_train.loc[missing_train_idx, col] = train_random_values\n",
" df_test.loc[missing_test_idx, col] = test_random_values\n",
"\n",
" return df_train, df_test\n",
"\n",
"X_train, X_test = fill_missing_values(X_train, X_test, ['broadband','prepaid','postpaid'], ['avg_call_duration', 'data_usage', 'monthly_charge'])\n",
"\n",
"X_train.drop(columns=['prepaid'], inplace=True) \n",
"X_test.drop(columns=['prepaid'], inplace=True) # dummy variable trap\n",
"\n",
"X_train.fillna({'auto_payment': 0}, inplace=True)\n",
"X_test.fillna({'auto_payment': 0}, inplace=True)\n",
"\n",
"X_train.fillna({'tenure': round(X_train['tenure'].mean())}, inplace=True)\n",
"X_test.fillna({'tenure': round(X_train['tenure'].mean())}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"train_data = pd.concat([X_train, y_train], axis=1)\n",
"test_data = pd.concat([X_test, y_test], axis=1)\n",
"\n",
"train_data.to_parquet(\"train_data.parquet\", engine='pyarrow')\n",
"test_data.to_parquet(\"test_data.parquet\", engine='pyarrow')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py310",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion week2/sunday/create_netflix.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- Bulk-load the sharded rating files into one table with an explicit schema
-- (glob matches every rating_*.txt shard).
CREATE TABLE rating AS
SELECT *
FROM read_csv(
    '../../you_do_1/data/rating_*.txt',
    columns = {'MovieId': 'int', 'UserId': 'int', 'Date': 'date', 'Rate': 'int'}
);

-- This fails because some Title fields contain commas, which break the ',' delimiter parsing.
-- create table movie_title as select * from read_csv('../../you_do_1/data/movie_titles.csv', columns = {'MovieId':'int','PublishedYear':'int', 'Title':'varchar'}, header=false, delim=',', auto_detect=false);
-- create table movie_title as select * from read_csv('../../you_do_1/data/movie_titles.csv', columns = {'MovieId':'int','PublishedYear':'int', 'Title':'varchar'}, header=false, delim=',', auto_detect=false)
5 changes: 5 additions & 0 deletions week2/sunday/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import duckdb

# Open (or create) the DuckDB database file used by the week-2 exercises.
# NOTE(review): the path is relative to the current working directory — run
# from the repository root, or this will create an empty my_database.db
# somewhere else instead of opening the intended database.
conn = duckdb.connect(database="you_do_1/data/my_database.db")

Loading