Update 04_preprocessing_and_training.ipynb
Completed notebook w/o summary
JLindsey96 authored Jun 27, 2024
commit bc94fc1db5691d46b5184bfd10a3c35aff9744d8
88 changes: 44 additions & 44 deletions Notebooks/04_preprocessing_and_training.ipynb
@@ -986,10 +986,10 @@
"#Save the 'Name', 'state', and 'Region' columns from the train/test data into names_train and names_test\n",
"#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'\n",
"names_list = ['Name', 'state', 'Region']\n",
"names_train = X_train[___]\n",
"names_test = X_test[___]\n",
"X_train.___(columns=names_list, inplace=___)\n",
"X_test.___(columns=names_list, inplace=___)\n",
"names_train = X_train[names_list]\n",
"names_test = X_test[names_list]\n",
"X_train.drop(columns=names_list, inplace=True)\n",
"X_test.drop(columns=names_list, inplace=True)\n",
"X_train.shape, X_test.shape"
]
},
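A note on the pattern above: the identifier-like columns are saved before being dropped so rows can be matched back to resorts later. A minimal self-contained sketch of the same idea on toy data (column names here are illustrative, not the notebook's full schema):

```python
import pandas as pd

# Toy stand-in for the notebook's X_train; columns are illustrative only
X_train = pd.DataFrame({'Name': ['Alpha', 'Beta'], 'state': ['VT', 'CO'],
                        'Region': ['Northeast', 'West'], 'vertical_drop': [1000, 3000]})

names_list = ['Name', 'state', 'Region']
names_train = X_train[names_list]               # keep identifiers for later lookup
X_train.drop(columns=names_list, inplace=True)  # model sees only numeric features
print(X_train.columns.tolist())                 # ['vertical_drop']
```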
@@ -1001,7 +1001,7 @@
"source": [
"#Code task 2#\n",
"#Check the `dtypes` attribute of `X_train` to verify all features are numeric\n",
"X_train.___"
"X_train.dtypes"
]
},
{
@@ -1012,7 +1012,7 @@
"source": [
"#Code task 3#\n",
"#Repeat this check for the test split in `X_test`\n",
"X_test.___"
"X_test.dtypes"
]
},
{
@@ -1044,7 +1044,7 @@
"source": [
"#Code task 4#\n",
"#Calculate the mean of `y_train`\n",
"train_mean = y_train.___\n",
"train_mean = y_train.mean()\n",
"train_mean"
]
},
@@ -1066,8 +1066,8 @@
"#Hint, call its `.fit()` method with `X_train` and `y_train` as arguments\n",
"#Then print the object's `constant_` attribute and verify it's the same as the mean above\n",
"dumb_reg = DummyRegressor(strategy='mean')\n",
"dumb_reg.___(___, ___)\n",
"dumb_reg.___"
"dumb_reg.fit('X_train', `y_train`)\n",
"dumb_reg.constant_"
]
},
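`DummyRegressor` with `strategy='mean'` simply memorizes the training mean, which makes it a useful baseline. A small sketch on toy data, showing that the fitted `constant_` (stored as a 2D array) matches the mean:

```python
import numpy as np
from sklearn.dummy import DummyRegressor

y = np.array([40.0, 50.0, 60.0])   # toy target
X = np.zeros((3, 1))               # features are ignored by the dummy model

dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X, y)
print(dumb_reg.constant_)                    # [[50.]] -- a 2D array
print(dumb_reg.constant_[0][0] == y.mean())  # True
```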
{
@@ -1140,9 +1140,9 @@
" ypred -- the predicted values\n",
" \"\"\"\n",
" ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)\n",
" sum_sq_tot = np.___((y - ybar)**2) #total sum of squares error\n",
" sum_sq_res = np.___((y - ypred)**2) #residual sum of squares error\n",
" R2 = 1.0 - ___ / ___\n",
" sum_sq_tot = np.mean((y - ybar)**2) #total sum of squares error\n",
" sum_sq_res = np.mean((y - ypred)**2) #residual sum of squares error\n",
" R2 = 1.0 - sum_sq_tot / sum_sq_res\n",
" return R2"
]
},
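The hand-rolled R² above can be sanity-checked against sklearn's own implementation. A quick sketch, assuming `sklearn.metrics` is available (as elsewhere in this notebook):

```python
import numpy as np
from sklearn.metrics import r2_score

def r_squared(y, ypred):
    """R^2 = 1 - SS_res / SS_tot (the ratio is unchanged if sums are replaced by means)."""
    ybar = np.mean(y)
    sum_sq_tot = np.sum((y - ybar)**2)   # total sum of squares
    sum_sq_res = np.sum((y - ypred)**2)  # residual sum of squares
    return 1.0 - sum_sq_res / sum_sq_tot

y = np.array([3.0, 5.0, 7.0])
ypred = np.array([2.5, 5.0, 7.5])
print(r_squared(y, ypred), r2_score(y, ypred))  # the two values should match
```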
@@ -1398,8 +1398,8 @@
" y -- the observed values\n",
" ypred -- the predicted values\n",
" \"\"\"\n",
" sq_error = (___ - ___)**2\n",
" mse = np.mean(___)\n",
" sq_error = (y_true - y_pred)**2\n",
" mse = np.mean(sq_error)\n",
" return mse"
]
},
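The same sanity check works for the custom MSE; taking its square root gives RMSE, which is back in the units of the target. A brief sketch:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

y = np.array([3.0, 5.0, 7.0])
ypred = np.array([2.5, 5.0, 7.5])

mse = np.mean((y - ypred)**2)             # hand-rolled, as in the cell above
print(mse, mean_squared_error(y, ypred))  # both print 0.1666...
print(np.sqrt(mse))                       # RMSE, in the units of the target
```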
@@ -1805,8 +1805,8 @@
"#Code task 9#\n",
"#Call `X_train` and `X_test`'s `fillna()` method, passing `X_defaults_median` as the values to use\n",
"#Assign the results to `X_tr` and `X_te`, respectively\n",
"X_tr = X_train.___(___)\n",
"X_te = X_test.___(___)"
"X_tr = X_train.fillna(X_defaults_median)\n",
"X_te = X_test.fillna(X_defaults_median)"
]
},
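Worth noting: `X_defaults_median` is presumably computed from the training split alone, so that filling both splits reuses train statistics and no information leaks from the test set. A toy sketch of that pattern:

```python
import numpy as np
import pandas as pd

X_train = pd.DataFrame({'snow': [10.0, np.nan, 30.0]})
X_test = pd.DataFrame({'snow': [np.nan, 50.0]})

X_defaults_median = X_train.median()      # statistics come from the train split only
X_tr = X_train.fillna(X_defaults_median)  # train gap -> 20.0
X_te = X_test.fillna(X_defaults_median)   # test gap also -> 20.0, not a test statistic
print(X_tr['snow'].tolist(), X_te['snow'].tolist())
```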
{
@@ -1834,9 +1834,9 @@
"#then use it's `transform()` method to apply the scaling to both the train and test split\n",
"#data (`X_tr` and `X_te`), naming the results `X_tr_scaled` and `X_te_scaled`, respectively\n",
"scaler = StandardScaler()\n",
"scaler.___(X_tr)\n",
"X_tr_scaled = scaler.___(X_tr)\n",
"X_te_scaled = scaler.___(X_te)"
"scaler.fit(X_tr)\n",
"X_tr_scaled = scaler.transform(X_tr)\n",
"X_te_scaled = scaler.transform(X_te)"
]
},
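The scaler follows the same fit-on-train, transform-both discipline. A sketch verifying the behavior on toy data (train columns come out with roughly zero mean and unit variance; test is scaled with the train statistics):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X_tr = np.array([[1.0], [2.0], [3.0]])   # toy train split
X_te = np.array([[4.0]])                 # toy test split

scaler = StandardScaler()
scaler.fit(X_tr)                          # learn mean/std from train only
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)      # test is scaled with *train* statistics
print(X_tr_scaled.mean(), X_tr_scaled.std())  # ~0.0 and 1.0
print(X_te_scaled)                        # positive: 4.0 sits above the train mean
```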
{
@@ -1871,8 +1871,8 @@
"#Code task 11#\n",
"#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data\n",
"#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively\n",
"y_tr_pred = lm.___(X_tr_scaled)\n",
"y_te_pred = lm.___(X_te_scaled)"
"y_tr_pred = lm.predict(X_tr_scaled)\n",
"y_te_pred = lm.predict(X_te_scaled)"
]
},
{
@@ -1921,7 +1921,7 @@
"#Now calculate the mean absolute error scores using `sklearn`'s `mean_absolute_error` function\n",
"# as we did above for R^2\n",
"# MAE - train, test\n",
"median_mae = ___(y_train, y_tr_pred), ___(y_test, y_te_pred)\n",
"median_mae = mae_score(y_train, y_tr_pred), mae_score(y_test, y_te_pred)\n",
"median_mae"
]
},
@@ -1941,7 +1941,7 @@
"#Code task 13#\n",
"#And also do the same using `sklearn`'s `mean_squared_error`\n",
"# MSE - train, test\n",
"median_mse = ___(___, ___), ___(___, ___)\n",
"median_mse = mse_score(y_train, y_tr_pred), mse_score(y_test, y_te_pred)\n",
"median_mse"
]
},
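Both sklearn metrics share the `(y_true, y_pred)` call signature used above. A tiny worked example with toy numbers, assuming the usual `sklearn.metrics` imports:

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_train = np.array([40.0, 50.0, 60.0])     # toy observed values
y_tr_pred = np.array([42.0, 49.0, 58.0])   # toy predictions

# Both metrics take (y_true, y_pred); lower is better for each
print(mean_absolute_error(y_train, y_tr_pred))  # (2+1+2)/3 = 1.67
print(mean_squared_error(y_train, y_tr_pred))   # (4+1+4)/3 = 3.0
```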
@@ -1975,7 +1975,7 @@
"#Code task 14#\n",
"#As we did for the median above, calculate mean values for imputing missing values\n",
"# These are the values we'll use to fill in any missing values\n",
"X_defaults_mean = X_train.___()\n",
"X_defaults_mean = X_train.mean()\n",
"X_defaults_mean"
]
},
@@ -2241,7 +2241,7 @@
"source": [
"#Code task 15#\n",
"#Call the pipe's `fit()` method with `X_train` and `y_train` as arguments\n",
"pipe.___(___, ___)"
"pipe.fit(X_train, y_train)"
]
},
{
@@ -2459,7 +2459,7 @@
"pipe = make_pipeline(\n",
" SimpleImputer(strategy='median'), \n",
" StandardScaler(),\n",
" ___(___),\n",
" f_regression(SelectKBest),\n",
" LinearRegression()\n",
")"
]
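Note the argument order in the pipeline above: `SelectKBest` takes the scoring function (`f_regression` here) as its first argument, and `k` defaults to 10 when omitted. A self-contained sketch:

```python
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression

X, y = make_regression(n_samples=100, n_features=8, n_informative=3, random_state=47)

# The score function comes first; k defaults to 10 if not given
selector = SelectKBest(f_regression, k=3)
X_new = selector.fit_transform(X, y)
print(X_new.shape)               # (100, 3)
print(selector.get_support())    # boolean mask over the original 8 columns
```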
@@ -2577,7 +2577,7 @@
"pipe15 = make_pipeline(\n",
" SimpleImputer(strategy='median'), \n",
" StandardScaler(),\n",
" ___(___, k=___),\n",
" f_regression(SelectKBest, k=15),\n",
" LinearRegression()\n",
")"
]
@@ -2804,7 +2804,7 @@
"#Code task 18#\n",
"#Call `pipe`'s `get_params()` method to get a dict of available parameters and print their names\n",
"#using dict's `keys()` method\n",
"pipe.___.keys()"
"pipe.get_params().keys()"
]
},
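Pipeline hyperparameters follow the `stepname__param` naming that `get_params()` reveals, which is what lets a grid search tune `k` inside the pipeline. A sketch of wiring this up (the `k` range shown is illustrative only):

```python
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(),
                     SelectKBest(f_regression), LinearRegression())

# make_pipeline lowercases class names, so the step is 'selectkbest'
# and its k parameter is addressed as 'selectkbest__k'
grid_params = {'selectkbest__k': list(range(1, 6))}   # illustrative range only
lr_grid_cv = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1)
```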
{
@@ -2892,7 +2892,7 @@
"source": [
"#Code task 19#\n",
"#Print the `best_params_` attribute of `lr_grid_cv`\n",
"lr_grid_cv.___"
"lr_grid_cv.best_params_"
]
},
{
@@ -2903,7 +2903,7 @@
"source": [
"#Code task 20#\n",
"#Assign the value of k from the above dict of `best_params_` and assign it to `best_k`\n",
"___ = lr_grid_cv.___['selectkbest__k']\n",
"best_k = lr_grid_cv.best_params_['selectkbest__k']\n",
"plt.subplots(figsize=(10, 5))\n",
"plt.errorbar(cv_k, score_mean, yerr=score_std)\n",
"plt.axvline(x=best_k, c='r', ls='--', alpha=.5)\n",
@@ -2955,7 +2955,7 @@
"#sorting the values in descending order\n",
"coefs = lr_grid_cv.best_estimator_.named_steps.linearregression.coef_\n",
"features = X_train.columns[selected]\n",
"pd.Series(___, index=___).___(ascending=___)"
"pd.Series(coefs, index=features).sort(ascending=False)"
]
},
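The `selected` mask used above presumably comes from the fitted `SelectKBest` step's `get_support()` method. A self-contained sketch of extracting both the mask and the coefficients from a fitted pipeline:

```python
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

X, y = make_regression(n_samples=100, n_features=5, random_state=47)
X = pd.DataFrame(X, columns=[f'feat_{i}' for i in range(5)])

pipe = make_pipeline(SelectKBest(f_regression, k=3), LinearRegression()).fit(X, y)
selected = pipe.named_steps.selectkbest.get_support()   # mask of the k kept columns
coefs = pipe.named_steps.linearregression.coef_
print(pd.Series(coefs, index=X.columns[selected]).sort_values(ascending=False))
```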
{
@@ -3000,9 +3000,9 @@
"#StandardScaler(),\n",
"#and then RandomForestRegressor() with a random state of 47\n",
"RF_pipe = make_pipeline(\n",
" ___(strategy=___),\n",
" ___,\n",
" ___(random_state=___)\n",
" SimpleImputer(strategy=median),\n",
" StandardScaler(),\n",
" RandomForestRegressor(random_state=47)\n",
")"
]
},
@@ -3023,7 +3023,7 @@
"#Call `cross_validate` to estimate the pipeline's performance.\n",
"#Pass it the random forest pipe object, `X_train` and `y_train`,\n",
"#and get it to use 5-fold cross-validation\n",
"rf_default_cv_results = cross_validate(___, ___, ___, cv=___)"
"rf_default_cv_results = cross_validate(RF_pipe, X_train, y_train, cv=5-fold)"
]
},
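`cross_validate` returns a dict of per-fold results; for a regressor the default `test_score` entry holds one R² per fold. A small sketch:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

X, y = make_regression(n_samples=100, n_features=5, random_state=47)
rf = RandomForestRegressor(random_state=47)

cv_results = cross_validate(rf, X, y, cv=5)     # cv=5 requests 5-fold cross-validation
print(sorted(cv_results.keys()))                # fit_time, score_time, test_score
print(np.mean(cv_results['test_score']))        # mean R^2 across the 5 folds
```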
{
@@ -3137,7 +3137,7 @@
"#Code task 24#\n",
"#Call `GridSearchCV` with the random forest pipeline, passing in the above `grid_params`\n",
"#dict for parameters to evaluate, 5-fold cross-validation, and all available CPU cores (if desired)\n",
"rf_grid_cv = GridSearchCV(___, param_grid=___, cv=___, n_jobs=-1)"
"rf_grid_cv = GridSearchCV(RF_pipe, param_grid=grid_params, cv=5-fold, n_jobs=-1)"
]
},
{
@@ -3149,7 +3149,7 @@
"#Code task 25#\n",
"#Now call the `GridSearchCV`'s `fit()` method with `X_train` and `y_train` as arguments\n",
"#to actually start the grid search. This may take a minute or two.\n",
"rf_grid_cv.___(___, ___)"
"rf_grid_cv.fit(X_train, y_train)"
]
},
{
@@ -3160,7 +3160,7 @@
"source": [
"#Code task 26#\n",
"#Print the best params (`best_params_` attribute) from the grid search\n",
"rf_grid_cv.___"
"rf_grid_cv.best_params_"
]
},
{
@@ -3233,7 +3233,7 @@
"#training data column names, sorting the values in descending order\n",
"plt.subplots(figsize=(10, 5))\n",
"imps = rf_grid_cv.best_estimator_.named_steps.randomforestregressor.___\n",
"rf_feat_imps = pd.Series(___, index=X_train.columns).sort_values(ascending=False)\n",
"rf_feat_imps = pd.Series(data=training, index=X_train.columns).sort_values(ascending=False)\n",
"rf_feat_imps.plot(kind='bar')\n",
"plt.xlabel('features')\n",
"plt.ylabel('importance')\n",
@@ -3492,12 +3492,12 @@
"#and the current datetime (`datetime.datetime.now()`) to the `build_datetime` attribute\n",
"#Let's call this model version '1.0'\n",
"best_model = rf_grid_cv.best_estimator_\n",
"best_model.version = ___\n",
"best_model.pandas_version = ___\n",
"best_model.numpy_version = ___\n",
"best_model.sklearn_version = ___\n",
"best_model.version = 1.0\n",
"best_model.pandas_version = pd.__version__\n",
"best_model.numpy_version = np.__version__\n",
"best_model.sklearn_version = sklearn_version\n",
"best_model.X_columns = [col for col in X_train.columns]\n",
"best_model.build_datetime = ___"
"best_model.build_datetime = datetime.datetime.now()"
]
},
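Stamping versions and a build time onto the model pays off when the model is persisted and reloaded later; the notebook presumably saves `best_model` in a following cell. A hedged sketch using joblib (the file name is an assumption):

```python
import datetime
import joblib  # commonly used for persisting sklearn models
from sklearn.dummy import DummyRegressor

best_model = DummyRegressor().fit([[0.0]], [0.0])   # stand-in for rf_grid_cv.best_estimator_
best_model.version = '1.0'
best_model.build_datetime = datetime.datetime.now()

# File name is illustrative; the notebook may use a different path
joblib.dump(best_model, 'ski_resort_pricing_model.pkl')
loaded = joblib.load('ski_resort_pricing_model.pkl')
print(loaded.version, loaded.build_datetime)
```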
{