Update 04_preprocessing_and_training.ipynb
Completed notebook w/o summary
JLindsey96 authored Jun 27, 2024
commit bc94fc1db5691d46b5184bfd10a3c35aff9744d8
88 changes: 44 additions & 44 deletions Notebooks/04_preprocessing_and_training.ipynb
@@ -986,10 +986,10 @@
"#Save the 'Name', 'state', and 'Region' columns from the train/test data into names_train and names_test\n",
"#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'\n",
"names_list = ['Name', 'state', 'Region']\n",
"names_train = X_train[___]\n",
"names_test = X_test[___]\n",
"X_train.___(columns=names_list, inplace=___)\n",
"X_test.___(columns=names_list, inplace=___)\n",
"names_train = X_train[names_list]\n",
"names_test = X_test[names_list]\n",
"X_train.drop(columns=names_list, inplace=True)\n",
"X_test.drop(columns=names_list, inplace=True)\n",
"X_train.shape, X_test.shape"
]
},
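A note on the pattern above: the identifier-like columns are saved before being dropped so rows can be matched back to resorts later. A minimal self-contained sketch of the same idea on toy data (column names here are illustrative, not the notebook's full schema):

```python
import pandas as pd

# Toy stand-in for the notebook's X_train; columns are illustrative only
X_train = pd.DataFrame({'Name': ['Alpha', 'Beta'], 'state': ['VT', 'CO'],
                        'Region': ['Northeast', 'West'], 'vertical_drop': [1000, 3000]})

names_list = ['Name', 'state', 'Region']
names_train = X_train[names_list]               # keep identifiers for later lookup
X_train.drop(columns=names_list, inplace=True)  # model sees only numeric features
print(X_train.columns.tolist())                 # ['vertical_drop']
```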
@@ -1001,7 +1001,7 @@
"source": [
"#Code task 2#\n",
"#Check the `dtypes` attribute of `X_train` to verify all features are numeric\n",
"X_train.___"
"X_train.dtypes"
]
},
{
@@ -1012,7 +1012,7 @@
"source": [
"#Code task 3#\n",
"#Repeat this check for the test split in `X_test`\n",
"X_test.___"
"X_test.dtypes"
]
},
{
@@ -1044,7 +1044,7 @@
"source": [
"#Code task 4#\n",
"#Calculate the mean of `y_train`\n",
"train_mean = y_train.___\n",
"train_mean = y_train.mean()\n",
"train_mean"
]
},
@@ -1066,8 +1066,8 @@
"#Hint, call its `.fit()` method with `X_train` and `y_train` as arguments\n",
"#Then print the object's `constant_` attribute and verify it's the same as the mean above\n",
"dumb_reg = DummyRegressor(strategy='mean')\n",
"dumb_reg.___(___, ___)\n",
"dumb_reg.___"
"dumb_reg.fit('X_train', `y_train`)\n",
"dumb_reg.constant_"
]
},
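`DummyRegressor` with `strategy='mean'` simply memorizes the training mean, which makes it a useful baseline. A small sketch on toy data, showing that the fitted `constant_` (stored as a 2D array) matches the mean:

```python
import numpy as np
from sklearn.dummy import DummyRegressor

y = np.array([40.0, 50.0, 60.0])   # toy target
X = np.zeros((3, 1))               # features are ignored by the dummy model

dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X, y)
print(dumb_reg.constant_)                    # [[50.]] -- a 2D array
print(dumb_reg.constant_[0][0] == y.mean())  # True
```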
{
@@ -1140,9 +1140,9 @@
" ypred -- the predicted values\n",
" \"\"\"\n",
" ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)\n",
" sum_sq_tot = np.___((y - ybar)**2) #total sum of squares error\n",
" sum_sq_res = np.___((y - ypred)**2) #residual sum of squares error\n",
" R2 = 1.0 - ___ / ___\n",
" sum_sq_tot = np.mean((y - ybar)**2) #total sum of squares error\n",
" sum_sq_res = np.mean((y - ypred)**2) #residual sum of squares error\n",
" R2 = 1.0 - sum_sq_tot / sum_sq_res\n",
" return R2"
]
},
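The hand-rolled R² above can be sanity-checked against sklearn's own implementation. A quick sketch, assuming `sklearn.metrics` is available (as elsewhere in this notebook):

```python
import numpy as np
from sklearn.metrics import r2_score

def r_squared(y, ypred):
    """R^2 = 1 - SS_res / SS_tot (the ratio is unchanged if sums are replaced by means)."""
    ybar = np.mean(y)
    sum_sq_tot = np.sum((y - ybar)**2)   # total sum of squares
    sum_sq_res = np.sum((y - ypred)**2)  # residual sum of squares
    return 1.0 - sum_sq_res / sum_sq_tot

y = np.array([3.0, 5.0, 7.0])
ypred = np.array([2.5, 5.0, 7.5])
print(r_squared(y, ypred), r2_score(y, ypred))  # the two values should match
```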
@@ -1398,8 +1398,8 @@
" y -- the observed values\n",
" ypred -- the predicted values\n",
" \"\"\"\n",
" sq_error = (___ - ___)**2\n",
" mse = np.mean(___)\n",
" sq_error = (y_true - y_pred)**2\n",
" mse = np.mean(sq_error)\n",
" return mse"
]
},
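The same sanity check works for the custom MSE; taking its square root gives RMSE, which is back in the units of the target. A brief sketch:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

y = np.array([3.0, 5.0, 7.0])
ypred = np.array([2.5, 5.0, 7.5])

mse = np.mean((y - ypred)**2)             # hand-rolled, as in the cell above
print(mse, mean_squared_error(y, ypred))  # both print 0.1666...
print(np.sqrt(mse))                       # RMSE, in the units of the target
```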
@@ -1805,8 +1805,8 @@
"#Code task 9#\n",
"#Call `X_train` and `X_test`'s `fillna()` method, passing `X_defaults_median` as the values to use\n",
"#Assign the results to `X_tr` and `X_te`, respectively\n",
"X_tr = X_train.___(___)\n",
"X_te = X_test.___(___)"
"X_tr = X_train.fillna(X_defaults_median)\n",
"X_te = X_test.fillna(X_defaults_median)"
]
},
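Worth noting: `X_defaults_median` is presumably computed from the training split alone, so that filling both splits reuses train statistics and no information leaks from the test set. A toy sketch of that pattern:

```python
import numpy as np
import pandas as pd

X_train = pd.DataFrame({'snow': [10.0, np.nan, 30.0]})
X_test = pd.DataFrame({'snow': [np.nan, 50.0]})

X_defaults_median = X_train.median()      # statistics come from the train split only
X_tr = X_train.fillna(X_defaults_median)  # train gap -> 20.0
X_te = X_test.fillna(X_defaults_median)   # test gap also -> 20.0, not a test statistic
print(X_tr['snow'].tolist(), X_te['snow'].tolist())
```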
{
@@ -1834,9 +1834,9 @@
"#then use it's `transform()` method to apply the scaling to both the train and test split\n",
"#data (`X_tr` and `X_te`), naming the results `X_tr_scaled` and `X_te_scaled`, respectively\n",
"scaler = StandardScaler()\n",
"scaler.___(X_tr)\n",
"X_tr_scaled = scaler.___(X_tr)\n",
"X_te_scaled = scaler.___(X_te)"
"scaler.fit(X_tr)\n",
"X_tr_scaled = scaler.transform(X_tr)\n",
"X_te_scaled = scaler.transform(X_te)"
]
},
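The scaler follows the same fit-on-train, transform-both discipline. A sketch verifying the behavior on toy data (train columns come out with roughly zero mean and unit variance; test is scaled with the train statistics):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X_tr = np.array([[1.0], [2.0], [3.0]])   # toy train split
X_te = np.array([[4.0]])                 # toy test split

scaler = StandardScaler()
scaler.fit(X_tr)                          # learn mean/std from train only
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)      # test is scaled with *train* statistics
print(X_tr_scaled.mean(), X_tr_scaled.std())  # ~0.0 and 1.0
print(X_te_scaled)                        # positive: 4.0 sits above the train mean
```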
{
@@ -1871,8 +1871,8 @@
"#Code task 11#\n",
"#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data\n",
"#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively\n",
"y_tr_pred = lm.___(X_tr_scaled)\n",
"y_te_pred = lm.___(X_te_scaled)"
"y_tr_pred = lm.predict(X_tr_scaled)\n",
"y_te_pred = lm.predict(X_te_scaled)"
]
},
{
@@ -1921,7 +1921,7 @@
"#Now calculate the mean absolute error scores using `sklearn`'s `mean_absolute_error` function\n",
"# as we did above for R^2\n",
"# MAE - train, test\n",
"median_mae = ___(y_train, y_tr_pred), ___(y_test, y_te_pred)\n",
"median_mae = mae_score(y_train, y_tr_pred), mae_score(y_test, y_te_pred)\n",
"median_mae"
]
},
@@ -1941,7 +1941,7 @@
"#Code task 13#\n",
"#And also do the same using `sklearn`'s `mean_squared_error`\n",
"# MSE - train, test\n",
"median_mse = ___(___, ___), ___(___, ___)\n",
"median_mse = mse_score(y_train, y_tr_pred), mse_score(y_test, y_te_pred)\n",
"median_mse"
]
},
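Both sklearn metrics share the `(y_true, y_pred)` call signature used above. A tiny worked example with toy numbers, assuming the usual `sklearn.metrics` imports:

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_train = np.array([40.0, 50.0, 60.0])     # toy observed values
y_tr_pred = np.array([42.0, 49.0, 58.0])   # toy predictions

# Both metrics take (y_true, y_pred); lower is better for each
print(mean_absolute_error(y_train, y_tr_pred))  # (2+1+2)/3 = 1.67
print(mean_squared_error(y_train, y_tr_pred))   # (4+1+4)/3 = 3.0
```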
@@ -1975,7 +1975,7 @@
"#Code task 14#\n",
"#As we did for the median above, calculate mean values for imputing missing values\n",
"# These are the values we'll use to fill in any missing values\n",
"X_defaults_mean = X_train.___()\n",
"X_defaults_mean = X_train.mean()\n",
"X_defaults_mean"
]
},
@@ -2241,7 +2241,7 @@
"source": [
"#Code task 15#\n",
"#Call the pipe's `fit()` method with `X_train` and `y_train` as arguments\n",
"pipe.___(___, ___)"
"pipe.fit(X_train, y_train)"
]
},
{
@@ -2459,7 +2459,7 @@
"pipe = make_pipeline(\n",
" SimpleImputer(strategy='median'), \n",
" StandardScaler(),\n",
" ___(___),\n",
" f_regression(SelectKBest),\n",
" LinearRegression()\n",
")"
]
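Note the argument order in the pipeline above: `SelectKBest` takes the scoring function (`f_regression` here) as its first argument, and `k` defaults to 10 when omitted. A self-contained sketch:

```python
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression

X, y = make_regression(n_samples=100, n_features=8, n_informative=3, random_state=47)

# The score function comes first; k defaults to 10 if not given
selector = SelectKBest(f_regression, k=3)
X_new = selector.fit_transform(X, y)
print(X_new.shape)               # (100, 3)
print(selector.get_support())    # boolean mask over the original 8 columns
```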
@@ -2577,7 +2577,7 @@
"pipe15 = make_pipeline(\n",
" SimpleImputer(strategy='median'), \n",
" StandardScaler(),\n",
" ___(___, k=___),\n",
" f_regression(SelectKBest, k=15),\n",
" LinearRegression()\n",
")"
]
@@ -2804,7 +2804,7 @@
"#Code task 18#\n",
"#Call `pipe`'s `get_params()` method to get a dict of available parameters and print their names\n",
"#using dict's `keys()` method\n",
"pipe.___.keys()"
"pipe.get_params().keys()"
]
},
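Pipeline hyperparameters follow the `stepname__param` naming that `get_params()` reveals, which is what lets a grid search tune `k` inside the pipeline. A sketch of wiring this up (the `k` range shown is illustrative only):

```python
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(),
                     SelectKBest(f_regression), LinearRegression())

# make_pipeline lowercases class names, so the step is 'selectkbest'
# and its k parameter is addressed as 'selectkbest__k'
grid_params = {'selectkbest__k': list(range(1, 6))}   # illustrative range only
lr_grid_cv = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1)
```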
{
@@ -2892,7 +2892,7 @@
"source": [
"#Code task 19#\n",
"#Print the `best_params_` attribute of `lr_grid_cv`\n",
"lr_grid_cv.___"
"lr_grid_cv.best_params_"
]
},
{
@@ -2903,7 +2903,7 @@
"source": [
"#Code task 20#\n",
"#Assign the value of k from the above dict of `best_params_` and assign it to `best_k`\n",
"___ = lr_grid_cv.___['selectkbest__k']\n",
"best_k = lr_grid_cv.best_params_['selectkbest__k']\n",
"plt.subplots(figsize=(10, 5))\n",
"plt.errorbar(cv_k, score_mean, yerr=score_std)\n",
"plt.axvline(x=best_k, c='r', ls='--', alpha=.5)\n",
@@ -2955,7 +2955,7 @@
"#sorting the values in descending order\n",
"coefs = lr_grid_cv.best_estimator_.named_steps.linearregression.coef_\n",
"features = X_train.columns[selected]\n",
"pd.Series(___, index=___).___(ascending=___)"
"pd.Series(coefs, index=features).sort(ascending=False)"
]
},
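The `selected` mask used above presumably comes from the fitted `SelectKBest` step's `get_support()` method. A self-contained sketch of extracting both the mask and the coefficients from a fitted pipeline:

```python
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

X, y = make_regression(n_samples=100, n_features=5, random_state=47)
X = pd.DataFrame(X, columns=[f'feat_{i}' for i in range(5)])

pipe = make_pipeline(SelectKBest(f_regression, k=3), LinearRegression()).fit(X, y)
selected = pipe.named_steps.selectkbest.get_support()   # mask of the k kept columns
coefs = pipe.named_steps.linearregression.coef_
print(pd.Series(coefs, index=X.columns[selected]).sort_values(ascending=False))
```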
{
@@ -3000,9 +3000,9 @@
"#StandardScaler(),\n",
"#and then RandomForestRegressor() with a random state of 47\n",
"RF_pipe = make_pipeline(\n",
" ___(strategy=___),\n",
" ___,\n",
" ___(random_state=___)\n",
" SimpleImputer(strategy=median),\n",
" StandardScaler(),\n",
" RandomForestRegressor(random_state=47)\n",
")"
]
},
@@ -3023,7 +3023,7 @@
"#Call `cross_validate` to estimate the pipeline's performance.\n",
"#Pass it the random forest pipe object, `X_train` and `y_train`,\n",
"#and get it to use 5-fold cross-validation\n",
"rf_default_cv_results = cross_validate(___, ___, ___, cv=___)"
"rf_default_cv_results = cross_validate(RF_pipe, X_train, y_train, cv=5-fold)"
]
},
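`cross_validate` returns a dict of per-fold results; for a regressor the default `test_score` entry holds one R² per fold. A small sketch:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

X, y = make_regression(n_samples=100, n_features=5, random_state=47)
rf = RandomForestRegressor(random_state=47)

cv_results = cross_validate(rf, X, y, cv=5)     # cv=5 requests 5-fold cross-validation
print(sorted(cv_results.keys()))                # fit_time, score_time, test_score
print(np.mean(cv_results['test_score']))        # mean R^2 across the 5 folds
```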
{
@@ -3137,7 +3137,7 @@
"#Code task 24#\n",
"#Call `GridSearchCV` with the random forest pipeline, passing in the above `grid_params`\n",
"#dict for parameters to evaluate, 5-fold cross-validation, and all available CPU cores (if desired)\n",
"rf_grid_cv = GridSearchCV(___, param_grid=___, cv=___, n_jobs=-1)"
"rf_grid_cv = GridSearchCV(RF_pipe, param_grid=grid_params, cv=5-fold, n_jobs=-1)"
]
},
{
@@ -3149,7 +3149,7 @@
"#Code task 25#\n",
"#Now call the `GridSearchCV`'s `fit()` method with `X_train` and `y_train` as arguments\n",
"#to actually start the grid search. This may take a minute or two.\n",
"rf_grid_cv.___(___, ___)"
"rf_grid_cv.fit(X_train, y_train)"
]
},
{
@@ -3160,7 +3160,7 @@
"source": [
"#Code task 26#\n",
"#Print the best params (`best_params_` attribute) from the grid search\n",
"rf_grid_cv.___"
"rf_grid_cv.best_params_"
]
},
{
@@ -3233,7 +3233,7 @@
"#training data column names, sorting the values in descending order\n",
"plt.subplots(figsize=(10, 5))\n",
"imps = rf_grid_cv.best_estimator_.named_steps.randomforestregressor.___\n",
"rf_feat_imps = pd.Series(___, index=X_train.columns).sort_values(ascending=False)\n",
"rf_feat_imps = pd.Series(data=training, index=X_train.columns).sort_values(ascending=False)\n",
"rf_feat_imps.plot(kind='bar')\n",
"plt.xlabel('features')\n",
"plt.ylabel('importance')\n",
@@ -3492,12 +3492,12 @@
"#and the current datetime (`datetime.datetime.now()`) to the `build_datetime` attribute\n",
"#Let's call this model version '1.0'\n",
"best_model = rf_grid_cv.best_estimator_\n",
"best_model.version = ___\n",
"best_model.pandas_version = ___\n",
"best_model.numpy_version = ___\n",
"best_model.sklearn_version = ___\n",
"best_model.version = 1.0\n",
"best_model.pandas_version = pd.__version__\n",
"best_model.numpy_version = np.__version__\n",
"best_model.sklearn_version = sklearn_version\n",
"best_model.X_columns = [col for col in X_train.columns]\n",
"best_model.build_datetime = ___"
"best_model.build_datetime = datetime.datetime.now()"
]
},
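Stamping versions and a build time onto the model pays off when the model is persisted and reloaded later; the notebook presumably saves `best_model` in a following cell. A hedged sketch using joblib (the file name is an assumption):

```python
import datetime
import joblib  # commonly used for persisting sklearn models
from sklearn.dummy import DummyRegressor

best_model = DummyRegressor().fit([[0.0]], [0.0])   # stand-in for rf_grid_cv.best_estimator_
best_model.version = '1.0'
best_model.build_datetime = datetime.datetime.now()

# File name is illustrative; the notebook may use a different path
joblib.dump(best_model, 'ski_resort_pricing_model.pkl')
loaded = joblib.load('ski_resort_pricing_model.pkl')
print(loaded.version, loaded.build_datetime)
```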
{