dragoneco
diff --git a/‎Chapter_6_sec_6.5_6.7.ipynb‎
Lines changed: 99 additions & 33 deletions b/‎Chapter_6_sec_6.5_6.7.ipynb‎
Lines changed: 99 additions & 33 deletions
@@ -17,7 +17,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\"This notebook contains the code for best subset selection, \n",
@@ -50,7 +52,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "Hitters = pd.read_csv('data/Hitters.csv', header=0, na_values='NA')\n",
@@ -65,7 +69,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "print np.sum(pd.isnull(Hitters['Salary'])) # number of NAs in Salary column'\n",
@@ -80,7 +86,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "y = Hitters.Salary  # the response variable \n",
@@ -130,7 +138,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "models = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -144,7 +154,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\" this show an example to plot the RSS of best models with different number of parameters\"\"\"\n",
@@ -169,7 +181,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\"\n",
@@ -223,7 +237,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "models2 = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -236,7 +252,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\"we can compare the results of best subset selection and the forward selection\"\"\"\n",
@@ -275,7 +293,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\"\n",
@@ -382,7 +402,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "models_validation = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -403,7 +425,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "print('Best max_feature variable from best subset selection on tranining')\n",
@@ -422,7 +446,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\" this show an example to plot the RSS of best models with different number of parameters for best subset with validation\"\"\"\n",
@@ -436,7 +462,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\" this show an example to plot the RSS of best models with different number of parameters for forward selection with validation\"\"\"\n",
@@ -481,7 +509,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "models_cv = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -499,7 +529,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "cv_errors_mean = cv_errors.mean(axis = 1)\n",
@@ -513,7 +545,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "\"\"\"From the above plot, we can see that the model with 5 variables yielded the smallest RSS.\n",
@@ -539,7 +573,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.preprocessing import scale \n",
@@ -589,7 +625,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "alphas = 10**np.linspace(10,-2,100)\n",
@@ -625,7 +663,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "print len(coeffs)\n",
@@ -644,7 +684,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "ax = plt.gca()\n",
@@ -659,7 +701,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "print alphas[49]\n",
@@ -691,7 +735,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "ridge = Ridge(fit_intercept=True, normalize=True, alpha=4)\n",
@@ -711,7 +757,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "ridgecv =  RidgeCV(alphas, scoring='mean_squared_error', normalize = True)\n",
@@ -722,7 +770,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "ridge_best = Ridge(alpha=ridgecv.alpha_, normalize=True)\n",
@@ -740,7 +790,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "pd.Series(ridge_best.coef_, index=X.columns)"
@@ -763,7 +815,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "lasso= Lasso(normalize=True, max_iter=1e5) \n",
@@ -786,7 +840,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "lassocv = LassoCV(alphas=None, cv=10, max_iter=1e5, normalize=True)\n",
@@ -800,7 +856,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# Some of the coefficients should reduce to exact zero\n",
@@ -917,7 +975,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "mse_train = []\n",
@@ -957,7 +1017,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X_train, X_test , y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.66)\n",
@@ -984,7 +1046,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "np.cumsum(pca.explained_variance_ratio_) * 100"
@@ -1007,7 +1071,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X_train, X_test , y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.66)\n",