Updated blog post for standard scaler

mGalarnyk · mGalarnyk · commit 12a369acfa09 · 2017-10-15T17:51:19.000-07:00
diff --git a/Sklearn/PCA/PCA_MNIST_Logistic_Regression_Speeding_Machine_Learning.ipynb b/Sklearn/PCA/PCA_MNIST_Logistic_Regression_Speeding_Machine_Learning.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 45,
    "metadata": {
     "collapsed": true
    },
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [
     {
@@ -46,7 +46,7 @@
        " 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 46,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -57,7 +57,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [
     {
@@ -66,7 +66,7 @@
        "(70000, 784)"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 47,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [
     {
@@ -87,7 +87,7 @@
        "(70000,)"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 48,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -97,39 +97,6 @@
     "mnist.target.shape"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Standardizing the Data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially, if it was measured on different scales."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Notebook going over the importance of feature Scaling: http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# Standardize features by removing the mean and scaling to unit variance\n",
-    "mnist.data = StandardScaler().fit_transform(mnist.data)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -139,7 +106,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 49,
    "metadata": {
     "collapsed": true
    },
@@ -151,71 +118,38 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 37,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(60000, 784)\n"
-     ]
-    }
-   ],
    "source": [
-    "print(train_img.shape)"
+    "## Standardizing the Data"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 38,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(60000,)\n"
-     ]
-    }
-   ],
    "source": [
-    "print(train_lbl.shape)"
+    "Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially if the features were measured on different scales."
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 39,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(10000, 784)\n"
-     ]
-    }
-   ],
    "source": [
-    "print(test_img.shape)"
+    "Notebook going over the importance of feature scaling: http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(10000,)\n"
-     ]
-    }
-   ],
+   "execution_count": 50,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
    "source": [
-    "print(test_lbl.shape)"
+    "from sklearn.preprocessing import StandardScaler\n",
+    "scaler = StandardScaler()\n",
+    "train_img = scaler.fit_transform(train_img)\n",
+    "test_img = scaler.transform(test_img)"
    ]
   },
   {
@@ -234,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 51,
    "metadata": {
     "collapsed": true
    },
@@ -252,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 52,
    "metadata": {
     "collapsed": true
    },
@@ -270,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 53,
    "metadata": {},
    "outputs": [
     {
@@ -280,7 +214,7 @@
        "  svd_solver='auto', tol=0.0, whiten=False)"
       ]
      },
-     "execution_count": 51,
+     "execution_count": 53,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -298,7 +232,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 54,
    "metadata": {
     "collapsed": true
    },
@@ -324,7 +258,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 55,
    "metadata": {
     "collapsed": true
    },
@@ -342,7 +276,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 56,
    "metadata": {
     "collapsed": true
    },
@@ -370,7 +304,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [
     {
@@ -382,7 +316,7 @@
        "          verbose=0, warm_start=False)"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 57,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -407,7 +341,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [
     {
@@ -416,7 +350,7 @@
        "array([ 1.])"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -429,7 +363,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 59,
    "metadata": {},
    "outputs": [
     {
@@ -438,7 +372,7 @@
        "array([ 1.,  9.,  2.,  2.,  7.,  1.,  8.,  3.,  3.,  7.])"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 59,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -471,21 +405,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0.9195\n"
+      "0.92\n"
      ]
     }
    ],
    "source": [
     "score = logisticRegr.score(test_img, test_lbl)\n",
     "print(score)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {