Skip to content

Commit dac7696

Browse files
author
xqiuping
committed
complete Chapter 7 Sec 7.8
1 parent cdf5e1e commit dac7696

File tree

3 files changed

+335
-33
lines changed

3 files changed

+335
-33
lines changed

Chapter_6_sec_6.5_6.7.ipynb

Lines changed: 99 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
{
1818
"cell_type": "code",
1919
"execution_count": null,
20-
"metadata": {},
20+
"metadata": {
21+
"collapsed": true
22+
},
2123
"outputs": [],
2224
"source": [
2325
"\"\"\"This notebook contains the code for best subset selection, \n",
@@ -50,7 +52,9 @@
5052
{
5153
"cell_type": "code",
5254
"execution_count": null,
53-
"metadata": {},
55+
"metadata": {
56+
"collapsed": true
57+
},
5458
"outputs": [],
5559
"source": [
5660
"Hitters = pd.read_csv('data/Hitters.csv', header=0, na_values='NA')\n",
@@ -65,7 +69,9 @@
6569
{
6670
"cell_type": "code",
6771
"execution_count": null,
68-
"metadata": {},
72+
"metadata": {
73+
"collapsed": true
74+
},
6975
"outputs": [],
7076
"source": [
7177
"print np.sum(pd.isnull(Hitters['Salary'])) # number of NAs in Salary column'\n",
@@ -80,7 +86,9 @@
8086
{
8187
"cell_type": "code",
8288
"execution_count": null,
83-
"metadata": {},
89+
"metadata": {
90+
"collapsed": true
91+
},
8492
"outputs": [],
8593
"source": [
8694
"y = Hitters.Salary # the response variable \n",
@@ -130,7 +138,9 @@
130138
{
131139
"cell_type": "code",
132140
"execution_count": null,
133-
"metadata": {},
141+
"metadata": {
142+
"collapsed": true
143+
},
134144
"outputs": [],
135145
"source": [
136146
"models = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -144,7 +154,9 @@
144154
{
145155
"cell_type": "code",
146156
"execution_count": null,
147-
"metadata": {},
157+
"metadata": {
158+
"collapsed": true
159+
},
148160
"outputs": [],
149161
"source": [
150162
"\"\"\" this show an example to plot the RSS of best models with different number of parameters\"\"\"\n",
@@ -169,7 +181,9 @@
169181
{
170182
"cell_type": "code",
171183
"execution_count": null,
172-
"metadata": {},
184+
"metadata": {
185+
"collapsed": true
186+
},
173187
"outputs": [],
174188
"source": [
175189
"\"\"\"\n",
@@ -223,7 +237,9 @@
223237
{
224238
"cell_type": "code",
225239
"execution_count": null,
226-
"metadata": {},
240+
"metadata": {
241+
"collapsed": true
242+
},
227243
"outputs": [],
228244
"source": [
229245
"models2 = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -236,7 +252,9 @@
236252
{
237253
"cell_type": "code",
238254
"execution_count": null,
239-
"metadata": {},
255+
"metadata": {
256+
"collapsed": true
257+
},
240258
"outputs": [],
241259
"source": [
242260
"\"\"\"we can compare the results of best subset selection and the forward selection\"\"\"\n",
@@ -275,7 +293,9 @@
275293
{
276294
"cell_type": "code",
277295
"execution_count": null,
278-
"metadata": {},
296+
"metadata": {
297+
"collapsed": true
298+
},
279299
"outputs": [],
280300
"source": [
281301
"\"\"\"\n",
@@ -382,7 +402,9 @@
382402
{
383403
"cell_type": "code",
384404
"execution_count": null,
385-
"metadata": {},
405+
"metadata": {
406+
"collapsed": true
407+
},
386408
"outputs": [],
387409
"source": [
388410
"models_validation = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -403,7 +425,9 @@
403425
{
404426
"cell_type": "code",
405427
"execution_count": null,
406-
"metadata": {},
428+
"metadata": {
429+
"collapsed": true
430+
},
407431
"outputs": [],
408432
"source": [
409433
"print('Best max_feature variable from best subset selection on tranining')\n",
@@ -422,7 +446,9 @@
422446
{
423447
"cell_type": "code",
424448
"execution_count": null,
425-
"metadata": {},
449+
"metadata": {
450+
"collapsed": true
451+
},
426452
"outputs": [],
427453
"source": [
428454
"\"\"\" this show an example to plot the RSS of best models with different number of parameters for best subset with validation\"\"\"\n",
@@ -436,7 +462,9 @@
436462
{
437463
"cell_type": "code",
438464
"execution_count": null,
439-
"metadata": {},
465+
"metadata": {
466+
"collapsed": true
467+
},
440468
"outputs": [],
441469
"source": [
442470
"\"\"\" this show an example to plot the RSS of best models with different number of parameters for forward selection with validation\"\"\"\n",
@@ -481,7 +509,9 @@
481509
{
482510
"cell_type": "code",
483511
"execution_count": null,
484-
"metadata": {},
512+
"metadata": {
513+
"collapsed": true
514+
},
485515
"outputs": [],
486516
"source": [
487517
"models_cv = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n",
@@ -499,7 +529,9 @@
499529
{
500530
"cell_type": "code",
501531
"execution_count": null,
502-
"metadata": {},
532+
"metadata": {
533+
"collapsed": true
534+
},
503535
"outputs": [],
504536
"source": [
505537
"cv_errors_mean = cv_errors.mean(axis = 1)\n",
@@ -513,7 +545,9 @@
513545
{
514546
"cell_type": "code",
515547
"execution_count": null,
516-
"metadata": {},
548+
"metadata": {
549+
"collapsed": true
550+
},
517551
"outputs": [],
518552
"source": [
519553
"\"\"\"From the above plot, we can see that the model with 5 variables yielded the smallest RSS.\n",
@@ -539,7 +573,9 @@
539573
{
540574
"cell_type": "code",
541575
"execution_count": null,
542-
"metadata": {},
576+
"metadata": {
577+
"collapsed": true
578+
},
543579
"outputs": [],
544580
"source": [
545581
"from sklearn.preprocessing import scale \n",
@@ -589,7 +625,9 @@
589625
{
590626
"cell_type": "code",
591627
"execution_count": null,
592-
"metadata": {},
628+
"metadata": {
629+
"collapsed": true
630+
},
593631
"outputs": [],
594632
"source": [
595633
"alphas = 10**np.linspace(10,-2,100)\n",
@@ -625,7 +663,9 @@
625663
{
626664
"cell_type": "code",
627665
"execution_count": null,
628-
"metadata": {},
666+
"metadata": {
667+
"collapsed": true
668+
},
629669
"outputs": [],
630670
"source": [
631671
"print len(coeffs)\n",
@@ -644,7 +684,9 @@
644684
{
645685
"cell_type": "code",
646686
"execution_count": null,
647-
"metadata": {},
687+
"metadata": {
688+
"collapsed": true
689+
},
648690
"outputs": [],
649691
"source": [
650692
"ax = plt.gca()\n",
@@ -659,7 +701,9 @@
659701
{
660702
"cell_type": "code",
661703
"execution_count": null,
662-
"metadata": {},
704+
"metadata": {
705+
"collapsed": true
706+
},
663707
"outputs": [],
664708
"source": [
665709
"print alphas[49]\n",
@@ -691,7 +735,9 @@
691735
{
692736
"cell_type": "code",
693737
"execution_count": null,
694-
"metadata": {},
738+
"metadata": {
739+
"collapsed": true
740+
},
695741
"outputs": [],
696742
"source": [
697743
"ridge = Ridge(fit_intercept=True, normalize=True, alpha=4)\n",
@@ -711,7 +757,9 @@
711757
{
712758
"cell_type": "code",
713759
"execution_count": null,
714-
"metadata": {},
760+
"metadata": {
761+
"collapsed": true
762+
},
715763
"outputs": [],
716764
"source": [
717765
"ridgecv = RidgeCV(alphas, scoring='mean_squared_error', normalize = True)\n",
@@ -722,7 +770,9 @@
722770
{
723771
"cell_type": "code",
724772
"execution_count": null,
725-
"metadata": {},
773+
"metadata": {
774+
"collapsed": true
775+
},
726776
"outputs": [],
727777
"source": [
728778
"ridge_best = Ridge(alpha=ridgecv.alpha_, normalize=True)\n",
@@ -740,7 +790,9 @@
740790
{
741791
"cell_type": "code",
742792
"execution_count": null,
743-
"metadata": {},
793+
"metadata": {
794+
"collapsed": true
795+
},
744796
"outputs": [],
745797
"source": [
746798
"pd.Series(ridge_best.coef_, index=X.columns)"
@@ -763,7 +815,9 @@
763815
{
764816
"cell_type": "code",
765817
"execution_count": null,
766-
"metadata": {},
818+
"metadata": {
819+
"collapsed": true
820+
},
767821
"outputs": [],
768822
"source": [
769823
"lasso= Lasso(normalize=True, max_iter=1e5) \n",
@@ -786,7 +840,9 @@
786840
{
787841
"cell_type": "code",
788842
"execution_count": null,
789-
"metadata": {},
843+
"metadata": {
844+
"collapsed": true
845+
},
790846
"outputs": [],
791847
"source": [
792848
"lassocv = LassoCV(alphas=None, cv=10, max_iter=1e5, normalize=True)\n",
@@ -800,7 +856,9 @@
800856
{
801857
"cell_type": "code",
802858
"execution_count": null,
803-
"metadata": {},
859+
"metadata": {
860+
"collapsed": true
861+
},
804862
"outputs": [],
805863
"source": [
806864
"# Some of the coefficients should reduce to exact zero\n",
@@ -917,7 +975,9 @@
917975
{
918976
"cell_type": "code",
919977
"execution_count": null,
920-
"metadata": {},
978+
"metadata": {
979+
"collapsed": true
980+
},
921981
"outputs": [],
922982
"source": [
923983
"mse_train = []\n",
@@ -957,7 +1017,9 @@
9571017
{
9581018
"cell_type": "code",
9591019
"execution_count": null,
960-
"metadata": {},
1020+
"metadata": {
1021+
"collapsed": true
1022+
},
9611023
"outputs": [],
9621024
"source": [
9631025
"X_train, X_test , y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.66)\n",
@@ -984,7 +1046,9 @@
9841046
{
9851047
"cell_type": "code",
9861048
"execution_count": null,
987-
"metadata": {},
1049+
"metadata": {
1050+
"collapsed": true
1051+
},
9881052
"outputs": [],
9891053
"source": [
9901054
"np.cumsum(pca.explained_variance_ratio_) * 100"
@@ -1007,7 +1071,9 @@
10071071
{
10081072
"cell_type": "code",
10091073
"execution_count": null,
1010-
"metadata": {},
1074+
"metadata": {
1075+
"collapsed": true
1076+
},
10111077
"outputs": [],
10121078
"source": [
10131079
"X_train, X_test , y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.66)\n",

0 commit comments

Comments
 (0)