|
17 | 17 | { |
18 | 18 | "cell_type": "code", |
19 | 19 | "execution_count": null, |
20 | | - "metadata": {}, |
| 20 | + "metadata": { |
| 21 | + "collapsed": true |
| 22 | + }, |
21 | 23 | "outputs": [], |
22 | 24 | "source": [ |
23 | 25 | "\"\"\"This notebook contains the code for best subset selection, \n", |
|
50 | 52 | { |
51 | 53 | "cell_type": "code", |
52 | 54 | "execution_count": null, |
53 | | - "metadata": {}, |
| 55 | + "metadata": { |
| 56 | + "collapsed": true |
| 57 | + }, |
54 | 58 | "outputs": [], |
55 | 59 | "source": [ |
56 | 60 | "Hitters = pd.read_csv('data/Hitters.csv', header=0, na_values='NA')\n", |
|
65 | 69 | { |
66 | 70 | "cell_type": "code", |
67 | 71 | "execution_count": null, |
68 | | - "metadata": {}, |
| 72 | + "metadata": { |
| 73 | + "collapsed": true |
| 74 | + }, |
69 | 75 | "outputs": [], |
70 | 76 | "source": [ |
71 | 77 | "print np.sum(pd.isnull(Hitters['Salary'])) # number of NAs in Salary column'\n", |
|
80 | 86 | { |
81 | 87 | "cell_type": "code", |
82 | 88 | "execution_count": null, |
83 | | - "metadata": {}, |
| 89 | + "metadata": { |
| 90 | + "collapsed": true |
| 91 | + }, |
84 | 92 | "outputs": [], |
85 | 93 | "source": [ |
86 | 94 | "y = Hitters.Salary # the response variable \n", |
|
130 | 138 | { |
131 | 139 | "cell_type": "code", |
132 | 140 | "execution_count": null, |
133 | | - "metadata": {}, |
| 141 | + "metadata": { |
| 142 | + "collapsed": true |
| 143 | + }, |
134 | 144 | "outputs": [], |
135 | 145 | "source": [ |
136 | 146 | "models = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n", |
|
144 | 154 | { |
145 | 155 | "cell_type": "code", |
146 | 156 | "execution_count": null, |
147 | | - "metadata": {}, |
| 157 | + "metadata": { |
| 158 | + "collapsed": true |
| 159 | + }, |
148 | 160 | "outputs": [], |
149 | 161 | "source": [ |
150 | 162 | "\"\"\" this show an example to plot the RSS of best models with different number of parameters\"\"\"\n", |
|
169 | 181 | { |
170 | 182 | "cell_type": "code", |
171 | 183 | "execution_count": null, |
172 | | - "metadata": {}, |
| 184 | + "metadata": { |
| 185 | + "collapsed": true |
| 186 | + }, |
173 | 187 | "outputs": [], |
174 | 188 | "source": [ |
175 | 189 | "\"\"\"\n", |
|
223 | 237 | { |
224 | 238 | "cell_type": "code", |
225 | 239 | "execution_count": null, |
226 | | - "metadata": {}, |
| 240 | + "metadata": { |
| 241 | + "collapsed": true |
| 242 | + }, |
227 | 243 | "outputs": [], |
228 | 244 | "source": [ |
229 | 245 | "models2 = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n", |
|
236 | 252 | { |
237 | 253 | "cell_type": "code", |
238 | 254 | "execution_count": null, |
239 | | - "metadata": {}, |
| 255 | + "metadata": { |
| 256 | + "collapsed": true |
| 257 | + }, |
240 | 258 | "outputs": [], |
241 | 259 | "source": [ |
242 | 260 | "\"\"\"we can compare the results of best subset selection and the forward selection\"\"\"\n", |
|
275 | 293 | { |
276 | 294 | "cell_type": "code", |
277 | 295 | "execution_count": null, |
278 | | - "metadata": {}, |
| 296 | + "metadata": { |
| 297 | + "collapsed": true |
| 298 | + }, |
279 | 299 | "outputs": [], |
280 | 300 | "source": [ |
281 | 301 | "\"\"\"\n", |
|
382 | 402 | { |
383 | 403 | "cell_type": "code", |
384 | 404 | "execution_count": null, |
385 | | - "metadata": {}, |
| 405 | + "metadata": { |
| 406 | + "collapsed": true |
| 407 | + }, |
386 | 408 | "outputs": [], |
387 | 409 | "source": [ |
388 | 410 | "models_validation = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n", |
|
403 | 425 | { |
404 | 426 | "cell_type": "code", |
405 | 427 | "execution_count": null, |
406 | | - "metadata": {}, |
| 428 | + "metadata": { |
| 429 | + "collapsed": true |
| 430 | + }, |
407 | 431 | "outputs": [], |
408 | 432 | "source": [ |
409 | 433 | "print('Best max_feature variable from best subset selection on tranining')\n", |
|
422 | 446 | { |
423 | 447 | "cell_type": "code", |
424 | 448 | "execution_count": null, |
425 | | - "metadata": {}, |
| 449 | + "metadata": { |
| 450 | + "collapsed": true |
| 451 | + }, |
426 | 452 | "outputs": [], |
427 | 453 | "source": [ |
428 | 454 | "\"\"\" this show an example to plot the RSS of best models with different number of parameters for best subset with validation\"\"\"\n", |
|
436 | 462 | { |
437 | 463 | "cell_type": "code", |
438 | 464 | "execution_count": null, |
439 | | - "metadata": {}, |
| 465 | + "metadata": { |
| 466 | + "collapsed": true |
| 467 | + }, |
440 | 468 | "outputs": [], |
441 | 469 | "source": [ |
442 | 470 | "\"\"\" this show an example to plot the RSS of best models with different number of parameters for forward selection with validation\"\"\"\n", |
|
481 | 509 | { |
482 | 510 | "cell_type": "code", |
483 | 511 | "execution_count": null, |
484 | | - "metadata": {}, |
| 512 | + "metadata": { |
| 513 | + "collapsed": true |
| 514 | + }, |
485 | 515 | "outputs": [], |
486 | 516 | "source": [ |
487 | 517 | "models_cv = pd.DataFrame(columns=[\"RSS\", \"Model\"])\n", |
|
499 | 529 | { |
500 | 530 | "cell_type": "code", |
501 | 531 | "execution_count": null, |
502 | | - "metadata": {}, |
| 532 | + "metadata": { |
| 533 | + "collapsed": true |
| 534 | + }, |
503 | 535 | "outputs": [], |
504 | 536 | "source": [ |
505 | 537 | "cv_errors_mean = cv_errors.mean(axis = 1)\n", |
|
513 | 545 | { |
514 | 546 | "cell_type": "code", |
515 | 547 | "execution_count": null, |
516 | | - "metadata": {}, |
| 548 | + "metadata": { |
| 549 | + "collapsed": true |
| 550 | + }, |
517 | 551 | "outputs": [], |
518 | 552 | "source": [ |
519 | 553 | "\"\"\"From the above plot, we can see that the model with 5 variables yielded the smallest RSS.\n", |
|
539 | 573 | { |
540 | 574 | "cell_type": "code", |
541 | 575 | "execution_count": null, |
542 | | - "metadata": {}, |
| 576 | + "metadata": { |
| 577 | + "collapsed": true |
| 578 | + }, |
543 | 579 | "outputs": [], |
544 | 580 | "source": [ |
545 | 581 | "from sklearn.preprocessing import scale \n", |
|
589 | 625 | { |
590 | 626 | "cell_type": "code", |
591 | 627 | "execution_count": null, |
592 | | - "metadata": {}, |
| 628 | + "metadata": { |
| 629 | + "collapsed": true |
| 630 | + }, |
593 | 631 | "outputs": [], |
594 | 632 | "source": [ |
595 | 633 | "alphas = 10**np.linspace(10,-2,100)\n", |
|
625 | 663 | { |
626 | 664 | "cell_type": "code", |
627 | 665 | "execution_count": null, |
628 | | - "metadata": {}, |
| 666 | + "metadata": { |
| 667 | + "collapsed": true |
| 668 | + }, |
629 | 669 | "outputs": [], |
630 | 670 | "source": [ |
631 | 671 | "print len(coeffs)\n", |
|
644 | 684 | { |
645 | 685 | "cell_type": "code", |
646 | 686 | "execution_count": null, |
647 | | - "metadata": {}, |
| 687 | + "metadata": { |
| 688 | + "collapsed": true |
| 689 | + }, |
648 | 690 | "outputs": [], |
649 | 691 | "source": [ |
650 | 692 | "ax = plt.gca()\n", |
|
659 | 701 | { |
660 | 702 | "cell_type": "code", |
661 | 703 | "execution_count": null, |
662 | | - "metadata": {}, |
| 704 | + "metadata": { |
| 705 | + "collapsed": true |
| 706 | + }, |
663 | 707 | "outputs": [], |
664 | 708 | "source": [ |
665 | 709 | "print alphas[49]\n", |
|
691 | 735 | { |
692 | 736 | "cell_type": "code", |
693 | 737 | "execution_count": null, |
694 | | - "metadata": {}, |
| 738 | + "metadata": { |
| 739 | + "collapsed": true |
| 740 | + }, |
695 | 741 | "outputs": [], |
696 | 742 | "source": [ |
697 | 743 | "ridge = Ridge(fit_intercept=True, normalize=True, alpha=4)\n", |
|
711 | 757 | { |
712 | 758 | "cell_type": "code", |
713 | 759 | "execution_count": null, |
714 | | - "metadata": {}, |
| 760 | + "metadata": { |
| 761 | + "collapsed": true |
| 762 | + }, |
715 | 763 | "outputs": [], |
716 | 764 | "source": [ |
717 | 765 | "ridgecv = RidgeCV(alphas, scoring='mean_squared_error', normalize = True)\n", |
|
722 | 770 | { |
723 | 771 | "cell_type": "code", |
724 | 772 | "execution_count": null, |
725 | | - "metadata": {}, |
| 773 | + "metadata": { |
| 774 | + "collapsed": true |
| 775 | + }, |
726 | 776 | "outputs": [], |
727 | 777 | "source": [ |
728 | 778 | "ridge_best = Ridge(alpha=ridgecv.alpha_, normalize=True)\n", |
|
740 | 790 | { |
741 | 791 | "cell_type": "code", |
742 | 792 | "execution_count": null, |
743 | | - "metadata": {}, |
| 793 | + "metadata": { |
| 794 | + "collapsed": true |
| 795 | + }, |
744 | 796 | "outputs": [], |
745 | 797 | "source": [ |
746 | 798 | "pd.Series(ridge_best.coef_, index=X.columns)" |
|
763 | 815 | { |
764 | 816 | "cell_type": "code", |
765 | 817 | "execution_count": null, |
766 | | - "metadata": {}, |
| 818 | + "metadata": { |
| 819 | + "collapsed": true |
| 820 | + }, |
767 | 821 | "outputs": [], |
768 | 822 | "source": [ |
769 | 823 | "lasso= Lasso(normalize=True, max_iter=1e5) \n", |
|
786 | 840 | { |
787 | 841 | "cell_type": "code", |
788 | 842 | "execution_count": null, |
789 | | - "metadata": {}, |
| 843 | + "metadata": { |
| 844 | + "collapsed": true |
| 845 | + }, |
790 | 846 | "outputs": [], |
791 | 847 | "source": [ |
792 | 848 | "lassocv = LassoCV(alphas=None, cv=10, max_iter=1e5, normalize=True)\n", |
|
800 | 856 | { |
801 | 857 | "cell_type": "code", |
802 | 858 | "execution_count": null, |
803 | | - "metadata": {}, |
| 859 | + "metadata": { |
| 860 | + "collapsed": true |
| 861 | + }, |
804 | 862 | "outputs": [], |
805 | 863 | "source": [ |
806 | 864 | "# Some of the coefficients should reduce to exact zero\n", |
|
917 | 975 | { |
918 | 976 | "cell_type": "code", |
919 | 977 | "execution_count": null, |
920 | | - "metadata": {}, |
| 978 | + "metadata": { |
| 979 | + "collapsed": true |
| 980 | + }, |
921 | 981 | "outputs": [], |
922 | 982 | "source": [ |
923 | 983 | "mse_train = []\n", |
|
957 | 1017 | { |
958 | 1018 | "cell_type": "code", |
959 | 1019 | "execution_count": null, |
960 | | - "metadata": {}, |
| 1020 | + "metadata": { |
| 1021 | + "collapsed": true |
| 1022 | + }, |
961 | 1023 | "outputs": [], |
962 | 1024 | "source": [ |
963 | 1025 | "X_train, X_test , y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.66)\n", |
|
984 | 1046 | { |
985 | 1047 | "cell_type": "code", |
986 | 1048 | "execution_count": null, |
987 | | - "metadata": {}, |
| 1049 | + "metadata": { |
| 1050 | + "collapsed": true |
| 1051 | + }, |
988 | 1052 | "outputs": [], |
989 | 1053 | "source": [ |
990 | 1054 | "np.cumsum(pca.explained_variance_ratio_) * 100" |
|
1007 | 1071 | { |
1008 | 1072 | "cell_type": "code", |
1009 | 1073 | "execution_count": null, |
1010 | | - "metadata": {}, |
| 1074 | + "metadata": { |
| 1075 | + "collapsed": true |
| 1076 | + }, |
1011 | 1077 | "outputs": [], |
1012 | 1078 | "source": [ |
1013 | 1079 | "X_train, X_test , y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.66)\n", |
|
0 commit comments