|
93 | 93 |
|
94 | 94 | # set up dataset |
95 | 95 | n_samples = 100 |
96 | | -n_features = 1000 |
| 96 | +n_features = 300 |
97 | 97 |
|
98 | 98 | #L1 data (only 5 informative features) |
99 | 99 | X_1, y_1 = datasets.make_classification(n_samples=n_samples, n_features=n_features, |
100 | 100 | n_informative=5, random_state=1) |
101 | 101 |
|
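Because only 5 of the 300 features above are informative, an L1-penalized LinearSVC is expected to drive most coefficients to zero on this dataset. A minimal sketch of that check, not part of this diff, using the newer lowercase parameter spellings (the commit itself uses 'L1'/'L2'); clf_l1 and the C value are illustrative choices:

    clf_l1 = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-3, C=0.1)
    clf_l1.fit(X_1, y_1)
    # most of the 300 weights should come out exactly zero
    print("non-zero coefficients:", np.count_nonzero(clf_l1.coef_))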
102 | | -#L2 data |
103 | | -X_2 = 1 + rnd.randn(n_samples, n_features) |
104 | | -coef = np.ones(n_features) |
105 | | - |
106 | | -y_2 = np.dot(X_2, coef) |
107 | | -y_2 += .1 * rnd.randn(n_samples) * np.std(y_2) |
108 | | -y_2 = np.sign(y_2 - np.mean(y_2)) |
| 102 | +#L2 data: non-sparse, but fewer features
| 103 | +y_2 = np.sign(.5 - rnd.rand(n_samples)) |
| 104 | +X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
| 105 | +X_2 += 5 * rnd.randn(n_samples, n_features // 5)
109 | 106 |
|
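The replacement L2 dataset is dense rather than sparse: y_2 is a roughly balanced ±1 label, and each of the n_features // 5 = 60 columns carries the same weak class signal, swamped by noise five times stronger. A quick sanity check on the arrays defined above (the expected values follow from the construction, they are not output recorded in this commit):

    print(X_2.shape)          # (100, 60)
    print(np.unique(y_2))     # [-1.  1.]
    print((y_2 == 1).mean())  # close to 0.5: classes are approximately balanced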
110 | 107 | clf_sets = [(LinearSVC(penalty='L1', loss='L2', dual=False, |
111 | 108 | tol=1e-3), |
112 | | - np.logspace(-2.5, -1, 10), X_1, y_1), |
113 | | - (LinearSVC(penalty='L2', loss='L1', dual=True, |
114 | | - tol=1e-5, intercept_scaling=20), |
115 | | - np.logspace(-3.5, -2.5, 10), X_2, y_2)] |
| 109 | + np.logspace(-2.2, -1.2, 10), X_1, y_1), |
| 110 | + (LinearSVC(penalty='L2', loss='L2', dual=True, |
| 111 | + tol=1e-4), |
| 112 | + np.logspace(-4.5, -2, 10), X_2, y_2)] |
116 | 113 |
|
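Each clf_sets entry bundles an estimator, the range of C values to scan, and the dataset to evaluate it on; the loop that consumes these tuples sits in unchanged lines collapsed out of this diff. A hedged sketch of how such a tuple could be unpacked (loop body and variable names here are illustrative, not the file's actual code):

    for clf, cs, X, y in clf_sets:
        clf.set_params(C=cs[0])   # pick any C from the scanned range
        clf.fit(X, y)
        print(clf.penalty, clf.loss, clf.score(X, y))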
117 | 114 | colors = ['b', 'g', 'r', 'c'] |
118 | 115 |
|
|
123 | 120 | pl.xlabel('C') |
124 | 121 | pl.ylabel('CV Score') |
125 | 122 |
|
126 | | - for k, train_size in enumerate(np.arange(0.4, 0.7, 0.1)[::-1]): |
| 123 | + for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]): |
127 | 124 | param_grid = dict(C=cs) |
| 125 | +        # To get a smooth curve, we need a large number of iterations to
| 126 | +        # reduce the variance
128 | 127 | grid = GridSearchCV(clf, refit=False, param_grid=param_grid, |
129 | 128 | cv=ShuffleSplit(n=n_samples, train_size=train_size, |
130 | | - n_iterations=45, random_state=1)) |
| 129 | + n_iterations=250, random_state=1)) |
131 | 130 | grid.fit(X, y) |
132 | 131 | scores = [x[1] for x in grid.grid_scores_] |
133 | 132 |
|
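ShuffleSplit(n=..., n_iterations=...) and grid.grid_scores_ are the scikit-learn API of the time; raising n_iterations to 250 simply averages the validation score over more random splits. In later releases the same search would look roughly like this (an assumption about the newer model_selection API, not code from this commit):

    from sklearn.model_selection import GridSearchCV, ShuffleSplit
    cv = ShuffleSplit(n_splits=250, train_size=train_size, random_state=1)
    grid = GridSearchCV(clf, param_grid=dict(C=cs), cv=cv, refit=False)
    grid.fit(X, y)
    scores = grid.cv_results_['mean_test_score']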
134 | 133 | scales = [(1, 'No scaling'), |
135 | | - ((np.sqrt(n_samples * train_size)), '1/sqrt(n_samples)'), |
136 | 134 | ((n_samples * train_size), '1/n_samples'), |
137 | 135 | ] |
138 | 136 |
|
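The '1/n_samples' entry is the scaling this example studies: the LinearSVC objective is C times the sum of per-sample losses plus the penalty term, and the loss sum grows with the number of training points, so multiplying the plotted C by n_samples * train_size keeps the per-sample regularization trade-off comparable across the three train_size fractions. A small worked example with the values used above (illustrative only):

    n_train = n_samples * train_size   # e.g. 100 * 0.5 = 50
    print(1e-3 * n_train)              # 0.05: where a raw C of 1e-3 lands on the scaled axis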
139 | 137 | for subplotnum, (scaler, name) in enumerate(scales): |
140 | | - pl.subplot(3, 1, subplotnum + 1) |
| 138 | + pl.subplot(2, 1, subplotnum + 1) |
141 | 139 | grid_cs = cs * float(scaler) # scale the C's |
142 | 140 | pl.semilogx(grid_cs, scores, label="fraction %.2f" % |
143 | | - train_size) |
| 141 | + train_size) |
144 | 142 | pl.title('scaling=%s, penalty=%s, loss=%s' % (name, clf.penalty, clf.loss)) |
145 | | - ymin, ymax = pl.ylim() |
146 | | - pl.axvline(grid_cs[np.argmax(scores)], 0, 1, |
147 | | - color=colors[k]) |
148 | | - pl.ylim(ymin=ymin-0.0025, ymax=ymax+0.008) # adjust the y-axis |
149 | 143 |
|
150 | | - pl.legend(loc="lower right") |
| 144 | + #ymin, ymax = pl.ylim() |
| 145 | + #pl.axvline(grid_cs[np.argmax(scores)], 0, 1, |
| 146 | + # color=colors[k]) |
| 147 | + #pl.ylim(ymin=ymin-0.0025, ymax=ymax+0.008) # adjust the y-axis |
| 148 | + |
| 149 | + pl.legend(loc="best") |
151 | 150 | pl.show() |
152 | 151 |
|