
Commit d8f2053

Linear regression class definition script
1 parent 45961e1 commit d8f2053

1 file changed: 282 additions, 0 deletions
@@ -0,0 +1,282 @@
import numpy as np
import matplotlib.pyplot as plt


class Metrics:

    def sse(self):
        '''Returns the sum of squared errors (model vs. actual)'''
        squared_errors = (self.resid_) ** 2
        self.sq_error_ = np.sum(squared_errors)
        return self.sq_error_

    def sst(self):
        '''Returns the total sum of squared errors (actual vs. avg(actual))'''
        avg_y = np.mean(self.target_)
        squared_errors = (self.target_ - avg_y) ** 2
        self.sst_ = np.sum(squared_errors)
        return self.sst_

    def r_squared(self):
        '''Returns the calculated value of R^2'''
        self.r_sq_ = 1 - self.sse() / self.sst()
        return self.r_sq_

    def adj_r_squared(self):
        '''Returns the calculated value of adjusted R^2'''
        self.adj_r_sq_ = 1 - (self.sse() / self.dfe_) / (self.sst() / self.dft_)
        return self.adj_r_sq_

    def mse(self):
        '''Returns the calculated value of MSE'''
        self.mse_ = np.mean((self.predict(self.features_) - self.target_) ** 2)
        return self.mse_

    def pretty_print_stats(self):
        '''Prints a report of summary statistics for a given model object'''
        items = (('sse:', self.sse()), ('sst:', self.sst()),
                 ('mse:', self.mse()), ('r^2:', self.r_squared()),
                 ('adj_r^2:', self.adj_r_squared()))
        for item in items:
            print('{0:8} {1:.4f}'.format(item[0], item[1]))


class Diagnostics_plots:

    def __init__(self):
        pass

    def fitted_vs_residual(self):
        '''Plots fitted values vs. residuals'''
        plt.title("Fitted vs. residuals plot", fontsize=14)
        plt.scatter(self.fitted_, self.resid_, edgecolor='k')
        plt.hlines(y=0, xmin=np.amin(self.fitted_), xmax=np.amax(self.fitted_),
                   color='k', linestyle='dashed')
        plt.xlabel("Fitted values")
        plt.ylabel("Residuals")
        plt.show()

    def fitted_vs_features(self):
        '''Plots residuals vs. all feature variables in a grid'''
        num_plots = self.features_.shape[1]
        if num_plots % 3 == 0:
            nrows = int(num_plots / 3)
        else:
            nrows = int(num_plots / 3) + 1
        ncols = 3
        fig, ax = plt.subplots(nrows, ncols, figsize=(15, nrows * 3.5))
        axes = ax.ravel()
        # hide any unused panels in the grid
        for i in range(num_plots, nrows * ncols):
            axes[i].set_visible(False)
        for i in range(num_plots):
            axes[i].scatter(self.features_.T[i], self.resid_,
                            color='orange', edgecolor='k', alpha=0.8)
            axes[i].grid(True)
            axes[i].set_xlabel("Feature X[{}]".format(i))
            axes[i].set_ylabel("Residuals")
            axes[i].hlines(y=0, xmin=np.amin(self.features_.T[i]),
                           xmax=np.amax(self.features_.T[i]),
                           color='k', linestyle='dashed')
        plt.show()

    def histogram_resid(self, normalized=True):
        '''Plots a histogram of the residuals (optionally normalized)'''
        if normalized:
            norm_r = self.resid_ / np.linalg.norm(self.resid_)
        else:
            norm_r = self.resid_
        num_bins = min(20, int(np.sqrt(self.features_.shape[0])))
        plt.title("Histogram of the normalized residuals")
        plt.hist(norm_r, bins=num_bins, edgecolor='k')
        plt.xlabel("Normalized residuals")
        plt.ylabel("Count")
        plt.show()

    def shapiro_test(self, normalized=True):
        '''Performs the Shapiro-Wilk normality test on the residuals'''
        from scipy.stats import shapiro
        if normalized:
            norm_r = self.resid_ / np.linalg.norm(self.resid_)
        else:
            norm_r = self.resid_
        _, p = shapiro(norm_r)
        if p > 0.01:
            print("The residuals seem to have come from a Gaussian process.")
        else:
            print("The residuals do not seem to have come from a Gaussian process."
                  "\nNormality assumptions of the linear regression may have been violated.")

    def qqplot_resid(self, normalized=True):
        '''Creates a quantile-quantile plot of the residuals against a normal distribution'''
        from scipy.stats import probplot
        if normalized:
            norm_r = self.resid_ / np.linalg.norm(self.resid_)
        else:
            norm_r = self.resid_
        plt.title("Q-Q plot of the normalized residuals")
        probplot(norm_r, dist='norm', plot=plt)
        plt.xlabel("Theoretical quantiles")
        plt.ylabel("Residual quantiles")
        plt.show()


class Data_plots:

    def __init__(self):
        pass

    def pairplot(self):
        '''Creates a pairplot of all variables and the target using the Seaborn library'''
        print("This may take a little time. Have patience...")
        from seaborn import pairplot
        from pandas import DataFrame
        df = DataFrame(np.hstack((self.features_, self.target_.reshape(-1, 1))))
        pairplot(df)
        plt.show()

    def plot_fitted(self, reference_line=False):
        """
        Plots fitted values against the true output values from the data

        Arguments:
        reference_line: A Boolean switch to draw a 45-degree reference line on the plot
        """
        plt.title("True vs. fitted values", fontsize=14)
        plt.scatter(self.target_, self.fitted_, s=100, alpha=0.75,
                    color='red', edgecolor='k')
        if reference_line:
            plt.plot(self.target_, self.target_, c='k', linestyle='dotted')
        plt.xlabel("True values")
        plt.ylabel("Fitted values")
        plt.grid(True)
        plt.show()


class Outliers:

    def __init__(self):
        pass

    def cook_distance(self):
        '''Computes and plots Cook's distance'''
        import statsmodels.api as sm
        from statsmodels.stats.outliers_influence import OLSInfluence as influence
        lm = sm.OLS(self.target_, sm.add_constant(self.features_)).fit()
        inf = influence(lm)
        (c, p) = inf.cooks_distance
        plt.figure(figsize=(8, 5))
        plt.title("Cook's distance plot for the residuals", fontsize=14)
        plt.stem(np.arange(len(c)), c, markerfmt=",")
        plt.grid(True)
        plt.show()

    def influence_plot(self):
        '''Creates the influence plot'''
        import statsmodels.api as sm
        lm = sm.OLS(self.target_, sm.add_constant(self.features_)).fit()
        fig, ax = plt.subplots(figsize=(10, 8))
        fig = sm.graphics.influence_plot(lm, ax=ax, criterion="cooks")
        plt.show()

    def leverage_resid_plot(self):
        '''Plots leverage vs. the square of the normalized residuals'''
        import statsmodels.api as sm
        lm = sm.OLS(self.target_, sm.add_constant(self.features_)).fit()
        fig, ax = plt.subplots(figsize=(10, 8))
        fig = sm.graphics.plot_leverage_resid2(lm, ax=ax)
        plt.show()


class Multicollinearity:

    def __init__(self):
        pass

    def vif(self):
        '''Computes variance inflation factors for each feature variable'''
        from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
        for i in range(self.features_.shape[1]):
            v = vif(np.asarray(self.features_), i)
            print("Variance inflation factor for feature {}: {}".format(i, round(v, 2)))


class MyLinearRegression(Metrics, Diagnostics_plots, Data_plots, Outliers, Multicollinearity):

    def __init__(self, fit_intercept=True):
        self.coef_ = None
        self.intercept_ = None
        self._fit_intercept = fit_intercept

    def __repr__(self):
        return "I am a Linear Regression model!"

    def fit(self, X, y):
        """
        Fit model coefficients.

        Arguments:
        X: 1D or 2D numpy array
        y: 1D numpy array
        """
        # check if X is a 1D or 2D array
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)

        # features and data
        self.features_ = X
        self.target_ = y

        # degrees of freedom of population dependent variable variance
        self.dft_ = X.shape[0] - 1
        # degrees of freedom of population error variance
        self.dfe_ = X.shape[0] - X.shape[1] - 1

        # add a bias column if fit_intercept is True
        if self._fit_intercept:
            X_biased = np.c_[np.ones(X.shape[0]), X]
        else:
            X_biased = X

        # closed-form solution via the normal equation: coef = (X^T X)^(-1) X^T y
        xTx = np.dot(X_biased.T, X_biased)
        inverse_xTx = np.linalg.inv(xTx)
        xTy = np.dot(X_biased.T, y)
        coef = np.dot(inverse_xTx, xTy)

        # set attributes
        if self._fit_intercept:
            self.intercept_ = coef[0]
            self.coef_ = coef[1:]
        else:
            self.intercept_ = 0
            self.coef_ = coef

        # predicted/fitted y
        self.fitted_ = np.dot(X, self.coef_) + self.intercept_

        # residuals
        residuals = self.target_ - self.fitted_
        self.resid_ = residuals

    def predict(self, X):
        """Output model prediction.

        Arguments:
        X: 1D or 2D numpy array
        """
        # check if X is a 1D or 2D array
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        self.predicted_ = self.intercept_ + np.dot(X, self.coef_)
        return self.predicted_

    def run_diagnostics(self):
        '''Runs diagnostic tests and plots'''
        Diagnostics_plots.fitted_vs_residual(self)
        Diagnostics_plots.histogram_resid(self)
        Diagnostics_plots.qqplot_resid(self)
        print()
        Diagnostics_plots.shapiro_test(self)

    def outlier_plots(self):
        '''Creates various outlier plots'''
        Outliers.cook_distance(self)
        Outliers.influence_plot(self)
        Outliers.leverage_resid_plot(self)
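
For illustration, here is a minimal usage sketch (not part of the committed file): it assumes synthetic regression data generated with NumPy, and exercises fit, predict, and the mixin methods end to end.

# Hypothetical usage sketch: synthetic data, illustrative only
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))                    # 100 samples, 3 features
true_coef = np.array([1.5, -2.0, 0.7])
y = X @ true_coef + 2.0 + rng.normal(scale=0.5, size=100)  # intercept 2.0 plus noise

model = MyLinearRegression()
model.fit(X, y)
print(model)                                     # __repr__ string
print(model.intercept_, model.coef_)             # estimates near 2.0 and true_coef
model.pretty_print_stats()                       # sse, sst, mse, r^2, adj_r^2
model.run_diagnostics()                          # residual plots + Shapiro-Wilk test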
