1+ import numpy as np
2+ import matplotlib .pyplot as plt
3+
4+ class Metrics :
5+
6+ def sse (self ):
7+ '''returns sum of squared errors (model vs actual)'''
8+ squared_errors = (self .resid_ ) ** 2
9+ self .sq_error_ = np .sum (squared_errors )
10+ return self .sq_error_
11+
12+ def sst (self ):
13+ '''returns total sum of squared errors (actual vs avg(actual))'''
14+ avg_y = np .mean (self .target_ )
15+ squared_errors = (self .target_ - avg_y ) ** 2
16+ self .sst_ = np .sum (squared_errors )
17+ return self .sst_
18+
19+ def r_squared (self ):
20+ '''returns calculated value of r^2'''
21+ self .r_sq_ = 1 - self .sse ()/ self .sst ()
22+ return self .r_sq_
23+
24+ def adj_r_squared (self ):
25+ '''returns calculated value of adjusted r^2'''
26+ self .adj_r_sq_ = 1 - (self .sse ()/ self .dfe_ ) / (self .sst ()/ self .dft_ )
27+ return self .adj_r_sq_
28+
29+ def mse (self ):
30+ '''returns calculated value of mse'''
31+ self .mse_ = np .mean ( (self .predict (self .features_ ) - self .target_ ) ** 2 )
32+ return self .mse_
33+
34+ def pretty_print_stats (self ):
35+ '''returns report of statistics for a given model object'''
36+ items = ( ('sse:' , self .sse ()), ('sst:' , self .sst ()),
37+ ('mse:' , self .mse ()), ('r^2:' , self .r_squared ()),
38+ ('adj_r^2:' , self .adj_r_squared ()))
39+ for item in items :
40+ print ('{0:8} {1:.4f}' .format (item [0 ], item [1 ]))
41+
42+
43+ class Diagnostics_plots :
44+
45+ def __init__ ():
46+ pass
47+
48+ def fitted_vs_residual (self ):
49+ '''Plots fitted values vs. residuals'''
50+ plt .title ("Fitted vs. residuals plot" ,fontsize = 14 )
51+ plt .scatter (self .fitted_ ,self .resid_ ,edgecolor = 'k' )
52+ plt .hlines (y = 0 ,xmin = np .amin (self .fitted_ ),xmax = np .amax (self .fitted_ ),color = 'k' ,linestyle = 'dashed' )
53+ plt .xlabel ("Fitted values" )
54+ plt .ylabel ("Residuals" )
55+ plt .show ()
56+
57+ def fitted_vs_features (self ):
58+ '''Plots residuals vs all feature variables in a grid'''
59+ num_plots = self .features_ .shape [1 ]
60+ if num_plots % 3 == 0 :
61+ nrows = int (num_plots / 3 )
62+ else :
63+ nrows = int (num_plots / 3 )+ 1
64+ ncols = 3
65+ fig , ax = plt .subplots (nrows , ncols , figsize = (15 ,nrows * 3.5 ))
66+ axes = ax .ravel ()
67+ for i in range (num_plots ,nrows * ncols ):
68+ axes [i ].set_visible (False )
69+ for i in range (num_plots ):
70+ axes [i ].scatter (self .features_ .T [i ],self .resid_ ,color = 'orange' ,edgecolor = 'k' ,alpha = 0.8 )
71+ axes [i ].grid (True )
72+ axes [i ].set_xlabel ("Feature X[{}]" .format (i ))
73+ axes [i ].set_ylabel ("Residuals" )
74+ axes [i ].hlines (y = 0 ,xmin = np .amin (self .features_ .T [i ]),xmax = np .amax (self .features_ .T [i ]),
75+ color = 'k' ,linestyle = 'dashed' )
76+ plt .show ()
77+
78+ def histogram_resid (self ,normalized = True ):
79+ '''Plots a histogram of the residuals (can be normalized)'''
80+ if normalized :
81+ norm_r = self .resid_ / np .linalg .norm (self .resid_ )
82+ else :
83+ norm_r = self .resid_
84+ num_bins = min (20 ,int (np .sqrt (self .features_ .shape [0 ])))
85+ plt .title ("Histogram of the normalized residuals" )
86+ plt .hist (norm_r ,bins = num_bins ,edgecolor = 'k' )
87+ plt .xlabel ("Normalized residuals" )
88+ plt .ylabel ("Count" )
89+ plt .show ()
90+
91+ def shapiro_test (self ,normalized = True ):
92+ '''Performs Shapiro-Wilk normality test on the residuals'''
93+ from scipy .stats import shapiro
94+ if normalized :
95+ norm_r = self .resid_ / np .linalg .norm (self .resid_ )
96+ else :
97+ norm_r = self .resid_
98+ _ ,p = shapiro (norm_r )
99+ if p > 0.01 :
100+ print ("The residuals seem to have come from a Gaussian process" )
101+ else :
102+ print ("The residuals does not seem to have come from a Gaussian process.\n Normality assumptions of the linear regression may have been violated." )
103+
104+ def qqplot_resid (self ,normalized = True ):
105+ '''Creates a quantile-quantile plot for residuals comparing with a normal distribution'''
106+ from scipy .stats import probplot
107+ if normalized :
108+ norm_r = self .resid_ / np .linalg .norm (self .resid_ )
109+ else :
110+ norm_r = self .resid_
111+ plt .title ("Q-Q plot of the normalized residuals" )
112+ probplot (norm_r ,dist = 'norm' ,plot = plt )
113+ plt .xlabel ("Theoretical quantiles" )
114+ plt .ylabel ("Residual quantiles" )
115+ plt .show ()
116+
117+
118+ class Data_plots :
119+
120+ def __init__ ():
121+ pass
122+
123+ def pairplot (self ):
124+ '''Creates pairplot of all variables and the target using the Seaborn library'''
125+
126+ print ("This may take a little time. Have patience..." )
127+ from seaborn import pairplot
128+ from pandas import DataFrame
129+ df = DataFrame (np .hstack ((self .features_ ,self .target_ .reshape (- 1 ,1 ))))
130+ pairplot (df )
131+ plt .show ()
132+
133+ def plot_fitted (self ,reference_line = False ):
134+ """
135+ Plots fitted values against the true output values from the data
136+
137+ Arguments:
138+ reference_line: A Boolean switch to draw a 45-degree reference line on the plot
139+ """
140+ plt .title ("True vs. fitted values" ,fontsize = 14 )
141+ plt .scatter (y ,self .fitted_ ,s = 100 ,alpha = 0.75 ,color = 'red' ,edgecolor = 'k' )
142+ if reference_line :
143+ plt .plot (y ,y ,c = 'k' ,linestyle = 'dotted' )
144+ plt .xlabel ("True values" )
145+ plt .ylabel ("Fitted values" )
146+ plt .grid (True )
147+ plt .show ()
148+
149+
150+ class Outliers :
151+
152+ def __init__ ():
153+ pass
154+
155+ def cook_distance (self ):
156+ '''Computes and plots Cook\' s distance'''
157+ import statsmodels .api as sm
158+ from statsmodels .stats .outliers_influence import OLSInfluence as influence
159+ lm = sm .OLS (self .target_ , sm .add_constant (self .features_ )).fit ()
160+ inf = influence (lm )
161+ (c , p ) = inf .cooks_distance
162+ plt .figure (figsize = (8 ,5 ))
163+ plt .title ("Cook's distance plot for the residuals" ,fontsize = 14 )
164+ plt .stem (np .arange (len (c )), c , markerfmt = "," , use_line_collection = True )
165+ plt .grid (True )
166+ plt .show ()
167+
168+ def influence_plot (self ):
169+ '''Creates the influence plot'''
170+ import statsmodels .api as sm
171+ lm = sm .OLS (self .target_ , sm .add_constant (self .features_ )).fit ()
172+ fig , ax = plt .subplots (figsize = (10 ,8 ))
173+ fig = sm .graphics .influence_plot (lm , ax = ax , criterion = "cooks" )
174+ plt .show ()
175+
176+ def leverage_resid_plot (self ):
177+ '''Plots leverage vs normalized residuals' square'''
178+ import statsmodels .api as sm
179+ lm = sm .OLS (self .target_ , sm .add_constant (self .features_ )).fit ()
180+ fig , ax = plt .subplots (figsize = (10 ,8 ))
181+ fig = sm .graphics .plot_leverage_resid2 (lm , ax = ax )
182+ plt .show ()
183+
184+
185+ class Multicollinearity :
186+
187+ def __init__ ():
188+ pass
189+
190+ def vif (self ):
191+ '''Computes variance influence factors for each feature variable'''
192+ import statsmodels .api as sm
193+ from statsmodels .stats .outliers_influence import variance_inflation_factor as vif
194+ lm = sm .OLS (self .target_ , sm .add_constant (self .features_ )).fit ()
195+ for i in range (self .features_ .shape [1 ]):
196+ v = vif (np .matrix (self .features_ ),i )
197+ print ("Variance inflation factor for feature {}: {}" .format (i ,round (v ,2 )))
198+
199+ class MyLinearRegression (Metrics , Diagnostics_plots ,Data_plots ,Outliers ,Multicollinearity ):
200+
201+ def __init__ (self , fit_intercept = True ):
202+ self .coef_ = None
203+ self .intercept_ = None
204+ self ._fit_intercept = fit_intercept
205+
206+ def __repr__ (self ):
207+ return "I am a Linear Regression model!"
208+
209+ def fit (self , X , y ):
210+ """
211+ Fit model coefficients.
212+
213+ Arguments:
214+ X: 1D or 2D numpy array
215+ y: 1D numpy array
216+ """
217+
218+ # check if X is 1D or 2D array
219+ if len (X .shape ) == 1 :
220+ X = X .reshape (- 1 ,1 )
221+
222+ # features and data
223+ self .features_ = X
224+ self .target_ = y
225+
226+ # degrees of freedom of population dependent variable variance
227+ self .dft_ = X .shape [0 ] - 1
228+ # degrees of freedom of population error variance
229+ self .dfe_ = X .shape [0 ] - X .shape [1 ] - 1
230+
231+ # add bias if fit_intercept is True
232+ if self ._fit_intercept :
233+ X_biased = np .c_ [np .ones (X .shape [0 ]), X ]
234+ else :
235+ X_biased = X
236+
237+ # closed form solution
238+ xTx = np .dot (X_biased .T , X_biased )
239+ inverse_xTx = np .linalg .inv (xTx )
240+ xTy = np .dot (X_biased .T , y )
241+ coef = np .dot (inverse_xTx , xTy )
242+
243+ # set attributes
244+ if self ._fit_intercept :
245+ self .intercept_ = coef [0 ]
246+ self .coef_ = coef [1 :]
247+ else :
248+ self .intercept_ = 0
249+ self .coef_ = coef
250+
251+ # Predicted/fitted y
252+ self .fitted_ = np .dot (X ,self .coef_ ) + self .intercept_
253+
254+ # Residuals
255+ residuals = self .target_ - self .fitted_
256+ self .resid_ = residuals
257+
258+ def predict (self , X ):
259+ """Output model prediction.
260+
261+ Arguments:
262+ X: 1D or 2D numpy array
263+ """
264+ # check if X is 1D or 2D array
265+ if len (X .shape ) == 1 :
266+ X = X .reshape (- 1 ,1 )
267+ self .predicted_ = self .intercept_ + np .dot (X , self .coef_ )
268+ return self .predicted_
269+
270+ def run_diagnostics (self ):
271+ '''Runs diagnostics tests and plots'''
272+ Diagnostics_plots .fitted_vs_residual (self )
273+ Diagnostics_plots .histogram_resid (self )
274+ Diagnostics_plots .qqplot_resid (self )
275+ print ()
276+ Diagnostics_plots .shapiro_test (self )
277+
278+ def outlier_plots (self ):
279+ '''Creates various outlier plots'''
280+ Outliers .cook_distance (self )
281+ Outliers .influence_plot (self )
282+ Outliers .leverage_resid_plot (self )
0 commit comments