1- #########################################
2- # Model Training file
3- #########################################
4- """
51import numpy as np
62import pandas as pd
73
128
139import dill as pickle
1410
15- #from sklearn.externals import joblib
1611from sklearn .model_selection import train_test_split
1712from sklearn .linear_model import LinearRegression
1813from sklearn import metrics
1914
# Name of the dataset file and the local folder it is cached in.
data_filename = 'USA_Housing.csv'
cwd = os.getcwd()
data_dir = os.path.join(cwd, "data")
data_path = os.path.join(data_dir, data_filename)

# Checks if the dataset is in the local '/data' folder
# If not present, pulls from Github repo, otherwise reads from the local folder
if not os.path.isfile(data_path):
    url = "https://raw.githubusercontent.com/tirthajyoti/Machine-Learning-with-Python/master/Datasets/USA_Housing.csv"
    print("Downloading data from {} ".format(url))
    response = requests.get(url)
    # Stop on an HTTP error instead of parsing an error page as CSV.
    response.raise_for_status()
    s = response.content

    df = pd.read_csv(io.StringIO(s.decode('utf-8')))
    print("Dataset is downloaded.")
    # Save the data in local '/data' folder.
    # exist_ok=True: the folder may already exist while the CSV is missing.
    os.makedirs(data_dir, exist_ok=True)
    # Write under the same name that is checked and read above, and without
    # the index, so a cached reload has the same columns as a fresh download.
    df.to_csv(data_path, index=False)
    print()
else:
    df = pd.read_csv(data_path)
    print("Dataset loaded from local directory")
    print()
2735
# Collect the data frame's column names into a list
l_column = list(df.columns)
# NOTE(review): `len_feature` is assigned in lines not visible in this view;
# confirm its definition before changing the slices below.
# Features are the leading columns; the target is the column right after them.
X = df[l_column[:len_feature - 2]]
y = df[l_column[len_feature - 2]]

# print("Feature set size:",X.shape)
# print("Variable set size:",y.shape)
# print()
print("Features variables: ", l_column[:len_feature - 2])
print()
# Predict on the training data and report the R-squared of that fit
train_pred = lm.predict(X_train)
print(
    "R-squared value of this fit (on the training set):",
    round(metrics.r2_score(y_train, train_pred), 3),
)
# Test score
# test_score=lm.score(X_test,y_test)
# print("Test score: ",round(test_score,3))
print()
6977
# Main
# Saves the fitted model in a serialized format (pickle)
# If either the data or models directory does not exist, creates them
# Saves test data in a CSV file in a local '/data' folder
if __name__ == '__main__':
    filename = 'lm_model_v1.pk'
    print("Now saving the model to a serialized format (pickle)...")
    # Create the folder if needed and write into that same path, so the
    # directory that is ensured and the directory that is written always match.
    models_dir = os.path.join(cwd, "models")
    os.makedirs(models_dir, exist_ok=True)
    with open(os.path.join(models_dir, filename), 'wb') as file:
        pickle.dump(lm, file)
    # Save some of the test data in a CSV
    print("Saving test data to a file...")
    print()
    data_dir = os.path.join(cwd, "data")
    os.makedirs(data_dir, exist_ok=True)
    X_test.to_csv(os.path.join(data_dir, "housing_test.csv"))
0 commit comments