-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathNumberClassifier.py
More file actions
57 lines (45 loc) · 2.15 KB
/
NumberClassifier.py
File metadata and controls
57 lines (45 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import scipy as sp
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import numpy as np
# Digit classification script.
#
# Loads a cached one-vs-rest linear SVM if one exists on disk; otherwise trains
# it from the training CSV and caches it. The classifier is then used to label
# the rows of the test CSV, and the labels are written to RESULT_CSV.

# Raw strings keep the Windows backslashes literal without relying on Python
# passing unknown escapes such as "\c" or "\d" through unchanged.
TRAIN_CSV = r'D:\codes\datasets\kaggle\numberClassification\train.csv'
TEST_CSV = r'D:\codes\datasets\kaggle\numberClassification\test.csv'

# A single path is used for both loading and saving the model, so that a model
# trained on one run is actually picked up on the next one.
MODEL_PATH = 'numberClassifier.pickle'
RESULT_CSV = 'result.csv'

# Share of the training rows held out for validation. 0 trains on every row
# and skips validation (the original script asked for an empty hold-out set).
VALIDATION_FRACTION = 0.0


def load_training_data(csv_path=TRAIN_CSV):
    """Read the training CSV and split it into features and labels.

    Args:
        csv_path: Path of a CSV file that has a 'label' column; every other
            column is used as a feature.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays, where ``labels`` is
        one-dimensional.
    """
    frame = pd.read_csv(csv_path)
    # A 1-D label vector is what scikit-learn estimators expect for `y`.
    labels = np.asarray(frame['label'])
    features = np.asarray(frame.drop(columns=['label']))
    return features, labels


def train_classifier(features, labels, validation_fraction=VALIDATION_FRACTION):
    """Fit a one-vs-rest LinearSVC and optionally report hold-out accuracy.

    Args:
        features: 2-D array of training samples.
        labels: 1-D array of class labels, one per sample.
        validation_fraction: Fraction of the samples to hold out. When it is
            not positive, all samples are used for training and no accuracy
            is reported.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    if validation_fraction > 0:
        X_train, X_val, y_train, y_val = train_test_split(
            features, labels, test_size=validation_fraction, random_state=0)
    else:
        # Recent scikit-learn releases reject test_size=0.0, and an empty
        # hold-out set cannot be scored anyway.
        X_train, y_train = features, labels
        X_val = y_val = None

    # Multi-class classification, one binary linear model per class.
    classifier = OneVsRestClassifier(LinearSVC(random_state=0))
    classifier.fit(X_train, y_train)

    if X_val is not None:
        accuracy = accuracy_score(y_val, classifier.predict(X_val))
        print('Validation accuracy: %.4f' % accuracy)
    return classifier


def load_or_train_classifier(model_path=MODEL_PATH, train_csv=TRAIN_CSV):
    """Return the cached classifier, training and caching it if necessary.

    Args:
        model_path: Pickle file used as the model cache.
        train_csv: Training CSV used when no cached model can be opened.

    Returns:
        The unpickled or freshly trained classifier.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only point model_path
        # at files produced by this script.
        with open(model_path, 'rb') as model_file:
            return pickle.load(model_file)
    except (OSError, IOError):
        # No readable cache yet: fall through and train from scratch.
        pass

    features, labels = load_training_data(train_csv)
    classifier = train_classifier(features, labels)
    # Cache the fitted model so later runs can skip training.
    with open(model_path, 'wb') as model_file:
        pickle.dump(classifier, model_file)
    return classifier


def build_submission(predictions):
    """Pair each predicted label with its 1-based row number.

    Args:
        predictions: Sequence or array of predicted labels.

    Returns:
        An integer array of shape ``(n, 2)`` whose columns are the 1-based
        row number and the label; ``(0, 2)`` for empty input.
    """
    labels = np.asarray(predictions, dtype=int).ravel()
    image_ids = np.arange(1, labels.size + 1)
    return np.column_stack((image_ids, labels))


def write_submission(rows, result_path=RESULT_CSV):
    """Write the submission rows as CSV under an 'ImageId,Label' header.

    Args:
        rows: Integer array of shape ``(n, 2)``.
        result_path: Destination file; it is overwritten.
    """
    with open(result_path, 'wb') as out:
        out.write(b'ImageId,Label\n')
        np.savetxt(out, rows, fmt='%i', delimiter=',')


def main():
    """Load or train the classifier, label the test CSV, write the result."""
    classifier = load_or_train_classifier()
    X_test = np.asarray(pd.read_csv(TEST_CSV))
    write_submission(build_submission(classifier.predict(X_test)))


if __name__ == '__main__':
    main()