forked from raftay/ML_Challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchallenge_basic.py
More file actions
79 lines (62 loc) · 2.46 KB
/
challenge_basic.py
File metadata and controls
79 lines (62 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
This Python file provides some useful code for reading the training file
"cleaned_data_combined.csv". You may adapt this code as you see fit. However,
keep in mind that the code provided does only basic feature transformations
to build a rudimentary kNN model in sklearn. Not all features are considered
in this code, and you should consider those features! Use this code
where appropriate, but don't stop here!
"""
import re
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
# Path to the training CSV, expected in the current working directory.
file_name = "cleaned_data_combined.csv"
# Base seed used to make the train/test shuffle reproducible.
random_state = 42
def to_numeric(s):
    """Convert `s` to a float.

    Commas (thousands separators) are stripped from string inputs before
    parsing. Unparseable strings and NaN inputs come back as float('nan').
    """
    # Strip grouping commas only when the input is textual.
    cleaned = s.replace(",", "") if isinstance(s, str) else s
    # errors="coerce" maps anything unparseable to NaN instead of raising.
    return float(pd.to_numeric(cleaned, errors="coerce"))
if __name__ == "__main__":
    df = pd.read_csv(file_name)

    # Select a subset of features for the baseline model.
    selected_features = ["Q2: How many ingredients would you expect this food item to contain?",
                         "Q3: In what setting would you expect this food to be served? Please check all that apply",
                         "Q4: How much would you expect to pay for one serving of this food item?",
                         "Q6: What drink would you pair with this food item?"]

    # Keep only the selected features plus the target column.
    df = df[selected_features + ["Label"]]

    # Handle missing values.
    df = df.fillna(0)

    # Encode categorical (object-dtype) features as integer codes.
    for col in selected_features:
        if df[col].dtype == 'object':
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Convert categorical labels to one-hot indicator columns.
    df = pd.get_dummies(df, columns=["Label"], prefix="Label")

    # The label columns are fixed after get_dummies; compute them once
    # instead of rebuilding the list on every iteration.
    label_cols = [col for col in df.columns if col.startswith("Label_")]

    n_runs = 100
    test_accs = []
    for i in range(n_runs):
        # BUG FIX: the original passed random_state=42 on every iteration,
        # so all 100 "runs" used the identical shuffle and split. Vary the
        # seed per run (still reproducible) so the average is meaningful.
        shuffled = df.sample(frac=1, random_state=random_state + i)
        x = shuffled.drop(columns=label_cols).values
        y = shuffled[label_cols].values

        # 80/20 train-test split on the shuffled rows.
        n_train = int(0.8 * len(shuffled))
        x_train, x_test = x[:n_train], x[n_train:]
        y_train, y_test = y[:n_train], y[n_train:]

        # Train and evaluate a 1-NN classifier on the one-hot targets.
        clf = KNeighborsClassifier(n_neighbors=1)
        clf.fit(x_train, y_train)
        train_acc = clf.score(x_train, y_train)
        test_acc = clf.score(x_test, y_test)
        print(f"{type(clf).__name__} train acc: {train_acc}")
        print(f"{type(clf).__name__} test acc: {test_acc}")
        test_accs.append(test_acc)

    # BUG FIX: the original divided by a hard-coded 1000 even though the
    # loop runs 100 times, understating the mean test accuracy by 10x.
    print(sum(test_accs) / len(test_accs))