Commit 0869124: create process data
1 parent: 28feac4
File tree: 7 files changed (+97, −34 lines)
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 raw:
   name: pet
   path: data/raw/pet.csv
-processed: data/processed/pet.csv
+processed: data/processed
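The `processed` entry now names a directory rather than a single CSV, because the reworked `save_data` task in process_data.py (further down) writes one pickle per data split into it. A rough illustration of the paths that fall out, using the split names returned by `split_data`:

# Illustration only: mirrors the f"{save_dir}/{name}" logic in save_data below.
save_dir = "data/processed"
splits = ["X_train", "X_test", "y_train", "y_test"]
paths = [f"{save_dir}/{name}" for name in splits]
print(paths)
# ['data/processed/X_train', 'data/processed/X_test',
#  'data/processed/y_train', 'data/processed/y_test']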

data_science_tools/prefect_2/config/process.yaml

Lines changed: 36 additions & 1 deletion

@@ -2,4 +2,39 @@ defaults:
   - data: data
   - _self_
 
-n_estimators: 20
+n_estimators: 20
+use_cols:
+  - Type
+  - Age
+  - Breed1
+  - Breed2
+  - Gender
+  - Color1
+  - Color2
+  - Color3
+  - MaturitySize
+  - FurLength
+  - Vaccinated
+  - Dewormed
+  - Sterilized
+  - Health
+  - Quantity
+  - Fee
+  - desc_length
+  - desc_words
+  - average_word_length
+  - AdoptionSpeed
+cat_cols:
+  - Type
+  - Breed1
+  - Breed2
+  - Gender
+  - Color1
+  - Color2
+  - Color3
+  - MaturitySize
+  - FurLength
+  - Vaccinated
+  - Dewormed
+  - Sterilized
+  - Health
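One detail worth knowing before reading the code below: Hydra hands `use_cols` and `cat_cols` to the flow as OmegaConf ListConfig objects rather than plain Python lists, which is presumably why `encode_cat_cols` starts with `cat_cols = list(cat_cols)`. A minimal sketch (the inline config here is a stand-in for the one `@hydra.main` composes from process.yaml):

from omegaconf import OmegaConf

# Stand-in for the composed Hydra config.
config = OmegaConf.create({"cat_cols": ["Type", "Breed1", "Gender"]})

print(type(config.cat_cols))      # <class 'omegaconf.listconfig.ListConfig'>
cat_cols = list(config.cat_cols)  # convert to a plain list before handing to pandas
print(cat_cols)                   # ['Type', 'Breed1', 'Gender']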
4 binary files changed (890 KB, 2.03 MB, 134 KB, 312 KB); binary contents not shown.

data_science_tools/prefect_2/src/process_data.py

Lines changed: 60 additions & 32 deletions

@@ -1,68 +1,96 @@
+import pickle
+
 import hydra
 import pandas as pd
 from hydra.utils import to_absolute_path as abspath
-from nltk.tokenize import TweetTokenizer
 from prefect import flow, task
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
 
+pd.options.mode.chained_assignment = None
 # ---------------------------------------------------------------------------- #
 #                                 Create tasks                                 #
 # ---------------------------------------------------------------------------- #
 
 
 @task
 def get_data(data_path: str):
-    train = pd.read_csv(abspath(data_path.train))
-    test = pd.read_csv(abspath(data_path.test))
-    return {"train": train, "test": test}
+    return pd.read_csv(abspath(data_path))
 
 
-@task
-def get_all_data(data: dict):
-    return pd.concat([data["train"], data["test"]])
+def fill_na_description(data: pd.DataFrame):
+    data["Description"] = data["Description"].fillna("")
+    return data
+
+
+def get_desc_length(data: pd.DataFrame):
+    data["desc_length"] = data["Description"].apply(len)
+    return data
+
+
+def get_desc_words(data: pd.DataFrame):
+    data["desc_words"] = data["Description"].apply(lambda x: len(x.split()))
+    return data
+
+
+def get_average_word_length(data: pd.DataFrame):
+    data["average_word_length"] = data["desc_length"] / data["desc_words"]
+    return data
 
 
 @task
-def get_vectorizer(data: pd.DataFrame):
-    tokenizer = TweetTokenizer()
-    vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
-    vectorizer.fit(data["Description"].fillna("").values)
-    return vectorizer
+def get_description_features(data: pd.DataFrame):
+    return (
+        data.pipe(fill_na_description)
+        .pipe(get_desc_length)
+        .pipe(get_desc_words)
+        .pipe(get_average_word_length)
+    )
 
 
 @task
-def encode_description(vectorizer: TfidfVectorizer, data: pd.DataFrame):
-    X_train = vectorizer.transform(data["Description"].fillna(""))
-    print(X_train)
-    print(type(X_train))
-    return X_train
+def filter_cols(use_cols: list, data: pd.DataFrame):
+    return data[use_cols]
 
 
 @task
-def get_adoption_speed(data: pd.DataFrame):
-    return data["AdoptionSpeed"]
+def encode_cat_cols(cat_cols: list, data: pd.DataFrame):
+    cat_cols = list(cat_cols)
+    data[cat_cols] = data[cat_cols].astype(str)
+    for col in cat_cols:
+        _, indexer = pd.factorize(data[col])
+        data[col] = indexer.get_indexer(data[col])
+    return data
 
 
 @task
-def get_classifier(data: pd.DataFrame, adoption_speed: pd.Series, n_estimators: int):
-    clf = RandomForestClassifier(n_estimators=n_estimators)
-    clf.fit(data, adoption_speed)
+def split_data(data: pd.DataFrame):
+    X = data.drop(columns=["AdoptionSpeed"])
+    y = data["AdoptionSpeed"]
+    X_train, X_test, y_train, y_test = train_test_split(
+        X,
+        y,
+        test_size=0.3,
+        random_state=0,
+    )
+    return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
 
 
-@flow
-def get_description_features(config, all_data, data: dict):
-    vectorizer = get_vectorizer(all_data)
-    X_train = encode_description(vectorizer, data["train"])
-    y_train = get_adoption_speed
+@task
+def save_data(data: dict, save_dir: str):
+    for name, value in data.items():
+        save_path = abspath(f"{save_dir}/{name}")
+        pickle.dump(value, open(save_path, "wb"))
 
 
 @hydra.main(config_path="../config", config_name="process", version_base=None)
 @flow
 def process_data(config):
-    data = get_data(config.data.raw)
-    all_data = get_all_data(data)
-    get_description_features(config, all_data, data)
+    data = get_data(config.data.raw.path)
+    processed = get_description_features(data)
+    filtered = filter_cols(config.use_cols, processed)
+    encoded = encode_cat_cols(config.cat_cols, filtered)
+    split = split_data(encoded)
+    save_data(split, config.data.processed)
 
 
 # ---------------------------------------------------------------------------- #
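A quick illustration of the label-encoding idiom used in `encode_cat_cols` above: `pd.factorize` returns the integer codes together with an Index of the unique values, and `Index.get_indexer` then maps each entry of a column to its position in that Index (values absent from the Index would map to -1). Toy data, not from the repository:

import pandas as pd

df = pd.DataFrame({"Type": ["Cat", "Dog", "Cat"]})  # made-up example rows
_, indexer = pd.factorize(df["Type"])               # indexer: Index(['Cat', 'Dog'])
df["Type"] = indexer.get_indexer(df["Type"])        # replace labels with integer codes
print(df["Type"].tolist())                          # [0, 1, 0]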

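A small robustness note on `save_data`: `pickle.dump(value, open(save_path, "wb"))` never closes the file handle explicitly, and `open` in "wb" mode fails if the target directory does not exist. A standalone sketch of a safer variant (the directory creation is my assumption, and the Prefect decorator and Hydra path helper are omitted to keep it self-contained):

import pickle
from pathlib import Path

def save_data(data: dict, save_dir: str) -> None:
    out = Path(save_dir)
    out.mkdir(parents=True, exist_ok=True)  # assumed: data/processed may not exist yet
    for name, value in data.items():
        with open(out / name, "wb") as f:   # context manager closes the file deterministically
            pickle.dump(value, f)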