
Commit ae70684

add stackoverflow
1 parent 48eaf9a commit ae70684

File tree

11 files changed: +1482 −157 lines changed

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
from prefect import task, Flow, Parameter
from prefect.engine.results import LocalResult

from typing import Any, Dict, List

import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------- #
# Create tasks                                                                  #
# ---------------------------------------------------------------------------- #
@task
def load_data(path: str) -> pd.DataFrame:
    return pd.read_csv(path)


@task(
    target="{date:%a_%b_%d_%Y_%H-%M-%S}/{task_name}_output",
    result=LocalResult(dir="data/processed"),
)
def get_classes(data: pd.DataFrame, target_col: str) -> List[str]:
    """Task for getting the classes from the Iris data set."""
    return sorted(data[target_col].unique())


@task
def encode_categorical_columns(data: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Task for encoding the categorical columns in the Iris data set."""
    return pd.get_dummies(data, columns=[target_col], prefix="", prefix_sep="")


@task(
    log_stdout=True,
    target="{date:%a_%b_%d_%Y_%H-%M-%S}/{task_name}_output",
    result=LocalResult(dir="data/processed"),
)
def split_data(data: pd.DataFrame, test_data_ratio: float, classes: list) -> Dict[str, Any]:
    """Task for splitting the classical Iris data set into training and test
    sets, each split into features and labels.
    """
    print(f"Splitting data into training and test sets with ratio {test_data_ratio}")

    X, y = data.drop(columns=classes), data[classes]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_data_ratio)

    # When returning many variables, it is good practice to give them names:
    return dict(
        train_x=X_train,
        train_y=y_train,
        test_x=X_test,
        test_y=y_test,
    )


# ---------------------------------------------------------------------------- #
# Create a flow                                                                 #
# ---------------------------------------------------------------------------- #

with Flow("data-engineer") as flow:

    # Define parameters
    target_col = "species"
    test_data_ratio = Parameter("test_data_ratio", default=0.2)

    # Define tasks
    data = load_data(path="data/raw/iris.csv")
    classes = get_classes(data=data, target_col=target_col)
    categorical_columns = encode_categorical_columns(data=data, target_col=target_col)
    train_test_dict = split_data(
        data=categorical_columns, test_data_ratio=test_data_ratio, classes=classes
    )

# flow.visualize()
flow.run()
# flow.register(project_name="Iris Project")
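
Note: in Prefect 1.x, the target/result pair on get_classes and split_data only persists checkpoint files under data/processed when checkpointing is enabled. A minimal sketch of turning it on for a local run (the setting can equally be exported in the shell before launching the script):

import os

# Prefect 1.x skips persisting task results unless checkpointing is on;
# set this before the flow module is imported and run.
os.environ["PREFECT__FLOWS__CHECKPOINTING"] = "true"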
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
from prefect import task, Flow, Parameter
from prefect.engine.results import LocalResult

import numpy as np
import pandas as pd

import mlfoundry as mlf


# Note: this task is defined here but never called in the flow below.
@task
def setup_mlf():
    mlf_api = mlf.set_tracking_uri()
    return mlf_api.create_run(project_name="Iris-project")


# ---------------------------------------------------------------------------- #
# Create tasks                                                                  #
# ---------------------------------------------------------------------------- #
@task(log_stdout=True)
def train_model(
    train_x: pd.DataFrame, train_y: pd.DataFrame, num_train_iter: int, learning_rate: float
) -> np.ndarray:
    """Task for training a simple multi-class logistic regression model.
    The number of training iterations and the learning rate are supplied
    as Prefect parameters; the data and the parameters are provided to
    this function at execution time.
    """
    num_iter = num_train_iter
    lr = learning_rate
    X = train_x.to_numpy()
    Y = train_y.to_numpy()

    # Add bias to the features
    bias = np.ones((X.shape[0], 1))
    X = np.concatenate((bias, X), axis=1)

    weights = []
    # Train one binary model for each class in Y (one-vs-rest)
    for k in range(Y.shape[1]):
        # Initialise weights
        theta = np.zeros(X.shape[1])
        y = Y[:, k]
        for _ in range(num_iter):
            z = np.dot(X, theta)
            h = _sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            theta -= lr * gradient
        # Save the weights for each model
        weights.append(theta)

    print("Finished training the model.")

    # Return a joint multi-class model with weights for all classes
    return np.vstack(weights).transpose()


def _sigmoid(z):
    """A helper sigmoid function used by the training and the scoring tasks."""
    return 1 / (1 + np.exp(-z))


@task
def predict(model: np.ndarray, test_x: pd.DataFrame) -> np.ndarray:
    """Task for making predictions given a pre-trained model and a test set."""
    X = test_x.to_numpy()

    # Add bias to the features
    bias = np.ones((X.shape[0], 1))
    X = np.concatenate((bias, X), axis=1)

    # Predict "probabilities" for each class
    result = _sigmoid(np.dot(X, model))

    # Return the index of the class with max probability for all samples
    return np.argmax(result, axis=1)


@task(log_stdout=True)
def report_accuracy(predictions: np.ndarray, test_y: pd.DataFrame) -> None:
    """Task for reporting the accuracy of the predictions performed by the
    previous task. Notice that this function has no outputs, except logging.
    """
    # Get true class index
    target = np.argmax(test_y.to_numpy(), axis=1)
    # Calculate accuracy of predictions
    accuracy = np.sum(predictions == target) / target.shape[0]
    # Log the accuracy of the model
    print(f"Model accuracy on test set: {round(accuracy * 100, 2)}")


# ---------------------------------------------------------------------------- #
# Create a flow                                                                 #
# ---------------------------------------------------------------------------- #

with Flow("data-science") as flow:

    # Load the dictionary that the data-engineer flow checkpointed to disk.
    # Note: the directory name is the timestamp of one specific earlier run.
    train_test_dict = LocalResult(dir="data/processed/Mon_Dec_20_2021_20:55:20").read(
        location="split_data_output"
    ).value

    # Load data
    train_x = train_test_dict["train_x"]
    train_y = train_test_dict["train_y"]
    test_x = train_test_dict["test_x"]
    test_y = train_test_dict["test_y"]

    # Define parameters
    num_train_iter = Parameter("num_train_iter", default=10000)
    learning_rate = Parameter("learning_rate", default=0.01)

    # Define tasks
    model = train_model(train_x, train_y, num_train_iter, learning_rate)
    predictions = predict(model, test_x)
    report_accuracy(predictions, test_y)


flow.run()
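
The hard-coded Mon_Dec_20_2021_20:55:20 directory pins this flow to the checkpoint of one specific earlier data-engineer run. A minimal sketch of picking up the most recent checkpoint instead, assuming the timestamped run directories are the only entries under data/processed:

from pathlib import Path

from prefect.engine.results import LocalResult

# Choose the newest timestamped subdirectory by modification time.
latest_dir = max(Path("data/processed").iterdir(), key=lambda p: p.stat().st_mtime)
train_test_dict = LocalResult(dir=str(latest_dir)).read(location="split_data_output").value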
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
from prefect import Flow
from prefect.tasks.prefect import StartFlowRun

data_engineering_flow = StartFlowRun(
    flow_name="data-engineer",
    project_name="Iris Project",
    wait=True,
    parameters={"test_data_ratio": 0.3},
)
data_science_flow = StartFlowRun(
    flow_name="data-science", project_name="Iris Project", wait=True
)

with Flow("main-flow") as flow:
    result = data_science_flow(upstream_tasks=[data_engineering_flow])

flow.run()
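
StartFlowRun looks the two subflows up by name on a Prefect 1.x backend, so both must already be registered under the Iris Project before this orchestrator flow runs. A minimal sketch, using hypothetical module names for wherever the two flows are defined:

# Hypothetical imports; adjust to the actual module paths of the two flows.
from data_engineering import flow as data_engineering
from data_science import flow as data_science

data_engineering.register(project_name="Iris Project")
data_science.register(project_name="Iris Project")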
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
from prefect import Flow
from prefect.tasks.prefect import StartFlowRun
from datetime import timedelta, datetime
from prefect.schedules import IntervalSchedule

schedule = IntervalSchedule(
    start_date=datetime.utcnow() + timedelta(seconds=1),
    interval=timedelta(minutes=1),
)

data_engineering_flow = StartFlowRun(flow_name="data-engineer", project_name="Iris Project")
data_science_flow = StartFlowRun(flow_name="data-science", project_name="Iris Project")

with Flow("main-flow", schedule=schedule) as flow:
    data_science = data_science_flow(upstream_tasks=[data_engineering_flow])

# flow.register(project_name="Iris Project")
flow.run()
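
A one-minute interval is convenient for testing; for a production cadence, Prefect 1.x also provides a cron-style schedule. A minimal sketch (the cron expression is only an example):

from prefect.schedules import CronSchedule

# Run the main flow every day at 09:00 UTC instead of every minute.
schedule = CronSchedule("0 9 * * *")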
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
artifact_location: file:///home/khuyen/Data-science/data_science_tools/mlfoundry_example/mlf/mlruns/0
experiment_id: '0'
lifecycle_stage: active
name: Default
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
artifact_uri: file:///home/khuyen/Data-science/data_science_tools/mlfoundry_example/mlf/mlruns/1/70fac48397b2487bb1c3418812b1ef5e/artifacts
end_time: null
entry_point_name: ''
experiment_id: '1'
lifecycle_stage: active
name: ''
run_id: 70fac48397b2487bb1c3418812b1ef5e
run_uuid: 70fac48397b2487bb1c3418812b1ef5e
source_name: ''
source_type: 4
source_version: ''
start_time: 1641322638766
status: 1
tags: []
user_id: unknown
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
run_2022-01-04_18:57:18_utc_1
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
artifact_location: file:///home/khuyen/Data-science/data_science_tools/mlfoundry_example/mlf/mlruns/1
experiment_id: '1'
lifecycle_stage: active
name: test-project

statistics/stackoverflow_survey/analyze_salary.ipynb

Lines changed: 622 additions & 157 deletions
Large diffs are not rendered by default.
