From 8b32e8d5adf3124c01632a0fe6d29b240c825fae Mon Sep 17 00:00:00 2001
From: amlrelsa-ms
Date: Wed, 24 Mar 2021 16:45:36 +0000
Subject: [PATCH] update samples from Release-93 as a part of SDK release
---
configuration.ipynb | 2 +-
.../automated-machine-learning/automl_env.yml | 4 +-
.../automl_env_linux.yml | 4 +-
.../automl_env_mac.yml | 4 +-
.../automl_setup_mac.sh | 1 +
...fication-bank-marketing-all-features.ipynb | 11 +-
...-ml-classification-credit-card-fraud.ipynb | 11 +-
.../auto-ml-classification-text-dnn.ipynb | 11 +-
.../auto-ml-continuous-retraining.ipynb | 2 +-
.../auto-ml-regression-model-proxy.ipynb | 6 +-
.../auto-ml-forecasting-beer-remote.ipynb | 18 +-
.../auto-ml-forecasting-bike-share.ipynb | 8 +-
.../auto-ml-forecasting-energy-demand.ipynb | 15 +-
.../auto-ml-forecasting-function.ipynb | 5 +-
...to-ml-forecasting-orange-juice-sales.ipynb | 8 +-
...assification-credit-card-fraud-local.ipynb | 11 +-
...regression-explanation-featurization.ipynb | 11 +-
.../train_explainer.py | 10 +-
.../regression/auto-ml-regression.ipynb | 11 +-
.../spark_job_on_synapse_spark_pool.ipynb | 1010 ++++++++---------
.../spark_session_on_synapse_spark_pool.ipynb | 646 +++++------
...nes-parameter-tuning-with-hyperdrive.ipynb | 16 +-
...pipeline-style-transfer-parallel-run.ipynb | 42 +-
.../files/utils/callbacks.py | 2 +-
.../pong_rllib.ipynb | 17 +-
.../files/utils/callbacks.py | 2 +-
.../files/utils/callbacks.py | 2 +-
.../rai-loan-decision.yml | 1 -
.../logging-api/logging-api.ipynb | 2 +-
.../train-on-amlcompute.ipynb | 9 +-
.../labeled-datasets/labeled-datasets.ipynb | 402 -------
.../labeled-datasets/train.py | 106 --
index.md | 6 +-
setup-environment/configuration.ipynb | 2 +-
tutorials/README.md | 4 +
...ipeline-batch-scoring-classification.ipynb | 5 +-
tutorials/quickstart-ci/AzureMLIn10mins.ipynb | 669 +++++++++++
tutorials/quickstart-ci/AzureMLIn10mins.yml | 11 +
.../ClassificationWithAutomatedML.ipynb | 505 +++++++++
.../ClassificationWithAutomatedML.yml | 4 +
.../GettingStartedWithPythonSDK.ipynb | 710 ++++++++++++
.../GettingStartedWithPythonSDK.yml | 11 +
tutorials/quickstart-ci/score.py | 21 +
.../sklearn-mnist-batch/train.py | 82 ++
.../sklearn-mnist-batch/utils.py | 24 +
tutorials/quickstart-ci/utils.py | 24 +
46 files changed, 2982 insertions(+), 1506 deletions(-)
delete mode 100644 how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets.ipynb
delete mode 100644 how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets/train.py
create mode 100644 tutorials/quickstart-ci/AzureMLIn10mins.ipynb
create mode 100644 tutorials/quickstart-ci/AzureMLIn10mins.yml
create mode 100644 tutorials/quickstart-ci/ClassificationWithAutomatedML.ipynb
create mode 100644 tutorials/quickstart-ci/ClassificationWithAutomatedML.yml
create mode 100644 tutorials/quickstart-ci/GettingStartedWithPythonSDK.ipynb
create mode 100644 tutorials/quickstart-ci/GettingStartedWithPythonSDK.yml
create mode 100644 tutorials/quickstart-ci/score.py
create mode 100644 tutorials/quickstart-ci/sklearn-mnist-batch/train.py
create mode 100644 tutorials/quickstart-ci/sklearn-mnist-batch/utils.py
create mode 100644 tutorials/quickstart-ci/utils.py
diff --git a/configuration.ipynb b/configuration.ipynb
index fcba452e7..75343747b 100644
--- a/configuration.ipynb
+++ b/configuration.ipynb
@@ -103,7 +103,7 @@
"source": [
"import azureml.core\n",
"\n",
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
diff --git a/how-to-use-azureml/automated-machine-learning/automl_env.yml b/how-to-use-azureml/automated-machine-learning/automl_env.yml
index f0d34df81..e34d3ff5a 100644
--- a/how-to-use-azureml/automated-machine-learning/automl_env.yml
+++ b/how-to-use-azureml/automated-machine-learning/automl_env.yml
@@ -21,8 +21,8 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- - azureml-widgets~=1.24.0
+ - azureml-widgets~=1.25.0
- pytorch-transformers==1.0.0
- spacy==2.1.8
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- - -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.24.0/validated_win32_requirements.txt [--no-deps]
+ - -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.25.0/validated_win32_requirements.txt [--no-deps]
diff --git a/how-to-use-azureml/automated-machine-learning/automl_env_linux.yml b/how-to-use-azureml/automated-machine-learning/automl_env_linux.yml
index ec2df76c4..7c6463169 100644
--- a/how-to-use-azureml/automated-machine-learning/automl_env_linux.yml
+++ b/how-to-use-azureml/automated-machine-learning/automl_env_linux.yml
@@ -21,8 +21,8 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- - azureml-widgets~=1.24.0
+ - azureml-widgets~=1.25.0
- pytorch-transformers==1.0.0
- spacy==2.1.8
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- - -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.24.0/validated_linux_requirements.txt [--no-deps]
+ - -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.25.0/validated_linux_requirements.txt [--no-deps]
diff --git a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml
index 7986e34ed..eb0d25899 100644
--- a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml
+++ b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml
@@ -22,8 +22,8 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- - azureml-widgets~=1.24.0
+ - azureml-widgets~=1.25.0
- pytorch-transformers==1.0.0
- spacy==2.1.8
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- - -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.24.0/validated_darwin_requirements.txt [--no-deps]
+ - -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.25.0/validated_darwin_requirements.txt [--no-deps]
diff --git a/how-to-use-azureml/automated-machine-learning/automl_setup_mac.sh b/how-to-use-azureml/automated-machine-learning/automl_setup_mac.sh
index dc332e524..d2e97a989 100644
--- a/how-to-use-azureml/automated-machine-learning/automl_setup_mac.sh
+++ b/how-to-use-azureml/automated-machine-learning/automl_setup_mac.sh
@@ -32,6 +32,7 @@ if [ $? -ne 0 ]; then
fi
sed -i '' 's/AZUREML-SDK-VERSION/latest/' $AUTOML_ENV_FILE
+brew install libomp
if source activate $CONDA_ENV_NAME 2> /dev/null
then
diff --git a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb
index 60098ace6..56885e2e4 100644
--- a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb
@@ -105,7 +105,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -374,15 +374,6 @@
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "remote_run"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb
index ffe01c11c..b1d21d1a3 100644
--- a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb
@@ -93,7 +93,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -255,15 +255,6 @@
"#remote_run = AutoMLRun(experiment = experiment, run_id = '')"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "remote_run"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb b/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb
index 767e7c807..558745893 100644
--- a/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb
@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -319,15 +319,6 @@
"automl_run = experiment.submit(automl_config, show_output=True)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "automl_run"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb b/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb
index 5d7e3336e..99c5cd03d 100644
--- a/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb
@@ -81,7 +81,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
diff --git a/how-to-use-azureml/automated-machine-learning/experimental/regression-model-proxy/auto-ml-regression-model-proxy.ipynb b/how-to-use-azureml/automated-machine-learning/experimental/regression-model-proxy/auto-ml-regression-model-proxy.ipynb
index fdda92a2f..fb0b07a14 100644
--- a/how-to-use-azureml/automated-machine-learning/experimental/regression-model-proxy/auto-ml-regression-model-proxy.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/experimental/regression-model-proxy/auto-ml-regression-model-proxy.ipynb
@@ -39,6 +39,7 @@
"source": [
"## Introduction\n",
"In this example we use an experimental feature, Model Proxy, to do a predict on the best generated model without downloading the model locally. The prediction will happen on same compute and environment that was used to train the model. This feature is currently in the experimental state, which means that the API is prone to changing, please make sure to run on the latest version of this notebook if you face any issues.\n",
+ "This notebook will also leverage MLFlow for saving models, allowing for more portability of the resulting models. See https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow for more details around MLFlow is AzureML.\n",
"\n",
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
@@ -90,7 +91,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -212,10 +213,11 @@
" \"n_cross_validations\": 3,\n",
" \"primary_metric\": 'r2_score',\n",
" \"enable_early_stopping\": True, \n",
- " \"experiment_timeout_hours\": 0.3, #for real scenarios we reccommend a timeout of at least one hour \n",
+ " \"experiment_timeout_hours\": 0.3, #for real scenarios we recommend a timeout of at least one hour \n",
" \"max_concurrent_iterations\": 4,\n",
" \"max_cores_per_iteration\": -1,\n",
" \"verbosity\": logging.INFO,\n",
+ " \"save_mlflow\": True,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'regression',\n",
diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb
index 97755d3e1..3e7f2122d 100644
--- a/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb
@@ -113,7 +113,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -365,7 +365,9 @@
"source": [
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"forecasting_parameters = ForecastingParameters(\n",
- " time_column_name=time_column_name, forecast_horizon=forecast_horizon\n",
+ " time_column_name=time_column_name,\n",
+ " forecast_horizon=forecast_horizon,\n",
+ " freq='MS' # Set the forecast frequency to be monthly (start of the month)\n",
")\n",
"\n",
"automl_config = AutoMLConfig(task='forecasting', \n",
@@ -401,8 +403,7 @@
},
"outputs": [],
"source": [
- "remote_run = experiment.submit(automl_config, show_output= False)\n",
- "remote_run"
+ "remote_run = experiment.submit(automl_config, show_output= True)"
]
},
{
@@ -419,15 +420,6 @@
"# remote_run = AutoMLRun(experiment = experiment, run_id = '')"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "remote_run.wait_for_completion()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {
diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb
index 21b19f4da..afe9a218a 100644
--- a/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb
@@ -87,7 +87,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -318,7 +318,8 @@
" time_column_name=time_column_name,\n",
" forecast_horizon=forecast_horizon,\n",
" country_or_region_for_holidays='US', # set country_or_region will trigger holiday featurizer\n",
- " target_lags='auto' # use heuristic based lag setting \n",
+ " target_lags='auto', # use heuristic based lag setting\n",
+ " freq='D' # Set the forecast frequency to be daily\n",
")\n",
"\n",
"automl_config = AutoMLConfig(task='forecasting', \n",
@@ -349,8 +350,7 @@
"metadata": {},
"outputs": [],
"source": [
- "remote_run = experiment.submit(automl_config, show_output=False)\n",
- "remote_run"
+ "remote_run = experiment.submit(automl_config, show_output=False)"
]
},
{
diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb
index 6ba8039c3..e2861e0c6 100644
--- a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb
@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -342,7 +342,9 @@
"source": [
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"forecasting_parameters = ForecastingParameters(\n",
- " time_column_name=time_column_name, forecast_horizon=forecast_horizon\n",
+ " time_column_name=time_column_name,\n",
+ " forecast_horizon=forecast_horizon,\n",
+ " freq='H' # Set the forecast frequency to be hourly\n",
")\n",
"\n",
"automl_config = AutoMLConfig(task='forecasting', \n",
@@ -375,15 +377,6 @@
"remote_run = experiment.submit(automl_config, show_output=False)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "remote_run"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb
index 4517881da..0650086e4 100644
--- a/how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb
@@ -94,7 +94,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -319,7 +319,8 @@
" time_column_name=TIME_COLUMN_NAME,\n",
" forecast_horizon=forecast_horizon,\n",
" time_series_id_column_names=[ TIME_SERIES_ID_COLUMN_NAME ],\n",
- " target_lags=lags\n",
+ " target_lags=lags,\n",
+ " freq='H' # Set the forecast frequency to be hourly\n",
")"
]
},
diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb
index 2fbe860ec..47657e920 100644
--- a/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb
@@ -82,7 +82,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -423,7 +423,8 @@
"forecasting_parameters = ForecastingParameters(\n",
" time_column_name=time_column_name,\n",
" forecast_horizon=n_test_periods,\n",
- " time_series_id_column_names=time_series_id_column_names\n",
+ " time_series_id_column_names=time_series_id_column_names,\n",
+ " freq='W-THU' # Set the forecast frequency to be weekly (start on each Thursday)\n",
")\n",
"\n",
"automl_config = AutoMLConfig(task='forecasting',\n",
@@ -455,8 +456,7 @@
"metadata": {},
"outputs": [],
"source": [
- "remote_run = experiment.submit(automl_config, show_output=False)\n",
- "remote_run"
+ "remote_run = experiment.submit(automl_config, show_output=False)"
]
},
{
diff --git a/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb b/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb
index fa7bec192..bd3a59e36 100644
--- a/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb
@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -215,15 +215,6 @@
"#local_run = AutoMLRun(experiment = experiment, run_id = '')"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "local_run"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb
index d7193587c..8151a2378 100644
--- a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb
@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -305,15 +305,6 @@
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "remote_run"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py
index 4dfdba90c..ed4a95e98 100644
--- a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py
+++ b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py
@@ -27,7 +27,7 @@
# Check if this AutoML model is explainable
if not automl_check_model_if_explainable(automl_run):
- raise Exception("Model explanations is currently not supported for " + automl_run.get_properties().get(
+ raise Exception("Model explanations are currently not supported for " + automl_run.get_properties().get(
'run_algorithm'))
# Download the best model from the artifact store
@@ -38,16 +38,16 @@
# Get the train dataset from the workspace
train_dataset = Dataset.get_by_name(workspace=ws, name='<>')
-# Drop the lablled column to get the training set.
+# Drop the labeled column to get the training set.
X_train = train_dataset.drop_columns(columns=['<>'])
y_train = train_dataset.keep_columns(columns=['<>'], validate=True)
-# Get the train dataset from the workspace
+# Get the test dataset from the workspace
test_dataset = Dataset.get_by_name(workspace=ws, name='<>')
-# Drop the lablled column to get the testing set.
+# Drop the labeled column to get the testing set.
X_test = test_dataset.drop_columns(columns=['<>'])
-# Setup the class for explaining the AtuoML models
+# Setup the class for explaining the AutoML models
automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model, '<>',
X=X_train, X_test=X_test,
y=y_train)
diff --git a/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb b/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb
index fea6c7507..86bd4c2e2 100644
--- a/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb
@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -256,15 +256,6 @@
"#remote_run = AutoMLRun(experiment = experiment, run_id = '')"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "remote_run"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb b/how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb
index 8a47e6645..946cc61eb 100644
--- a/how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb
+++ b/how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb
@@ -1,507 +1,507 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Copyright (c) Microsoft Corporation. All rights reserved. \n",
- "\n",
- "Licensed under the MIT License."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Using Synapse Spark Pool as a Compute Target from Azure Machine Learning Remote Run\n",
- "1. To use Synapse Spark Pool as a compute target from Experiment Run, [ScriptRunConfig](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.script_run_config.scriptrunconfig?view=azure-ml-py) is used, the same as other Experiment Runs. This notebook demonstrates how to leverage ScriptRunConfig to submit an experiment run to an attached Synapse Spark cluster.\n",
- "2. To use Synapse Spark Pool as a compute target from [Azure Machine Learning Pipeline](https://aka.ms/pl-concept), a [SynapseSparkStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.synapse_spark_step.synapsesparkstep?view=azure-ml-py) is used. This notebook demonstrates how to leverage SynapseSparkStep in Azure Machine Learning Pipeline.\n",
- "\n",
- "## Before you begin:\n",
- "1. **Create an Azure Synapse workspace**, check [this] (https://docs.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-workspace) for more information.\n",
- "2. **Create Spark Pool in Synapse workspace**: check [this] (https://docs.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-apache-spark-pool-portal) for more information."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Azure Machine Learning and Pipeline SDK-specific imports"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import azureml.core\n",
- "from azureml.core import Workspace, Experiment\n",
- "from azureml.core import LinkedService, SynapseWorkspaceLinkedServiceConfiguration\n",
- "from azureml.core.compute import ComputeTarget, SynapseCompute\n",
- "from azureml.exceptions import ComputeTargetException\n",
- "from azureml.data import HDFSOutputDatasetConfig\n",
- "from azureml.core.datastore import Datastore\n",
- "from azureml.core.runconfig import RunConfiguration\n",
- "from azureml.core.conda_dependencies import CondaDependencies\n",
- "from azureml.pipeline.core import Pipeline\n",
- "from azureml.pipeline.steps import PythonScriptStep, SynapseSparkStep\n",
- "\n",
- "# Check core SDK version number\n",
- "print(\"SDK version:\", azureml.core.VERSION)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ws = Workspace.from_config()\n",
- "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Link Synapse workspace to AML \n",
- "You have to be an \"Owner\" of Synapse workspace resource to perform linking. You can check your role in the Azure resource management portal, if you don't have an \"Owner\" role, you can contact an \"Owner\" to link the workspaces for you."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "# Replace with your resource info before running.\n",
- "\n",
- "synapse_subscription_id=os.getenv(\"SYNAPSE_SUBSCRIPTION_ID\", \"\")\n",
- "synapse_resource_group=os.getenv(\"SYNAPSE_RESOURCE_GROUP\", \"\")\n",
- "synapse_workspace_name=os.getenv(\"SYNAPSE_WORKSPACE_NAME\", \"\")\n",
- "synapse_linked_service_name=os.getenv(\"SYNAPSE_LINKED_SERVICE_NAME\", \"\")\n",
- "\n",
- "synapse_link_config = SynapseWorkspaceLinkedServiceConfiguration(\n",
- " subscription_id=synapse_subscription_id,\n",
- " resource_group=synapse_resource_group,\n",
- " name=synapse_workspace_name\n",
- ")\n",
- "\n",
- "linked_service = LinkedService.register(\n",
- " workspace=ws,\n",
- " name=synapse_linked_service_name,\n",
- " linked_service_config=synapse_link_config)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Linked service property\n",
- "\n",
- "A MSI (system_assigned_identity_principal_id) will be generated for each linked service, for example:\n",
- "\n",
- "name=synapselink,
\n",
- "type=Synapse, \n",
- "linked_service_resource_id=/subscriptions/4faaaf21-663f-4391-96fd-47197c630979/resourceGroups/static_resources_synapse_test/providers/Microsoft.Synapse/workspaces/synapsetest2, \n",
- "system_assigned_identity_principal_id=eb355d52-3806-4c5a-aec9-91447e8cfc2e \n",
- "\n",
- "#### Make sure you grant \"Synapse Apache Spark Administrator\" role of the synapse workspace to the generated workspace linking MSI in Synapse studio portal before you submit job."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "linked_service"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "LinkedService.list(ws)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Attach Synapse spark pool as AML compute target"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "synapse_spark_pool_name=os.getenv(\"SYNAPSE_SPARK_POOL_NAME\", \"\")\n",
- "synapse_compute_name=os.getenv(\"SYNAPSE_COMPUTE_NAME\", \"\")\n",
- "\n",
- "attach_config = SynapseCompute.attach_configuration(\n",
- " linked_service,\n",
- " type=\"SynapseSpark\",\n",
- " pool_name=synapse_spark_pool_name)\n",
- "\n",
- "synapse_compute=ComputeTarget.attach(\n",
- " workspace=ws,\n",
- " name=synapse_compute_name,\n",
- " attach_configuration=attach_config)\n",
- "\n",
- "synapse_compute.wait_for_completion()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Start an experiment run"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Prepare data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Use the default blob storage\n",
- "def_blob_store = Datastore(ws, \"workspaceblobstore\")\n",
- "print('Datastore {} will be used'.format(def_blob_store.name))\n",
- "\n",
- "# We are uploading a sample file in the local directory to be used as a datasource\n",
- "file_name = \"Titanic.csv\"\n",
- "def_blob_store.upload_files(files=[\"./{}\".format(file_name)], overwrite=False)\n",
- " "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Tabular dataset as input"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core import Dataset\n",
- "titanic_tabular_dataset = Dataset.Tabular.from_delimited_files(path=[(def_blob_store, file_name)])\n",
- "input1 = titanic_tabular_dataset.as_named_input(\"tabular_input\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## File dataset as input"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core import Dataset\n",
- "titanic_file_dataset = Dataset.File.from_files(path=[(def_blob_store, file_name)])\n",
- "input2 = titanic_file_dataset.as_named_input(\"file_input\").as_hdfs()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output config: the output will be registered as a File dataset\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.data import HDFSOutputDatasetConfig\n",
- "output = HDFSOutputDatasetConfig(destination=(def_blob_store,\"test\")).register_on_complete(name=\"registered_dataset\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Dataprep script"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "os.makedirs(\"code\", exist_ok=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%writefile code/dataprep.py\n",
- "import os\n",
- "import sys\n",
- "import azureml.core\n",
- "from pyspark.sql import SparkSession\n",
- "from azureml.core import Run, Dataset\n",
- "\n",
- "print(azureml.core.VERSION)\n",
- "print(os.environ)\n",
- "\n",
- "import argparse\n",
- "parser = argparse.ArgumentParser()\n",
- "parser.add_argument(\"--tabular_input\")\n",
- "parser.add_argument(\"--file_input\")\n",
- "parser.add_argument(\"--output_dir\")\n",
- "args = parser.parse_args()\n",
- "\n",
- "# use dataset sdk to read tabular dataset\n",
- "run_context = Run.get_context()\n",
- "dataset = Dataset.get_by_id(run_context.experiment.workspace,id=args.tabular_input)\n",
- "sdf = dataset.to_spark_dataframe()\n",
- "sdf.show()\n",
- "\n",
- "# use hdfs path to read file dataset\n",
- "spark= SparkSession.builder.getOrCreate()\n",
- "sdf = spark.read.option(\"header\", \"true\").csv(args.file_input)\n",
- "sdf.show()\n",
- "\n",
- "sdf.coalesce(1).write\\\n",
- ".option(\"header\", \"true\")\\\n",
- ".mode(\"append\")\\\n",
- ".csv(args.output_dir)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Set up Conda dependency for the following Script Run"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core.environment import CondaDependencies\n",
- "conda_dep = CondaDependencies()\n",
- "conda_dep.add_pip_package(\"azureml-core==1.20.0\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## How to leverage ScriptRunConfig to submit an experiment run to an attached Synapse Spark cluster"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core import RunConfiguration\n",
- "from azureml.core import ScriptRunConfig \n",
- "from azureml.core import Experiment\n",
- "\n",
- "run_config = RunConfiguration(framework=\"pyspark\")\n",
- "run_config.target = synapse_compute_name\n",
- "\n",
- "run_config.spark.configuration[\"spark.driver.memory\"] = \"1g\" \n",
- "run_config.spark.configuration[\"spark.driver.cores\"] = 2 \n",
- "run_config.spark.configuration[\"spark.executor.memory\"] = \"1g\" \n",
- "run_config.spark.configuration[\"spark.executor.cores\"] = 1 \n",
- "run_config.spark.configuration[\"spark.executor.instances\"] = 1 \n",
- "\n",
- "run_config.environment.python.conda_dependencies = conda_dep\n",
- "\n",
- "script_run_config = ScriptRunConfig(source_directory = './code',\n",
- " script= 'dataprep.py',\n",
- " arguments = [\"--tabular_input\", input1, \n",
- " \"--file_input\", input2,\n",
- " \"--output_dir\", output],\n",
- " run_config = run_config) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core import Experiment \n",
- "exp = Experiment(workspace=ws, name=\"synapse-spark\") \n",
- "run = exp.submit(config=script_run_config) \n",
- "run"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## How to leverage SynapseSparkStep in an AML pipeline to orchestrate data prep step on Synapse Spark and training step on AzureML compute."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Choose a name for your CPU cluster\n",
- "cpu_cluster_name = \"cpucluster\"\n",
- "\n",
- "# Verify that cluster does not exist already\n",
- "try:\n",
- " cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
- " print('Found existing cluster, use it.')\n",
- "except ComputeTargetException:\n",
- " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n",
- " max_nodes=1)\n",
- " cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
- "\n",
- "cpu_cluster.wait_for_completion(show_output=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%writefile code/train.py\n",
- "import glob\n",
- "import os\n",
- "import sys\n",
- "from os import listdir\n",
- "from os.path import isfile, join\n",
- "\n",
- "mypath = os.environ[\"step2_input\"]\n",
- "files = [f for f in listdir(mypath) if isfile(join(mypath, f))]\n",
- "for file in files:\n",
- " with open(join(mypath,file)) as f:\n",
- " print(f.read())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "titanic_tabular_dataset = Dataset.Tabular.from_delimited_files(path=[(def_blob_store, file_name)])\n",
- "titanic_file_dataset = Dataset.File.from_files(path=[(def_blob_store, file_name)])\n",
- "\n",
- "step1_input1 = titanic_tabular_dataset.as_named_input(\"tabular_input\")\n",
- "step1_input2 = titanic_file_dataset.as_named_input(\"file_input\").as_hdfs()\n",
- "step1_output = HDFSOutputDatasetConfig(destination=(def_blob_store,\"test\")).register_on_complete(name=\"registered_dataset\")\n",
- "\n",
- "step2_input = step1_output.as_input(\"step2_input\").as_download()\n",
- "\n",
- "\n",
- "from azureml.core.environment import Environment\n",
- "env = Environment(name=\"myenv\")\n",
- "env.python.conda_dependencies.add_pip_package(\"azureml-core==1.20.0\")\n",
- "\n",
- "step_1 = SynapseSparkStep(name = 'synapse-spark',\n",
- " file = 'dataprep.py',\n",
- " source_directory=\"./code\", \n",
- " inputs=[step1_input1, step1_input2],\n",
- " outputs=[step1_output],\n",
- " arguments = [\"--tabular_input\", step1_input1, \n",
- " \"--file_input\", step1_input2,\n",
- " \"--output_dir\", step1_output],\n",
- " compute_target = synapse_compute_name,\n",
- " driver_memory = \"7g\",\n",
- " driver_cores = 4,\n",
- " executor_memory = \"7g\",\n",
- " executor_cores = 2,\n",
- " num_executors = 1,\n",
- " environment = env)\n",
- "\n",
- "step_2 = PythonScriptStep(script_name=\"train.py\",\n",
- " arguments=[step2_input],\n",
- " inputs=[step2_input],\n",
- " compute_target=cpu_cluster_name,\n",
- " source_directory=\"./code\",\n",
- " allow_reuse=False)\n",
- "\n",
- "pipeline = Pipeline(workspace=ws, steps=[step_1, step_2])\n",
- "pipeline_run = pipeline.submit('synapse-pipeline', regenerate_outputs=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "authors": [
- {
- "name": "yunzhan"
- }
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copyright (c) Microsoft Corporation. All rights reserved. \n",
+ "\n",
+ "Licensed under the MIT License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Using Synapse Spark Pool as a Compute Target from Azure Machine Learning Remote Run\n",
+ "1. To use Synapse Spark Pool as a compute target from Experiment Run, [ScriptRunConfig](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.script_run_config.scriptrunconfig?view=azure-ml-py) is used, the same as other Experiment Runs. This notebook demonstrates how to leverage ScriptRunConfig to submit an experiment run to an attached Synapse Spark cluster.\n",
+ "2. To use Synapse Spark Pool as a compute target from [Azure Machine Learning Pipeline](https://aka.ms/pl-concept), a [SynapseSparkStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.synapse_spark_step.synapsesparkstep?view=azure-ml-py) is used. This notebook demonstrates how to leverage SynapseSparkStep in Azure Machine Learning Pipeline.\n",
+ "\n",
+ "## Before you begin:\n",
+ "1. **Create an Azure Synapse workspace**, check [this] (https://docs.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-workspace) for more information.\n",
+ "2. **Create Spark Pool in Synapse workspace**: check [this] (https://docs.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-apache-spark-pool-portal) for more information."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Azure Machine Learning and Pipeline SDK-specific imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import azureml.core\n",
+ "from azureml.core import Workspace, Experiment\n",
+ "from azureml.core import LinkedService, SynapseWorkspaceLinkedServiceConfiguration\n",
+ "from azureml.core.compute import ComputeTarget, SynapseCompute\n",
+ "from azureml.exceptions import ComputeTargetException\n",
+ "from azureml.data import HDFSOutputDatasetConfig\n",
+ "from azureml.core.datastore import Datastore\n",
+ "from azureml.core.runconfig import RunConfiguration\n",
+ "from azureml.core.conda_dependencies import CondaDependencies\n",
+ "from azureml.pipeline.core import Pipeline\n",
+ "from azureml.pipeline.steps import PythonScriptStep, SynapseSparkStep\n",
+ "\n",
+ "# Check core SDK version number\n",
+ "print(\"SDK version:\", azureml.core.VERSION)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ws = Workspace.from_config()\n",
+ "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Link Synapse workspace to AML \n",
+ "You have to be an \"Owner\" of Synapse workspace resource to perform linking. You can check your role in the Azure resource management portal, if you don't have an \"Owner\" role, you can contact an \"Owner\" to link the workspaces for you."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Replace with your resource info before running.\n",
+ "\n",
+ "synapse_subscription_id=os.getenv(\"SYNAPSE_SUBSCRIPTION_ID\", \"\")\n",
+ "synapse_resource_group=os.getenv(\"SYNAPSE_RESOURCE_GROUP\", \"\")\n",
+ "synapse_workspace_name=os.getenv(\"SYNAPSE_WORKSPACE_NAME\", \"\")\n",
+ "synapse_linked_service_name=os.getenv(\"SYNAPSE_LINKED_SERVICE_NAME\", \"\")\n",
+ "\n",
+ "synapse_link_config = SynapseWorkspaceLinkedServiceConfiguration(\n",
+ " subscription_id=synapse_subscription_id,\n",
+ " resource_group=synapse_resource_group,\n",
+ " name=synapse_workspace_name\n",
+ ")\n",
+ "\n",
+ "linked_service = LinkedService.register(\n",
+ " workspace=ws,\n",
+ " name=synapse_linked_service_name,\n",
+ " linked_service_config=synapse_link_config)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Linked service property\n",
+ "\n",
+ "A MSI (system_assigned_identity_principal_id) will be generated for each linked service, for example:\n",
+ "\n",
+ "name=synapselink,\n",
+ "type=Synapse, \n",
+ "linked_service_resource_id=/subscriptions/4faaaf21-663f-4391-96fd-47197c630979/resourceGroups/static_resources_synapse_test/providers/Microsoft.Synapse/workspaces/synapsetest2, \n",
+ "system_assigned_identity_principal_id=eb355d52-3806-4c5a-aec9-91447e8cfc2e \n",
+ "\n",
+ "#### Make sure you grant \"Synapse Apache Spark Administrator\" role of the synapse workspace to the generated workspace linking MSI in Synapse studio portal before you submit job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "linked_service"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "LinkedService.list(ws)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Attach Synapse spark pool as AML compute target"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "synapse_spark_pool_name=os.getenv(\"SYNAPSE_SPARK_POOL_NAME\", \"\")\n",
+ "synapse_compute_name=os.getenv(\"SYNAPSE_COMPUTE_NAME\", \"\")\n",
+ "\n",
+ "attach_config = SynapseCompute.attach_configuration(\n",
+ " linked_service,\n",
+ " type=\"SynapseSpark\",\n",
+ " pool_name=synapse_spark_pool_name)\n",
+ "\n",
+ "synapse_compute=ComputeTarget.attach(\n",
+ " workspace=ws,\n",
+ " name=synapse_compute_name,\n",
+ " attach_configuration=attach_config)\n",
+ "\n",
+ "synapse_compute.wait_for_completion()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Start an experiment run"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use the default blob storage\n",
+ "def_blob_store = Datastore(ws, \"workspaceblobstore\")\n",
+ "print('Datastore {} will be used'.format(def_blob_store.name))\n",
+ "\n",
+ "# We are uploading a sample file in the local directory to be used as a datasource\n",
+ "file_name = \"Titanic.csv\"\n",
+ "def_blob_store.upload_files(files=[\"./{}\".format(file_name)], overwrite=False)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tabular dataset as input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.core import Dataset\n",
+ "titanic_tabular_dataset = Dataset.Tabular.from_delimited_files(path=[(def_blob_store, file_name)])\n",
+ "input1 = titanic_tabular_dataset.as_named_input(\"tabular_input\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## File dataset as input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.core import Dataset\n",
+ "titanic_file_dataset = Dataset.File.from_files(path=[(def_blob_store, file_name)])\n",
+ "input2 = titanic_file_dataset.as_named_input(\"file_input\").as_hdfs()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Output config: the output will be registered as a File dataset\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.data import HDFSOutputDatasetConfig\n",
+ "output = HDFSOutputDatasetConfig(destination=(def_blob_store,\"test\")).register_on_complete(name=\"registered_dataset\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Dataprep script"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.makedirs(\"code\", exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile code/dataprep.py\n",
+ "import os\n",
+ "import sys\n",
+ "import azureml.core\n",
+ "from pyspark.sql import SparkSession\n",
+ "from azureml.core import Run, Dataset\n",
+ "\n",
+ "print(azureml.core.VERSION)\n",
+ "print(os.environ)\n",
+ "\n",
+ "import argparse\n",
+ "parser = argparse.ArgumentParser()\n",
+ "parser.add_argument(\"--tabular_input\")\n",
+ "parser.add_argument(\"--file_input\")\n",
+ "parser.add_argument(\"--output_dir\")\n",
+ "args = parser.parse_args()\n",
+ "\n",
+ "# use dataset sdk to read tabular dataset\n",
+ "run_context = Run.get_context()\n",
+ "dataset = Dataset.get_by_id(run_context.experiment.workspace,id=args.tabular_input)\n",
+ "sdf = dataset.to_spark_dataframe()\n",
+ "sdf.show()\n",
+ "\n",
+ "# use hdfs path to read file dataset\n",
+ "spark= SparkSession.builder.getOrCreate()\n",
+ "sdf = spark.read.option(\"header\", \"true\").csv(args.file_input)\n",
+ "sdf.show()\n",
+ "\n",
+ "sdf.coalesce(1).write\\\n",
+ ".option(\"header\", \"true\")\\\n",
+ ".mode(\"append\")\\\n",
+ ".csv(args.output_dir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Set up Conda dependency for the following Script Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.core.environment import CondaDependencies\n",
+ "conda_dep = CondaDependencies()\n",
+ "conda_dep.add_pip_package(\"azureml-core==1.20.0\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to leverage ScriptRunConfig to submit an experiment run to an attached Synapse Spark cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.core import RunConfiguration\n",
+ "from azureml.core import ScriptRunConfig \n",
+ "from azureml.core import Experiment\n",
+ "\n",
+ "run_config = RunConfiguration(framework=\"pyspark\")\n",
+ "run_config.target = synapse_compute_name\n",
+ "\n",
+ "run_config.spark.configuration[\"spark.driver.memory\"] = \"1g\" \n",
+ "run_config.spark.configuration[\"spark.driver.cores\"] = 2 \n",
+ "run_config.spark.configuration[\"spark.executor.memory\"] = \"1g\" \n",
+ "run_config.spark.configuration[\"spark.executor.cores\"] = 1 \n",
+ "run_config.spark.configuration[\"spark.executor.instances\"] = 1 \n",
+ "\n",
+ "run_config.environment.python.conda_dependencies = conda_dep\n",
+ "\n",
+ "script_run_config = ScriptRunConfig(source_directory = './code',\n",
+ " script= 'dataprep.py',\n",
+ " arguments = [\"--tabular_input\", input1, \n",
+ " \"--file_input\", input2,\n",
+ " \"--output_dir\", output],\n",
+ " run_config = run_config) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azureml.core import Experiment \n",
+ "exp = Experiment(workspace=ws, name=\"synapse-spark\") \n",
+ "run = exp.submit(config=script_run_config) \n",
+ "run"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to leverage SynapseSparkStep in an AML pipeline to orchestrate data prep step on Synapse Spark and training step on AzureML compute."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Choose a name for your CPU cluster\n",
+ "cpu_cluster_name = \"cpucluster\"\n",
+ "\n",
+ "# Verify that cluster does not exist already\n",
+ "try:\n",
+ " cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
+ " print('Found existing cluster, use it.')\n",
+ "except ComputeTargetException:\n",
+ " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n",
+ " max_nodes=1)\n",
+ " cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
+ "\n",
+ "cpu_cluster.wait_for_completion(show_output=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile code/train.py\n",
+ "import glob\n",
+ "import os\n",
+ "import sys\n",
+ "from os import listdir\n",
+ "from os.path import isfile, join\n",
+ "\n",
+ "mypath = os.environ[\"step2_input\"]\n",
+ "files = [f for f in listdir(mypath) if isfile(join(mypath, f))]\n",
+ "for file in files:\n",
+ " with open(join(mypath,file)) as f:\n",
+ " print(f.read())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "titanic_tabular_dataset = Dataset.Tabular.from_delimited_files(path=[(def_blob_store, file_name)])\n",
+ "titanic_file_dataset = Dataset.File.from_files(path=[(def_blob_store, file_name)])\n",
+ "\n",
+ "step1_input1 = titanic_tabular_dataset.as_named_input(\"tabular_input\")\n",
+ "step1_input2 = titanic_file_dataset.as_named_input(\"file_input\").as_hdfs()\n",
+ "step1_output = HDFSOutputDatasetConfig(destination=(def_blob_store,\"test\")).register_on_complete(name=\"registered_dataset\")\n",
+ "\n",
+ "step2_input = step1_output.as_input(\"step2_input\").as_download()\n",
+ "\n",
+ "\n",
+ "from azureml.core.environment import Environment\n",
+ "env = Environment(name=\"myenv\")\n",
+ "env.python.conda_dependencies.add_pip_package(\"azureml-core==1.20.0\")\n",
+ "\n",
+ "step_1 = SynapseSparkStep(name = 'synapse-spark',\n",
+ " file = 'dataprep.py',\n",
+ " source_directory=\"./code\", \n",
+ " inputs=[step1_input1, step1_input2],\n",
+ " outputs=[step1_output],\n",
+ " arguments = [\"--tabular_input\", step1_input1, \n",
+ " \"--file_input\", step1_input2,\n",
+ " \"--output_dir\", step1_output],\n",
+ " compute_target = synapse_compute_name,\n",
+ " driver_memory = \"7g\",\n",
+ " driver_cores = 4,\n",
+ " executor_memory = \"7g\",\n",
+ " executor_cores = 2,\n",
+ " num_executors = 1,\n",
+ " environment = env)\n",
+ "\n",
+ "step_2 = PythonScriptStep(script_name=\"train.py\",\n",
+ " arguments=[step2_input],\n",
+ " inputs=[step2_input],\n",
+ " compute_target=cpu_cluster_name,\n",
+ " source_directory=\"./code\",\n",
+ " allow_reuse=False)\n",
+ "\n",
+ "pipeline = Pipeline(workspace=ws, steps=[step_1, step_2])\n",
+ "pipeline_run = pipeline.submit('synapse-pipeline', regenerate_outputs=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
],
- "kernelspec": {
- "display_name": "Python 3.6",
- "language": "python",
- "name": "python36"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.7"
- },
- "nteract": {
- "version": "0.28.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+ "metadata": {
+ "authors": [
+ {
+ "name": "yunzhan"
+ }
+ ],
+ "kernelspec": {
+ "display_name": "Python 3.6",
+ "language": "python",
+ "name": "python36"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ },
+ "nteract": {
+ "version": "0.28.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb b/how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb
index 26ae450d8..1eebb8a45 100644
--- a/how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb
+++ b/how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb
@@ -1,327 +1,327 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Copyright (c) Microsoft Corporation. All rights reserved.\n",
- "\n",
- "Licensed under the MIT License."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Interactive Spark Session on Synapse Spark Pool"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Install package"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install -U \"azureml-synapse\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "For JupyterLab, please additionally run:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!jupyter lab build --minimize=False"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## PLEASE restart kernel and then refresh web page before starting spark session."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 0. How to leverage Spark Magic for interactive Spark experience"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2020-06-05T03:22:14.965395Z",
- "iopub.status.busy": "2020-06-05T03:22:14.965395Z",
- "iopub.status.idle": "2020-06-05T03:22:14.970398Z",
- "shell.execute_reply": "2020-06-05T03:22:14.969397Z",
- "shell.execute_reply.started": "2020-06-05T03:22:14.965395Z"
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copyright (c) Microsoft Corporation. All rights reserved.\n",
+ "\n",
+ "Licensed under the MIT License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Interactive Spark Session on Synapse Spark Pool"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Install package"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -U \"azureml-synapse\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For JupyterLab, please additionally run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!jupyter lab build --minimize=False"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## PLEASE restart kernel and then refresh web page before starting spark session."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 0. How to leverage Spark Magic for interactive Spark experience"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2020-06-05T03:22:14.965395Z",
+ "iopub.status.busy": "2020-06-05T03:22:14.965395Z",
+ "iopub.status.idle": "2020-06-05T03:22:14.970398Z",
+ "shell.execute_reply": "2020-06-05T03:22:14.969397Z",
+ "shell.execute_reply.started": "2020-06-05T03:22:14.965395Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# show help\n",
+ "%synapse ?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Start Synapse Session"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "synapse_compute_name=os.getenv(\"SYNAPSE_COMPUTE_NAME\", \"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# use Synapse compute linked to the Compute Instance's workspace with an aml envrionment.\n",
+ "# conda dependencies specified in the environment will be installed before the spark session started.\n",
+ "\n",
+ "%synapse start -c $synapse_compute_name -e AzureML-Minimal"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# use Synapse compute from anther workspace via its config file\n",
+ "\n",
+ "# %synapse start -c -f config.json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# use Synapse compute from anther workspace via subscription_id, resource_group and workspace_name\n",
+ "\n",
+ "# %synapse start -c -s -r -w "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# start a spark session with an AML environment, \n",
+ "# %synapse start -c -s -r -w -e AzureML-Minimal"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Data prepration\n",
+ "\n",
+ "Three types of datastore are supported in synapse spark, and you have two ways to load the data.\n",
+ "\n",
+ "\n",
+ "| Datastore Type | Data Acess |\n",
+ "|--------------------|-------------------------------|\n",
+ "| Blob | Credential |\n",
+ "| Adlsgen1 | Credential & Credential-less |\n",
+ "| Adlsgen2 | Credential & Credential-less |"
+ ]
+ },
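+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As an optional sketch (not part of the original walkthrough), you can list the datastores registered in your workspace from the local kernel to see which of the types above are available to you. This assumes a workspace config file is present so `Workspace.from_config()` resolves."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: runs on the local kernel (no %%synapse) and lists the\n",
+ "# registered datastores with their types (e.g. AzureBlob, AzureDataLakeGen2).\n",
+ "from azureml.core import Workspace\n",
+ "\n",
+ "ws = Workspace.from_config()\n",
+ "for name, datastore in ws.datastores.items():\n",
+ "    print(name, datastore.datastore_type)"
+ ]
+ },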
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Example 1: Data loading by HDFS path"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Read data from Blob**\n",
+ "\n",
+ "```python\n",
+ "# setup access key or sas token\n",
+ "\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.key..blob.core.windows.net\", \"\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.sas...blob.core.windows.net\", \"sas token\")\n",
+ "\n",
+ "df = spark.read.parquet(\"wasbs://@.blob.core.windows.net/\")\n",
+ "```\n",
+ "\n",
+ "**Read data from Adlsgen1**\n",
+ "\n",
+ "```python\n",
+ "# setup service pricinpal which has access of the data\n",
+ "# If no data Credential is setup, the user identity will be used to do access control\n",
+ "\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.access.token.provider.type\",\"ClientCredential\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.client.id\", \"\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.credential\", \"\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.refresh.url\", \"https://login.microsoftonline.com//oauth2/token\")\n",
+ "\n",
+ "df = spark.read.csv(\"adl://.azuredatalakestore.net/\")\n",
+ "```\n",
+ "\n",
+ "**Read data from Adlsgen2**\n",
+ "\n",
+ "```python\n",
+ "# setup service pricinpal which has access of the data\n",
+ "# If no data Credential is setup, the user identity will be used to do access control\n",
+ "\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.auth.type..dfs.core.windows.net\",\"OAuth\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth.provider.type..dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth2.client.id..dfs.core.windows.net\", \"\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth2.client.secret..dfs.core.windows.net\", \"\")\n",
+ "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net\", \"https://login.microsoftonline.com//oauth2/token\")\n",
+ "\n",
+ "df = spark.read.csv(\"abfss://@.dfs.core.windows.net/\")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2020-06-04T08:11:18.812276Z",
+ "iopub.status.busy": "2020-06-04T08:11:18.812276Z",
+ "iopub.status.idle": "2020-06-04T08:11:23.854526Z",
+ "shell.execute_reply": "2020-06-04T08:11:23.853525Z",
+ "shell.execute_reply.started": "2020-06-04T08:11:18.812276Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%synapse\n",
+ "\n",
+ "from pyspark.sql.functions import col, desc\n",
+ "\n",
+ "df = spark.read.option(\"header\", \"true\").csv(\"wasbs://demo@dprepdata.blob.core.windows.net/Titanic.csv\")\n",
+ "df.filter(col('Survived') == 1).groupBy('Age').count().orderBy(desc('count')).show(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Example 2: Data loading by AML Dataset\n",
+ "\n",
+ "You can create tabular data by following the [guidance](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets) and use to_spark_dataframe() to load the data.\n",
+ "\n",
+ "```text\n",
+ "%%synapse\n",
+ "\n",
+ "import azureml.core\n",
+ "print(azureml.core.VERSION)\n",
+ "\n",
+ "from azureml.core import Workspace, Dataset\n",
+ "ws = Workspace.get(name='', subscription_id='', resource_group='')\n",
+ "ds = Dataset.get_by_name(ws, \"\")\n",
+ "df = ds.to_spark_dataframe()\n",
+ "\n",
+ "# You can do more data transformation on spark dataframe\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Session Metadata\n",
+ "After session started, you can check the session's metadata, find the links to Synapse portal."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%synapse meta"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Stop Session\n",
+ "When current session reach the status timeout, dead or any failure, you must explicitly stop it before start new one. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%synapse stop"
+ ]
}
- },
- "outputs": [],
- "source": [
- "# show help\n",
- "%synapse ?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Start Synapse Session"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "synapse_compute_name=os.getenv(\"SYNAPSE_COMPUTE_NAME\", \"\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# use Synapse compute linked to the Compute Instance's workspace with an aml envrionment.\n",
- "# conda dependencies specified in the environment will be installed before the spark session started.\n",
- "\n",
- "%synapse start -c $synapse_compute_name -e AzureML-Minimal"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# use Synapse compute from anther workspace via its config file\n",
- "\n",
- "# %synapse start -c -f config.json"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# use Synapse compute from anther workspace via subscription_id, resource_group and workspace_name\n",
- "\n",
- "# %synapse start -c -s -r -w "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# start a spark session with an AML environment, \n",
- "# %synapse start -c -s -r -w -e AzureML-Minimal"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Data prepration\n",
- "\n",
- "Three types of datastore are supported in synapse spark, and you have two ways to load the data.\n",
- "\n",
- "\n",
- "| Datastore Type | Data Acess |\n",
- "|--------------------|-------------------------------|\n",
- "| Blob | Credential |\n",
- "| Adlsgen1 | Credential & Credential-less |\n",
- "| Adlsgen2 | Credential & Credential-less |"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Example 1: Data loading by HDFS path"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Read data from Blob**\n",
- "\n",
- "```python\n",
- "# setup access key or sas token\n",
- "\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.key..blob.core.windows.net\", \"\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.sas...blob.core.windows.net\", \"sas token\")\n",
- "\n",
- "df = spark.read.parquet(\"wasbs://@.blob.core.windows.net/\")\n",
- "```\n",
- "\n",
- "**Read data from Adlsgen1**\n",
- "\n",
- "```python\n",
- "# setup service pricinpal which has access of the data\n",
- "# If no data Credential is setup, the user identity will be used to do access control\n",
- "\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.access.token.provider.type\",\"ClientCredential\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.client.id\", \"\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.credential\", \"\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.adl.account..oauth2.refresh.url\", \"https://login.microsoftonline.com//oauth2/token\")\n",
- "\n",
- "df = spark.read.csv(\"adl://.azuredatalakestore.net/\")\n",
- "```\n",
- "\n",
- "**Read data from Adlsgen2**\n",
- "\n",
- "```python\n",
- "# setup service pricinpal which has access of the data\n",
- "# If no data Credential is setup, the user identity will be used to do access control\n",
- "\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.auth.type..dfs.core.windows.net\",\"OAuth\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth.provider.type..dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth2.client.id..dfs.core.windows.net\", \"\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth2.client.secret..dfs.core.windows.net\", \"\")\n",
- "sc._jsc.hadoopConfiguration().set(\"fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net\", \"https://login.microsoftonline.com//oauth2/token\")\n",
- "\n",
- "df = spark.read.csv(\"abfss://@.dfs.core.windows.net/\")\n",
- "```"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2020-06-04T08:11:18.812276Z",
- "iopub.status.busy": "2020-06-04T08:11:18.812276Z",
- "iopub.status.idle": "2020-06-04T08:11:23.854526Z",
- "shell.execute_reply": "2020-06-04T08:11:23.853525Z",
- "shell.execute_reply.started": "2020-06-04T08:11:18.812276Z"
- }
- },
- "outputs": [],
- "source": [
- "%%synapse\n",
- "\n",
- "from pyspark.sql.functions import col, desc\n",
- "\n",
- "df = spark.read.option(\"header\", \"true\").csv(\"wasbs://demo@dprepdata.blob.core.windows.net/Titanic.csv\")\n",
- "df.filter(col('Survived') == 1).groupBy('Age').count().orderBy(desc('count')).show(10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Example 2: Data loading by AML Dataset\n",
- "\n",
- "You can create tabular data by following the [guidance](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets) and use to_spark_dataframe() to load the data.\n",
- "\n",
- "```text\n",
- "%%synapse\n",
- "\n",
- "import azureml.core\n",
- "print(azureml.core.VERSION)\n",
- "\n",
- "from azureml.core import Workspace, Dataset\n",
- "ws = Workspace.get(name='', subscription_id='', resource_group='')\n",
- "ds = Dataset.get_by_name(ws, \"\")\n",
- "df = ds.to_spark_dataframe()\n",
- "\n",
- "# You can do more data transformation on spark dataframe\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Session Metadata\n",
- "After session started, you can check the session's metadata, find the links to Synapse portal."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%synapse meta"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Stop Session\n",
- "When current session reach the status timeout, dead or any failure, you must explicitly stop it before start new one. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%synapse stop"
- ]
- }
- ],
- "metadata": {
- "authors": [
- {
- "name": "yunzhan"
- }
],
- "kernelspec": {
- "display_name": "Python 3.6",
- "language": "python",
- "name": "python36"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "metadata": {
+ "authors": [
+ {
+ "name": "yunzhan"
+ }
+ ],
+ "kernelspec": {
+ "display_name": "Python 3.6",
+ "language": "python",
+ "name": "python36"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ },
+ "nteract": {
+ "version": "0.28.0"
+ }
},
- "nteract": {
- "version": "0.28.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb
index cdd3922ec..ab1c2ee21 100644
--- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb
+++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb
@@ -125,13 +125,13 @@
"os.makedirs(data_folder, exist_ok=True)\n",
"\n",
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',\n",
- " filename=os.path.join(data_folder, 'train-images-idx3-ubyte.gz'))\n",
+ " filename=os.path.join(data_folder, 'train-images.gz'))\n",
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz',\n",
- " filename=os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'))\n",
+ " filename=os.path.join(data_folder, 'train-labels.gz'))\n",
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n",
- " filename=os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'))\n",
+ " filename=os.path.join(data_folder, 'test-images.gz'))\n",
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',\n",
- " filename=os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'))"
+ " filename=os.path.join(data_folder, 'test-labels.gz'))"
]
},
{
@@ -151,10 +151,10 @@
"from utils import load_data\n",
"\n",
"# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster.\n",
- "X_train = load_data(os.path.join(data_folder, 'train-images-idx3-ubyte.gz'), False) / np.float32(255.0)\n",
- "X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / np.float32(255.0)\n",
- "y_train = load_data(os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'), True).reshape(-1)\n",
- "y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)\n",
+ "X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / np.float32(255.0)\n",
+ "X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / np.float32(255.0)\n",
+ "y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)\n",
+ "y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)\n",
"\n",
"\n",
"count = 0\n",
diff --git a/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer-parallel-run.ipynb b/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer-parallel-run.ipynb
index 5f888d820..18686362f 100644
--- a/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer-parallel-run.ipynb
+++ b/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer-parallel-run.ipynb
@@ -81,12 +81,12 @@
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
- "from azureml.core.datastore import Datastore\n",
- "from azureml.data.data_reference import DataReference\n",
- "from azureml.pipeline.core import Pipeline, PipelineData\n",
+ "from azureml.core import Datastore, Dataset\n",
+ "from azureml.pipeline.core import Pipeline\n",
"from azureml.pipeline.steps import PythonScriptStep\n",
"from azureml.core.runconfig import CondaDependencies, RunConfiguration\n",
- "from azureml.core.compute_target import ComputeTargetException"
+ "from azureml.core.compute_target import ComputeTargetException\n",
+ "from azureml.data import OutputFileDatasetConfig"
]
},
{
@@ -297,9 +297,7 @@
"outputs": [],
"source": [
"video_name=os.getenv(\"STYLE_TRANSFER_VIDEO_NAME\", \"orangutan.mp4\") \n",
- "orangutan_video = DataReference(datastore=video_ds,\n",
- " data_reference_name=\"video\",\n",
- " path_on_datastore=video_name, mode=\"download\")"
+ "orangutan_video = Dataset.File.from_files((video_ds,video_name))"
]
},
{
@@ -325,13 +323,11 @@
"metadata": {},
"outputs": [],
"source": [
- "ffmpeg_audio = PipelineData(name=\"ffmpeg_audio\", datastore=default_datastore)\n",
- "processed_images = PipelineData(name=\"processed_images\", datastore=default_datastore)\n",
- "output_video = PipelineData(name=\"output_video\", datastore=default_datastore)\n",
+ "ffmpeg_audio = OutputFileDatasetConfig(name=\"ffmpeg_audio\")\n",
+ "processed_images = OutputFileDatasetConfig(name=\"processed_images\")\n",
+ "output_video = OutputFileDatasetConfig(name=\"output_video\")\n",
"\n",
- "ffmpeg_images_ds_name = \"ffmpeg_images_data\"\n",
- "ffmpeg_images = PipelineData(name=\"ffmpeg_images\", datastore=default_datastore)\n",
- "ffmpeg_images_file_dataset = ffmpeg_images.as_dataset()"
+ "ffmpeg_images = OutputFileDatasetConfig(name=\"ffmpeg_images\")"
]
},
{
@@ -367,13 +363,10 @@
"split_video_step = PythonScriptStep(\n",
" name=\"split video\",\n",
" script_name=\"process_video.py\",\n",
- " arguments=[\"--input_video\", orangutan_video,\n",
+ " arguments=[\"--input_video\", orangutan_video.as_mount(),\n",
" \"--output_audio\", ffmpeg_audio,\n",
- " \"--output_images\", ffmpeg_images_file_dataset,\n",
- " ],\n",
+ " \"--output_images\", ffmpeg_images],\n",
" compute_target=cpu_cluster,\n",
- " inputs=[orangutan_video],\n",
- " outputs=[ffmpeg_images_file_dataset, ffmpeg_audio],\n",
" runconfig=amlcompute_run_config,\n",
" source_directory=scripts_folder\n",
")\n",
@@ -381,12 +374,10 @@
"stitch_video_step = PythonScriptStep(\n",
" name=\"stitch\",\n",
" script_name=\"stitch_video.py\",\n",
- " arguments=[\"--images_dir\", processed_images, \n",
- " \"--input_audio\", ffmpeg_audio, \n",
+ " arguments=[\"--images_dir\", processed_images.as_input(), \n",
+ " \"--input_audio\", ffmpeg_audio.as_input(), \n",
" \"--output_dir\", output_video],\n",
" compute_target=cpu_cluster,\n",
- " inputs=[processed_images, ffmpeg_audio],\n",
- " outputs=[output_video],\n",
" runconfig=amlcompute_run_config,\n",
" source_directory=scripts_folder\n",
")"
@@ -415,7 +406,6 @@
"parallel_cd.add_conda_package(\"torchvision\")\n",
"parallel_cd.add_conda_package(\"pillow<7\") # needed for torchvision==0.4.0\n",
"parallel_cd.add_pip_package(\"azureml-core\")\n",
- "parallel_cd.add_pip_package(\"azureml-dataset-runtime[fuse]\")\n",
"\n",
"styleenvironment = Environment(name=\"styleenvironment\")\n",
"styleenvironment.python.conda_dependencies=parallel_cd\n",
@@ -457,7 +447,7 @@
"\n",
"distributed_style_transfer_step = ParallelRunStep(\n",
" name=parallel_step_name,\n",
- " inputs=[ffmpeg_images_file_dataset], # Input file share/blob container/file dataset\n",
+ " inputs=[ffmpeg_images], # Input file share/blob container/file dataset\n",
" output=processed_images, # Output file share/blob container\n",
" arguments=[\"--style\", style_param],\n",
" parallel_run_config=parallel_run_config,\n",
@@ -552,8 +542,8 @@
"source": [
"def download_video(run, target_dir=None):\n",
" stitch_run = run.find_step_run(stitch_video_step.name)[0]\n",
- " port_data = stitch_run.get_output_data(output_video.name)\n",
- " port_data.download(target_dir, show_progress=True)"
+ " port_data = stitch_run.get_details()['outputDatasets'][0]['dataset']\n",
+ " port_data.download(target_dir)"
]
},
{
diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py
index f34a4e8c8..6d317f7c8 100644
--- a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py
+++ b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py
@@ -8,7 +8,7 @@
def on_train_result(info):
'''Callback on train result to record metrics returned by trainer.
'''
- run = Run.get_context()
+ run = Run.get_context().parent
run.log(
name='episode_reward_mean',
value=info["result"]["episode_reward_mean"])
diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb
index f8e2dc8ed..5e6c94635 100644
--- a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb
+++ b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb
@@ -423,9 +423,6 @@
"source": [
"from azureml.contrib.train.rl import WorkerConfiguration\n",
"\n",
- "# Pip packages we will use for both head and worker\n",
- "pip_packages=[\"ray[rllib]==0.8.3\"] # Latest version of Ray has fixes for isses related to object transfers\n",
- "\n",
"# Specify the Ray worker configuration\n",
"worker_conf = WorkerConfiguration(\n",
" \n",
@@ -439,7 +436,6 @@
" use_gpu=False, \n",
" \n",
" # PIP packages to use\n",
- " pip_packages=pip_packages\n",
")"
]
},
@@ -508,14 +504,11 @@
" # The Azure Machine Learning compute target set up for Ray head nodes\n",
" compute_target=head_compute_target,\n",
" \n",
- " # Pip packages\n",
- " pip_packages=pip_packages,\n",
- " \n",
" # GPU usage\n",
" use_gpu=True,\n",
" \n",
" # Reinforcement learning framework. Currently must be Ray.\n",
- " rl_framework=Ray(),\n",
+ " rl_framework=Ray('0.8.3'),\n",
" \n",
" # Ray worker configuration defined above.\n",
" worker_configuration=worker_conf,\n",
@@ -651,14 +644,8 @@
"metadata": {},
"outputs": [],
"source": [
- "# Get all child runs\n",
- "child_runs = list(run.get_children(_rehydrate_runs=False))\n",
- "\n",
"# Get the reward metrics from worker run\n",
- "if child_runs[0].id.endswith(\"_worker\"):\n",
- " episode_reward_mean = child_runs[0].get_metrics(name='episode_reward_mean')\n",
- "else:\n",
- " episode_reward_mean = child_runs[1].get_metrics(name='episode_reward_mean')"
+ "episode_reward_mean = run.get_metrics(name='episode_reward_mean')"
]
},
{
diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py
index f34a4e8c8..6d317f7c8 100644
--- a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py
+++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py
@@ -8,7 +8,7 @@
def on_train_result(info):
'''Callback on train result to record metrics returned by trainer.
'''
- run = Run.get_context()
+ run = Run.get_context().parent
run.log(
name='episode_reward_mean',
value=info["result"]["episode_reward_mean"])
diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py
index 022aadf01..625a809ab 100644
--- a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py
+++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py
@@ -8,7 +8,7 @@
def on_train_result(info):
'''Callback on train result to record metrics returned by trainer.
'''
- run = Run.get_context()
+ run = Run.get_context().parent
run.log(
name='episode_reward_mean',
value=info["result"]["episode_reward_mean"])
diff --git a/how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.yml b/how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.yml
index 3fffcf2ff..89a23710c 100644
--- a/how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.yml
+++ b/how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.yml
@@ -4,7 +4,6 @@ dependencies:
- azureml-sdk
- azureml-interpret
- azureml-contrib-fairness
- - interpret-community[visualization]
- fairlearn==0.4.6
- matplotlib
- azureml-dataset-runtime
diff --git a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb
index 5ccabd654..ce0d1402d 100644
--- a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb
+++ b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb
@@ -100,7 +100,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
- "print(\"This notebook was created using SDK version 1.24.0, you are currently running version\", azureml.core.VERSION)"
+ "print(\"This notebook was created using SDK version 1.25.0, you are currently running version\", azureml.core.VERSION)"
]
},
{
diff --git a/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb b/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb
index e19272288..6f5869fa6 100644
--- a/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb
+++ b/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb
@@ -179,12 +179,14 @@
"outputs": [],
"source": [
"from azureml.core import Environment\n",
+ "from azureml.core.runconfig import DockerConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"myenv = Environment(\"myenv\")\n",
+ "myenv.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn', 'packaging'])\n",
"\n",
- "myenv.docker.enabled = True\n",
- "myenv.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn', 'packaging'])"
+ "# Enable Docker\n",
+ "docker_config = DockerConfiguration(use_docker=True)"
]
},
{
@@ -245,7 +247,8 @@
"src = ScriptRunConfig(source_directory=project_folder, \n",
" script='train.py', \n",
" compute_target=cpu_cluster, \n",
- " environment=myenv)\n",
+ " environment=myenv,\n",
+ " docker_runtime_config=docker_config)\n",
" \n",
"run = experiment.submit(config=src)\n",
"run"
diff --git a/how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets.ipynb b/how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets.ipynb
deleted file mode 100644
index 3ae1ec691..000000000
--- a/how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets.ipynb
+++ /dev/null
@@ -1,402 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Copyright (c) Microsoft Corporation. All rights reserved.\n",
- "\n",
- "Licensed under the MIT License."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Introduction to labeled datasets\n",
- "\n",
- "Labeled datasets are output from Azure Machine Learning [labeling projects](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-create-labeling-projects). It captures the reference to the data (e.g. image files) and its labels. \n",
- "\n",
- "This tutorial introduces the capabilities of labeled datasets and how to use it in training.\n",
- "\n",
- "Learn how-to:\n",
- "\n",
- "> * Set up your development environment\n",
- "> * Explore labeled datasets\n",
- "> * Train a simple deep learning neural network on a remote cluster\n",
- "\n",
- "## Prerequisite:\n",
- "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning\n",
- "* Go through Azure Machine Learning [labeling projects](https://docs.microsoft.com/azure/machine-learning/service/how-to-create-labeling-projects) and export the labels as an Azure Machine Learning dataset\n",
- "* Go through the [configuration notebook](../../../configuration.ipynb) to:\n",
- " * install the latest version of azureml-sdk\n",
- " * install the latest version of azureml-contrib-dataset\n",
- " * install [PyTorch](https://pytorch.org/)\n",
- " * create a workspace and its configuration file (`config.json`)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Set up your development environment\n",
- "\n",
- "All the setup for your development work can be accomplished in a Python notebook. Setup includes:\n",
- "\n",
- "* Importing Python packages\n",
- "* Connecting to a workspace to enable communication between your local computer and remote resources\n",
- "* Creating an experiment to track all your runs\n",
- "* Creating a remote compute target to use for training\n",
- "\n",
- "### Import packages\n",
- "\n",
- "Import Python packages you need in this session. Also display the Azure Machine Learning SDK version."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import azureml.core\n",
- "import azureml.contrib.dataset\n",
- "from azureml.core import Dataset, Workspace, Experiment\n",
- "from azureml.contrib.dataset import FileHandlingOption\n",
- "\n",
- "# check core SDK version number\n",
- "print(\"Azure ML SDK Version: \", azureml.core.VERSION)\n",
- "print(\"Azure ML Contrib Version\", azureml.contrib.dataset.VERSION)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Connect to workspace\n",
- "\n",
- "Create a workspace object from the existing workspace. `Workspace.from_config()` reads the file **config.json** and loads the details into an object named `workspace`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# load workspace\n",
- "workspace = Workspace.from_config()\n",
- "print('Workspace name: ' + workspace.name, \n",
- " 'Azure region: ' + workspace.location, \n",
- " 'Subscription id: ' + workspace.subscription_id, \n",
- " 'Resource group: ' + workspace.resource_group, sep='\\n')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create experiment and a directory\n",
- "\n",
- "Create an experiment to track the runs in your workspace and a directory to deliver the necessary code from your computer to the remote resource."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# create an ML experiment\n",
- "exp = Experiment(workspace=workspace, name='labeled-datasets')\n",
- "\n",
- "# create a directory\n",
- "script_folder = './labeled-datasets'\n",
- "os.makedirs(script_folder, exist_ok=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create or Attach existing compute resource\n",
- "By using Azure Machine Learning Compute, a managed service, data scientists can train machine learning models on clusters of Azure virtual machines. Examples include VMs with GPU support. In this tutorial, you will create Azure Machine Learning Compute as your training environment. The code below creates the compute clusters for you if they don't already exist in your workspace.\n",
- "\n",
- "**Creation of compute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace the code will skip the creation process."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core.compute import ComputeTarget, AmlCompute\n",
- "from azureml.core.compute_target import ComputeTargetException\n",
- "\n",
- "# choose a name for your cluster\n",
- "cluster_name = \"openhack\"\n",
- "\n",
- "try:\n",
- " compute_target = ComputeTarget(workspace=workspace, name=cluster_name)\n",
- " print('Found existing compute target')\n",
- "except ComputeTargetException:\n",
- " print('Creating a new compute target...')\n",
- " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
- " max_nodes=4)\n",
- "\n",
- " # create the cluster\n",
- " compute_target = ComputeTarget.create(workspace, cluster_name, compute_config)\n",
- "\n",
- " # can poll for a minimum number of nodes and for a specific timeout. \n",
- " # if no min node count is provided it uses the scale settings for the cluster\n",
- " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
- "\n",
- "# use get_status() to get a detailed status for the current cluster. \n",
- "print(compute_target.get_status().serialize())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Explore labeled datasets\n",
- "\n",
- "**Note**: How to create labeled datasets is not covered in this tutorial. To create labeled datasets, you can go through [labeling projects](https://docs.microsoft.com/azure/machine-learning/service/how-to-create-labeling-projects) and export the output labels as Azure Machine Lerning datasets. \n",
- "\n",
- "`animal_labels` used in this tutorial section is the output from a labeling project, with the task type of \"Object Identification\"."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# get animal_labels dataset from the workspace\n",
- "animal_labels = Dataset.get_by_name(workspace, 'animal_labels')\n",
- "animal_labels"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can load labeled datasets into pandas DataFrame. There are 3 file handling option that you can choose to load the data files referenced by the labeled datasets:\n",
- "* Streaming: The default option to load data files.\n",
- "* Download: Download your data files to a local path.\n",
- "* Mount: Mount your data files to a mount point. Mount only works for Linux-based compute, including Azure Machine Learning notebook VM and Azure Machine Learning Compute."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "animal_pd = animal_labels.to_pandas_dataframe(file_handling_option=FileHandlingOption.DOWNLOAD, target_path='./download/', overwrite_download=True)\n",
- "animal_pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import matplotlib.image as mpimg\n",
- "\n",
- "# read images from downloaded path\n",
- "img = mpimg.imread(animal_pd.loc[0,'image_url'])\n",
- "imgplot = plt.imshow(img)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can also load labeled datasets into [torchvision datasets](https://pytorch.org/docs/stable/torchvision/datasets.html), so that you can leverage on the open source libraries provided by PyTorch for image transformation and training."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from torchvision.transforms import functional as F\n",
- "\n",
- "# load animal_labels dataset into torchvision dataset\n",
- "pytorch_dataset = animal_labels.to_torchvision()\n",
- "img = pytorch_dataset[0][0]\n",
- "print(type(img))\n",
- "\n",
- "# use methods from torchvision to transform the img into grayscale\n",
- "pil_image = F.to_pil_image(img)\n",
- "gray_image = F.to_grayscale(pil_image, num_output_channels=3)\n",
- "\n",
- "imgplot = plt.imshow(gray_image)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Train an image classification model\n",
- "\n",
- " `crack_labels` dataset used in this tutorial section is the output from a labeling project, with the task type of \"Image Classification Multi-class\". We will use this dataset to train an image classification model that classify whether an image has cracks or not."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# get crack_labels dataset from the workspace\n",
- "crack_labels = Dataset.get_by_name(workspace, 'crack_labels')\n",
- "crack_labels"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Configure training job\n",
- "\n",
- "You can ask the system to build a conda environment based on your dependency specification. Once the environment is built, and if you don't change your dependencies, it will be reused in subsequent runs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core import Environment\n",
- "from azureml.core.conda_dependencies import CondaDependencies\n",
- "\n",
- "conda_env = Environment('conda-env')\n",
- "conda_env.python.conda_dependencies = CondaDependencies.create(pip_packages=['azureml-sdk',\n",
- " 'azureml-contrib-dataset',\n",
- " 'torch','torchvision',\n",
- " 'azureml-dataset-runtime[pandas]'])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "A ScriptRunConfig object is used to submit the run. Create a ScriptRunConfig by specifying\n",
- "\n",
- "* The directory that contains your scripts. All the files in this directory are uploaded into the cluster nodes for execution. \n",
- "* The training script name, train.py\n",
- "* The input dataset for training\n",
- "* The compute target. In this case you will use the AmlCompute you created\n",
- "* The environment for the experiment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.core import ScriptRunConfig\n",
- "\n",
- "src = ScriptRunConfig(source_directory=script_folder,\n",
- " script='train.py',\n",
- " arguments=[crack_labels.as_named_input('crack_labels')],\n",
- " compute_target=compute_target,\n",
- " enviroment=conda_env)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Submit job to run\n",
- "\n",
- "Submit the ScriptRunConfig to the Azure ML experiment to kick off the execution."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "run = exp.submit(src)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "run.wait_for_completion(show_output=True)"
- ]
- }
- ],
- "metadata": {
- "authors": [
- {
- "name": "sihhu"
- }
- ],
- "category": "tutorial",
- "compute": [
- "Remote"
- ],
- "deployment": [
- "None"
- ],
- "exclude_from_index": false,
- "framework": [
- "Azure ML"
- ],
- "friendly_name": "Introduction to labeled datasets",
- "index_order": 1,
- "kernelspec": {
- "display_name": "Python 3.6",
- "language": "python",
- "name": "python36"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.9"
- },
- "nteract": {
- "version": "nteract-front-end@1.0.0"
- },
- "star_tag": [
- "featured"
- ],
- "tags": [
- "Dataset",
- "label",
- "Estimator"
- ],
- "task": "Train"
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
diff --git a/how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets/train.py b/how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets/train.py
deleted file mode 100644
index a4bfc53e0..000000000
--- a/how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets/train.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import os
-import torchvision
-import torchvision.transforms as transforms
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-
-from azureml.core import Dataset, Run
-import azureml.contrib.dataset
-from azureml.contrib.dataset import FileHandlingOption, LabeledDatasetTask
-
-run = Run.get_context()
-
-# get input dataset by name
-labeled_dataset = run.input_datasets['crack_labels']
-pytorch_dataset = labeled_dataset.to_torchvision()
-
-
-indices = torch.randperm(len(pytorch_dataset)).tolist()
-dataset_train = torch.utils.data.Subset(pytorch_dataset, indices[:40])
-dataset_test = torch.utils.data.Subset(pytorch_dataset, indices[-10:])
-
-trainloader = torch.utils.data.DataLoader(dataset_train, batch_size=4,
- shuffle=True, num_workers=0)
-
-testloader = torch.utils.data.DataLoader(dataset_test, batch_size=4,
- shuffle=True, num_workers=0)
-
-
-class Net(nn.Module):
- def __init__(self):
- super(Net, self).__init__()
- self.conv1 = nn.Conv2d(3, 6, 5)
- self.pool = nn.MaxPool2d(2, 2)
- self.conv2 = nn.Conv2d(6, 16, 5)
- self.fc1 = nn.Linear(16 * 71 * 71, 120)
- self.fc2 = nn.Linear(120, 84)
- self.fc3 = nn.Linear(84, 10)
-
- def forward(self, x):
- x = self.pool(F.relu(self.conv1(x)))
- x = self.pool(F.relu(self.conv2(x)))
- x = x.view(x.size(0), 16 * 71 * 71)
- x = F.relu(self.fc1(x))
- x = F.relu(self.fc2(x))
- x = self.fc3(x)
- return x
-
-
-net = Net()
-
-criterion = nn.CrossEntropyLoss()
-optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
-
-
-for epoch in range(2): # loop over the dataset multiple times
-
- running_loss = 0.0
- for i, data in enumerate(trainloader, 0):
- # get the inputs; data is a list of [inputs, labels]
- inputs, labels = data
-
- # zero the parameter gradients
- optimizer.zero_grad()
-
- # forward + backward + optimize
- outputs = net(inputs)
- loss = criterion(outputs, labels)
- loss.backward()
- optimizer.step()
-
- # print statistics
- running_loss += loss.item()
- if i % 5 == 4: # print every 5 mini-batches
- print('[%d, %5d] loss: %.3f' %
- (epoch + 1, i + 1, running_loss / 5))
- running_loss = 0.0
-
-print('Finished Training')
-classes = trainloader.dataset.dataset.labels
-PATH = './cifar_net.pth'
-torch.save(net.state_dict(), PATH)
-
-dataiter = iter(testloader)
-images, labels = dataiter.next()
-
-net = Net()
-net.load_state_dict(torch.load(PATH))
-
-outputs = net(images)
-
-_, predicted = torch.max(outputs, 1)
-
-correct = 0
-total = 0
-with torch.no_grad():
- for data in testloader:
- images, labels = data
- outputs = net(images)
- _, predicted = torch.max(outputs.data, 1)
- total += labels.size(0)
- correct += (predicted == labels).sum().item()
-
-print('Accuracy of the network on the 10 test images: %d %%' % (100 * correct / total))
-pass
diff --git a/index.md b/index.md
index 3739461cf..aaa000284 100644
--- a/index.md
+++ b/index.md
@@ -19,7 +19,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [Forecasting orange juice sales with deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb) | Forecasting | Orange Juice Sales | Remote | Azure Container Instance | Azure ML AutoML | None |
| [Register a model and deploy locally](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb) | Deployment | None | Local | Local | None | None |
| :star:[Data drift quickdemo](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datadrift-tutorial/datadrift-tutorial.ipynb) | Filtering | NOAA | Remote | None | Azure ML | Dataset, Timeseries, Drift |
-| :star:[Introduction to labeled datasets](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/labeled-datasets/labeled-datasets.ipynb) | Train | | Remote | None | Azure ML | Dataset, label, Estimator |
| :star:[Datasets with ML Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/pipeline-with-datasets/pipeline-for-image-classification.ipynb) | Train | Fashion MNIST | Remote | None | Azure ML | Dataset, Pipeline, Estimator, ScriptRun |
| :star:[Filtering data using Tabular Timeseiries Dataset related API](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/timeseries-datasets/tabular-timeseries-dataset-filtering.ipynb) | Filtering | NOAA | Local | None | Azure ML | Dataset, Tabular Timeseries |
| :star:[Train with Datasets (Tabular and File)](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets/train-with-datasets.ipynb) | Train | Iris, Diabetes | Remote | None | Azure ML | Dataset, Estimator, ScriptRun |
@@ -110,6 +109,8 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [auto-ml-regression](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb) | | | | | | |
| [automl-databricks-local-01](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb) | | | | | | |
| [automl-databricks-local-with-deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb) | | | | | | |
+| [spark_job_on_synapse_spark_pool](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb) | | | | | | |
+| [spark_session_on_synapse_spark_pool](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb) | | | | | | |
| [multi-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb) | | | | | | |
| [register-model-deploy-local-advanced](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb) | | | | | | |
| [enable-app-insights-in-production-service](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) | | | | | | |
@@ -141,4 +142,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [img-classification-part3-deploy-encrypted](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part3-deploy-encrypted.ipynb) | | | | | | |
| [tutorial-pipeline-batch-scoring-classification](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb) | | | | | | |
| [azureml-quickstart](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/quickstart/azureml-quickstart.ipynb) | | | | | | |
+| [AzureMLIn10mins](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/quickstart-ci/AzureMLIn10mins.ipynb) | | | | | | |
+| [ClassificationWithAutomatedML](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/quickstart-ci/ClassificationWithAutomatedML.ipynb) | | | | | | |
+| [GettingStartedWithPythonSDK](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/quickstart-ci/GettingStartedWithPythonSDK.ipynb) | | | | | | |
| [regression-automated-ml](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/regression-automl-nyc-taxi-data/regression-automated-ml.ipynb) | | | | | | |
diff --git a/setup-environment/configuration.ipynb b/setup-environment/configuration.ipynb
index 895a64bb5..80691cf1a 100644
--- a/setup-environment/configuration.ipynb
+++ b/setup-environment/configuration.ipynb
@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
- "print(\"This notebook was created using version 1.24.0 of the Azure ML SDK\")\n",
+ "print(\"This notebook was created using version 1.25.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
diff --git a/tutorials/README.md b/tutorials/README.md
index 45f56410a..6136eae51 100644
--- a/tutorials/README.md
+++ b/tutorials/README.md
@@ -23,6 +23,10 @@ The following tutorials are intended to provide an introductory overview of Azur
| [Deploy an image classification model](https://docs.microsoft.com/azure/machine-learning/tutorial-deploy-models-with-aml) | Deploy a scikit-learn image classification model to Azure Container Instances. | [img-classification-part2-deploy.ipynb](image-classification-mnist-data/img-classification-part2-deploy.ipynb) | Image Classification | Scikit-Learn
| [Deploy an encrypted inferencing service](https://docs.microsoft.com/azure/machine-learning/tutorial-deploy-models-with-aml) |Deploy an image classification model for encrypted inferencing in Azure Container Instances | [img-classification-part3-deploy-encrypted.ipynb](image-classification-mnist-data/img-classification-part3-deploy-encrypted.ipynb) | Image Classification | Scikit-Learn
| [Use automated machine learning to predict taxi fares](https://docs.microsoft.com/azure/machine-learning/tutorial-auto-train-models) | Train a regression model to predict taxi fares using Automated Machine Learning. | [regression-part2-automated-ml.ipynb](regression-automl-nyc-taxi-data/regression-automated-ml.ipynb) | Regression | Automated ML
+| Azure ML in 10 minutes, to be run on a Compute Instance | Learn how to train an image classification model, track model metrics, and deploy the model in 10 minutes. | [AzureMLIn10mins.ipynb](quickstart-ci/AzureMLIn10mins.ipynb) | Image Classification | Scikit-Learn |
+| Get started with Azure ML job submission, to be run on a Compute Instance | Learn how to use the Azure Machine Learning Python SDK to submit batch jobs. | [GettingStartedWithPythonSDK.ipynb](quickstart-ci/GettingStartedWithPythonSDK.ipynb) | Image Classification | Scikit-Learn |
+| Get started with Automated ML, to be run on a Compute Instance | Learn how to use Automated ML for fraud classification. | [ClassificationWithAutomatedML.ipynb](quickstart-ci/ClassificationWithAutomatedML.ipynb) | Classification | Automated ML |
+
## Advanced Samples
diff --git a/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb b/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb
index 14effa639..e593fabcc 100644
--- a/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb
+++ b/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb
@@ -337,7 +337,7 @@
" error_threshold=1,\n",
" compute_target=compute_target,\n",
" process_count_per_node=2,\n",
- " node_count=1\n",
+ " node_count=2\n",
")"
]
},
@@ -367,10 +367,11 @@
"source": [
"from azureml.pipeline.steps import ParallelRunStep\n",
"from datetime import datetime\n",
+ "import uuid\n",
"\n",
"parallel_step_name = \"batchscoring-\" + datetime.now().strftime(\"%Y%m%d%H%M\")\n",
"\n",
- "label_config = label_ds.as_named_input(\"labels_input\")\n",
+ "label_config = label_ds.as_named_input(\"labels_input\").as_mount(\"/tmp/{}\".format(str(uuid.uuid4())))\n",
"\n",
"batch_score_step = ParallelRunStep(\n",
" name=parallel_step_name,\n",
diff --git a/tutorials/quickstart-ci/AzureMLIn10mins.ipynb b/tutorials/quickstart-ci/AzureMLIn10mins.ipynb
new file mode 100644
index 000000000..c6901872d
--- /dev/null
+++ b/tutorials/quickstart-ci/AzureMLIn10mins.ipynb
@@ -0,0 +1,669 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "# Quickstart: Train and deploy a model in Azure Machine Learning in 10 minutes\n",
+ "\n",
+ "In this quickstart, learn how to get started with Azure Machine Learning. You'll train an image classification model using the [MNIST](https://azure.microsoft.com/services/open-datasets/catalog/mnist/) dataset.\n",
+ "\n",
+ "You'll learn how to:\n",
+ "\n",
+ "> * Download a dataset and look at the data\n",
+ "> * Train an image classification model and log metrics\n",
+ "> * Deploy the model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Connect to your workspace and create an experiment\n",
+ "\n",
+ "Import some libraries and create an experiment to track the runs in your workspace. A workspace can have multiple experiments, and all users that have access to the workspace can collaborate on them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965916889
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import azureml.core\n",
+ "from azureml.core import Workspace\n",
+ "from azureml.core import Experiment\n",
+ "\n",
+ "# connect to your workspace\n",
+ "ws = Workspace.from_config()\n",
+ "\n",
+ "# create experiment and start logging to a new run in the experiment\n",
+ "experiment_name = \"azure-ml-in10-mins-tutorial\"\n",
+ "exp = Experiment(workspace=ws, name=experiment_name)\n",
+ "run = exp.start_logging(snapshot_directory=None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Import Data\n",
+ "\n",
+ "Before you train a model, you need to understand the data you're using to train it. In this section, learn how to:\n",
+ "\n",
+ "* Download the MNIST dataset\n",
+ "* Display some sample images\n",
+ "\n",
+ "### Download the MNIST dataset\n",
+ "\n",
+ "You'll use Azure Open Datasets to get the raw MNIST data files. [Azure Open Datasets](https://docs.microsoft.com/azure/open-datasets/overview-what-are-open-datasets) are curated public datasets that you can use to add scenario-specific features to machine learning solutions for better models. Each dataset has a corresponding class, `MNIST` in this case, to retrieve the data in different ways."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965922274
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from azureml.core import Dataset\n",
+ "from azureml.opendatasets import MNIST\n",
+ "\n",
+ "data_folder = os.path.join(os.getcwd(), \"data\")\n",
+ "os.makedirs(data_folder, exist_ok=True)\n",
+ "\n",
+ "mnist_file_dataset = MNIST.get_file_dataset()\n",
+ "mnist_file_dataset.download(data_folder, overwrite=True)\n",
+ "\n",
+ "mnist_file_dataset = mnist_file_dataset.register(\n",
+ " workspace=ws,\n",
+ " name=\"mnist_opendataset\",\n",
+ " description=\"training and test dataset\",\n",
+ " create_new_version=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Take a look at the data\n",
+ "\n",
+ "Load the compressed files into `numpy` arrays. Then use `matplotlib` to plot 30 random images from the dataset with their labels above them. \n",
+ "\n",
+ "Note this step requires a `load_data` function that's included in an `utils.py` file. This file is placed in the same folder as this notebook. The `load_data` function simply parses the compressed files into numpy arrays."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965929041
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from utils import load_data\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import glob\n",
+ "\n",
+ "\n",
+ "# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the model converge faster.\n",
+ "X_train = (\n",
+ " load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/train-images-idx3-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " False,\n",
+ " )\n",
+ " / 255.0\n",
+ ")\n",
+ "X_test = (\n",
+ " load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/t10k-images-idx3-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " False,\n",
+ " )\n",
+ " / 255.0\n",
+ ")\n",
+ "y_train = load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/train-labels-idx1-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " True,\n",
+ ").reshape(-1)\n",
+ "y_test = load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/t10k-labels-idx1-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " True,\n",
+ ").reshape(-1)\n",
+ "\n",
+ "\n",
+ "# now let's show some randomly chosen images from the traininng set.\n",
+ "count = 0\n",
+ "sample_size = 30\n",
+ "plt.figure(figsize=(16, 6))\n",
+ "for i in np.random.permutation(X_train.shape[0])[:sample_size]:\n",
+ " count = count + 1\n",
+ " plt.subplot(1, sample_size, count)\n",
+ " plt.axhline(\"\")\n",
+ " plt.axvline(\"\")\n",
+ " plt.text(x=10, y=-10, s=y_train[i], fontsize=18)\n",
+ " plt.imshow(X_train[i].reshape(28, 28), cmap=plt.cm.Greys)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Train model and log metrics\n",
+ "\n",
+ "You'll train the model using the code below. Your training runs and metrics will be registered in the experiment you created, so that this information is available after you've finished.\n",
+ "\n",
+ "You'll be using the [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) classifier from the [SciKit Learn framework](https://scikit-learn.org/) to classify the data.\n",
+ "\n",
+ "> **Note: The model training takes around 1 minute to complete.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612966046970
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# create the model\n",
+ "import numpy as np\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "reg = 0.5\n",
+ "clf = LogisticRegression(\n",
+ " C=1.0 / reg, solver=\"liblinear\", multi_class=\"auto\", random_state=42\n",
+ ")\n",
+ "clf.fit(X_train, y_train)\n",
+ "\n",
+ "# make predictions using the test set and calculate the accuracy\n",
+ "y_hat = clf.predict(X_test)\n",
+ "\n",
+ "# calculate accuracy on the prediction\n",
+ "acc = np.average(y_hat == y_test)\n",
+ "print(\"Accuracy is\", acc)\n",
+ "\n",
+ "run.log(\"regularization rate\", np.float(reg))\n",
+ "run.log(\"accuracy\", np.float(acc))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Version control your models with the model registry\n",
+ "\n",
+ "You can use model registration to store and version your models in your workspace. Registered models are identified by name and version. Each time you register a model with the same name as an existing one, the registry increments the version. Azure Machine Learning supports any model that can be loaded through Python 3.\n",
+ "\n",
+ "The code below:\n",
+ "\n",
+ "1. Saves the model to disk\n",
+ "1. Uploads the model file to the run \n",
+ "1. Registers the uploaded model file\n",
+ "1. Transitions the run to a completed state"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612881042710
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import joblib\n",
+ "from azureml.core.model import Model\n",
+ "\n",
+ "path = \"sklearn_mnist_model.pkl\"\n",
+ "joblib.dump(value=clf, filename=path)\n",
+ "\n",
+ "run.upload_file(name=path, path_or_stream=path)\n",
+ "\n",
+ "model = run.register_model(\n",
+ " model_name=\"sklearn_mnist_model\",\n",
+ " model_path=path,\n",
+ " description=\"Mnist handwriting recognition\",\n",
+ ")\n",
+ "\n",
+ "run.complete()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Deploy the model\n",
+ "\n",
+ "The next cell deploys the model to an Azure Container Instance so that you can score data in real-time (Azure Machine Learning also provides mechanisms to do batch scoring). A real-time endpoint allows application developers to integrate machine learning into their apps."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612881061728
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# create environment for the deploy\n",
+ "from azureml.core.environment import Environment\n",
+ "from azureml.core.conda_dependencies import CondaDependencies\n",
+ "\n",
+ "# to install required packages\n",
+ "env = Environment(\"quickstart-env\")\n",
+ "cd = CondaDependencies.create(\n",
+ " pip_packages=[\"azureml-dataset-runtime[pandas,fuse]\", \"azureml-defaults\"],\n",
+ " conda_packages=[\"scikit-learn==0.22.1\"],\n",
+ ")\n",
+ "\n",
+ "env.python.conda_dependencies = cd\n",
+ "\n",
+ "# Register environment to re-use later\n",
+ "env.register(workspace=ws)\n",
+ "\n",
+ "# create config file\n",
+ "from azureml.core.webservice import AciWebservice\n",
+ "\n",
+ "aciconfig = AciWebservice.deploy_configuration(\n",
+ " cpu_cores=1,\n",
+ " memory_gb=1,\n",
+ " tags={\"data\": \"MNIST\", \"method\": \"sklearn\"},\n",
+ " description=\"Predict MNIST with sklearn\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "> **Note: The deployment takes around 3 minutes to complete.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "import uuid\n",
+ "from azureml.core.webservice import Webservice\n",
+ "from azureml.core.model import InferenceConfig\n",
+ "from azureml.core.environment import Environment\n",
+ "from azureml.core import Workspace\n",
+ "from azureml.core.model import Model\n",
+ "\n",
+ "ws = Workspace.from_config()\n",
+ "model = Model(ws, \"sklearn_mnist_model\")\n",
+ "\n",
+ "\n",
+ "myenv = Environment.get(workspace=ws, name=\"quickstart-env\", version=\"1\")\n",
+ "inference_config = InferenceConfig(entry_script=\"score.py\", environment=myenv)\n",
+ "\n",
+ "service_name = \"sklearn-mnist-svc-\" + str(uuid.uuid4())[:4]\n",
+ "service = Model.deploy(\n",
+ " workspace=ws,\n",
+ " name=service_name,\n",
+ " models=[model],\n",
+ " inference_config=inference_config,\n",
+ " deployment_config=aciconfig,\n",
+ ")\n",
+ "\n",
+ "service.wait_for_deployment(show_output=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The [*scoring script*](score.py) file referenced in the code above can be found in the same folder as this notebook, and has two functions:\n",
+ "\n",
+ "1. an `init` function that executes once when the service starts - in this function you normally get the model from the registry and set global variables\n",
+ "1. a `run(data)` function that executes each time a call is made to the service. In this function, you normally format the input data, run a prediction, and output the predicted result."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Test the model service\n",
+ "\n",
+ "You can test the model by sending a raw HTTP request to test the web service. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612881527399
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# scoring web service HTTP endpoint\n",
+ "print(service.scoring_uri)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612881538381
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# send raw HTTP request to test the web service.\n",
+ "import requests\n",
+ "\n",
+ "# send a random row from the test set to score\n",
+ "random_index = np.random.randint(0, len(X_test) - 1)\n",
+ "input_data = '{\"data\": [' + str(list(X_test[random_index])) + \"]}\"\n",
+ "\n",
+ "headers = {\"Content-Type\": \"application/json\"}\n",
+ "\n",
+ "# for AKS deployment you'd need to the service key in the header as well\n",
+ "# api_key = service.get_key()\n",
+ "# headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}\n",
+ "\n",
+ "resp = requests.post(service.scoring_uri, input_data, headers=headers)\n",
+ "\n",
+ "print(\"POST to url\", service.scoring_uri)\n",
+ "# print(\"input data:\", input_data)\n",
+ "print(\"label:\", y_test[random_index])\n",
+ "print(\"prediction:\", resp.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "\n",
+ "### View the results of your training\n",
+ "\n",
+ "When you're finished with an experiment run, you can always return to view the results of your model training here in the Azure Machine Learning studio:\n",
+ "\n",
+ "1. Select **Experiments** (left-hand menu)\n",
+ "1. Select **azure-ml-in10-mins-tutorial**\n",
+ "1. Select **Run 1**\n",
+ "1. Select the **Metrics** Tab\n",
+ "\n",
+ "The metrics tab will display the parameter values that were logged to the run."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### View the model in the model registry\n",
+ "\n",
+ "You can see the stored model by navigating to **Models** in the left-hand menu bar. Select the **sklearn_mnist_model** to see the details of the model, including the experiment run ID that created the model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Clean up resources\n",
+ "\n",
+ "If you're not going to continue to use this model, delete the Model service using:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612881556520
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# if you want to keep workspace and only delete endpoint (it will incur cost while running)\n",
+ "service.delete()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you want to control cost further, stop the compute instance by selecting the \"Stop compute\" button next to the **Compute** dropdown. Then start the compute instance again the next time you need it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Next Steps\n",
+ "\n",
+ "In this quickstart, you learned how to run machine learning code in Azure Machine Learning.\n",
+ "\n",
+ "Now that you have working code in a development environment, learn how to submit a **_job_** - ideally on a schedule or trigger (for example, arrival of new data).\n",
+ "\n",
+ " [**Learn how to get started with Azure ML Job Submission**](GettingStartedWithPythonSDK.ipynb) "
+ ]
+ }
+ ],
+ "metadata": {
+ "authors": [
+ {
+ "name": "cewidste"
+ }
+ ],
+ "kernelspec": {
+ "display_name": "Python 3.6",
+ "language": "python36",
+ "name": "python36"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ },
+ "microsoft": {
+ "host": {
+ "AzureML": {
+ "notebookHasBeenCompleted": true
+ }
+ }
+ },
+ "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
+ "nteract": {
+ "version": "nteract-front-end@1.0.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
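The notebook above registers the model as "sklearn_mnist_model" and the environment as "quickstart-env" before deploying to ACI. As a minimal, hedged sketch (not part of the notebook; it assumes the workspace config.json is available and that the model was registered from the file sklearn_mnist_model.pkl), a later session could retrieve the registered model and score it locally like this:

# Hypothetical follow-up script; names match the registration in the notebook above.
import joblib
import numpy as np
from azureml.core import Workspace
from azureml.core.model import Model

ws = Workspace.from_config()              # assumes a config.json for the workspace is present
model = Model(ws, "sklearn_mnist_model")  # latest registered version

# download the registered .pkl next to this script and load it with joblib
model.download(target_dir=".", exist_ok=True)
clf = joblib.load("sklearn_mnist_model.pkl")

# score one dummy 28x28 image (all zeros, flattened) just to check the round trip
print(clf.predict(np.zeros((1, 784))))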
diff --git a/tutorials/quickstart-ci/AzureMLIn10mins.yml b/tutorials/quickstart-ci/AzureMLIn10mins.yml
new file mode 100644
index 000000000..144bb6e7a
--- /dev/null
+++ b/tutorials/quickstart-ci/AzureMLIn10mins.yml
@@ -0,0 +1,11 @@
+name: AzureMLIn10mins
+dependencies:
+- pip:
+ - azureml-sdk
+ - sklearn
+ - numpy
+ - matplotlib
+ - joblib
+ - uuid
+ - requests
+ - azureml-opendatasets
diff --git a/tutorials/quickstart-ci/ClassificationWithAutomatedML.ipynb b/tutorials/quickstart-ci/ClassificationWithAutomatedML.ipynb
new file mode 100644
index 000000000..1ca83f283
--- /dev/null
+++ b/tutorials/quickstart-ci/ClassificationWithAutomatedML.ipynb
@@ -0,0 +1,505 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "# Quickstart: Fraud Classification using Automated ML\n",
+ "\n",
+ "In this quickstart, you use automated machine learning in Azure Machine Learning service to train a classification model on an associated fraud credit card dataset. This process accepts training data and configuration settings, and automatically iterates through combinations of different feature normalization/standardization methods, models, and hyperparameter settings to arrive at the best model.\n",
+ "\n",
+ "You will learn how to:\n",
+ "\n",
+ "> * Download a dataset and look at the data\n",
+ "> * Train a machine learning classification model using autoML \n",
+ "> * Explore the results\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Connect to your workspace and create an experiment\n",
+ "\n",
+ "You start with importing some libraries and creating an experiment to track the runs in your workspace. A workspace can have multiple experiments, and all the users that have access to the workspace can collaborate on them. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612968646250
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import logging\n",
+ "\n",
+ "from matplotlib import pyplot as plt\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "import azureml.core\n",
+ "from azureml.core.experiment import Experiment\n",
+ "from azureml.core.workspace import Workspace\n",
+ "from azureml.core.dataset import Dataset\n",
+ "from azureml.train.automl import AutoMLConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612968706273
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ws = Workspace.from_config()\n",
+ "\n",
+ "# choose a name for your experiment\n",
+ "experiment_name = \"fraud-classification-automl-tutorial\"\n",
+ "\n",
+ "experiment = Experiment(ws, experiment_name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Load Data\n",
+ "\n",
+ "Load the credit card dataset from a csv file containing both training features and labels. The features are inputs to the model, while the training labels represent the expected output of the model. Next, we'll split the data using random_split and extract the training data for the model.\n",
+ "\n",
+ "\n",
+ "Follow this [how-to](https://aka.ms/azureml/howto/createdatasets) if you want to learn more about Datasets and how to use them.\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612968722555
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
+ "dataset = Dataset.Tabular.from_delimited_files(data)\n",
+ "training_data, validation_data = dataset.random_split(percentage=0.8, seed=223)\n",
+ "label_column_name = \"Class\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Train\n",
+ "\n",
+ "\n",
+ "\n",
+ "When you use automated machine learning in Azure ML, you input training data and configuration settings, and the process automatically iterates through combinations of different feature normalization/standardization methods, models, and hyperparameter settings to arrive at the best model. \n",
+ "Learn more about how you configure automated ML [here](https://docs.microsoft.com/azure/machine-learning/how-to-configure-auto-train).\n",
+ "\n",
+ "\n",
+ "Instantiate an [AutoMLConfig](https://docs.microsoft.com/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py) object. This defines the settings and data used to run the experiment.\n",
+ "\n",
+ "|Property|Description|\n",
+ "|-|-|\n",
+ "|**task**|classification or regression|\n",
+ "|**primary_metric**|This is the metric that you want to optimize. \n",
+ "|**enable_early_stopping** | Stop the run if the metric score is not showing improvement.|\n",
+ "|**n_cross_validations**|Number of cross validation splits.|\n",
+ "|**training_data**|Input dataset, containing both features and label column.|\n",
+ "|**label_column_name**|The name of the label column.|\n",
+ "\n",
+ "You can find more information about primary metrics [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612968806233
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "automl_settings = {\n",
+ " \"n_cross_validations\": 3,\n",
+ " \"primary_metric\": \"average_precision_score_weighted\",\n",
+ " \"experiment_timeout_hours\": 0.25, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ability to find the best model possible\n",
+ " \"verbosity\": logging.INFO,\n",
+ " \"enable_stack_ensemble\": False,\n",
+ "}\n",
+ "\n",
+ "automl_config = AutoMLConfig(\n",
+ " task=\"classification\",\n",
+ " debug_log=\"automl_errors.log\",\n",
+ " training_data=training_data,\n",
+ " label_column_name=label_column_name,\n",
+ " **automl_settings,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "Call the `submit` method on the experiment object and pass the run configuration. \n",
+ "\n",
+ "**Note: Depending on the data and the number of iterations an AutoML run can take a while to complete.**\n",
+ "\n",
+ "In this example, we specify `show_output = True` to print currently running iterations to the console. It is also possible to navigate to the experiment through the **Experiment** activity tab in the left menu, and monitor the run status from there."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612970125369
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "local_run = experiment.submit(automl_config, show_output=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612976292559
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "local_run"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Analyze results\n",
+ "\n",
+ "Below we select the best model from our iterations. The `get_output` method on `automl_classifier` returns the best run and the model for the run."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612976298373
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "best_run, best_model = local_run.get_output()\n",
+ "best_model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Tests\n",
+ "\n",
+ "Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612976320370
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# convert the test data to dataframe\n",
+ "X_test_df = validation_data.drop_columns(\n",
+ " columns=[label_column_name]\n",
+ ").to_pandas_dataframe()\n",
+ "y_test_df = validation_data.keep_columns(\n",
+ " columns=[label_column_name], validate=True\n",
+ ").to_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612976325829
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# call the predict functions on the model\n",
+ "y_pred = best_model.predict(X_test_df)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "\n",
+ "\n",
+ "### Calculate metrics for the prediction\n",
+ "\n",
+ "Now visualize the data to show what our truth (actual) values are compared to the predicted values \n",
+ "from the trained model that was returned.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612976330108
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import confusion_matrix\n",
+ "import numpy as np\n",
+ "import itertools\n",
+ "\n",
+ "cf = confusion_matrix(y_test_df.values, y_pred)\n",
+ "plt.imshow(cf, cmap=plt.cm.Blues, interpolation=\"nearest\")\n",
+ "plt.colorbar()\n",
+ "plt.title(\"Confusion Matrix\")\n",
+ "plt.xlabel(\"Predicted\")\n",
+ "plt.ylabel(\"Actual\")\n",
+ "class_labels = [\"False\", \"True\"]\n",
+ "tick_marks = np.arange(len(class_labels))\n",
+ "plt.xticks(tick_marks, class_labels)\n",
+ "plt.yticks([-0.5, 0, 1, 1.5], [\"\", \"False\", \"True\", \"\"])\n",
+ "# plotting text value inside cells\n",
+ "thresh = cf.max() / 2.0\n",
+ "for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):\n",
+ " plt.text(\n",
+ " j,\n",
+ " i,\n",
+ " format(cf[i, j], \"d\"),\n",
+ " horizontalalignment=\"center\",\n",
+ " color=\"white\" if cf[i, j] > thresh else \"black\",\n",
+ " )\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Control cost and further exploration\n",
+ "\n",
+ "If you want to control cost you can stop the compute instance this notebook is running on by clicking the \"Stop compute\" button next to the status dropdown in the menu above.\n",
+ "\n",
+ "\n",
+ "If you want to run more notebook samples, you can click on **Sample Notebooks** next to the **Files** view and explore the notebooks made available for you there."
+ ]
+ }
+ ],
+ "metadata": {
+ "authors": [
+ {
+ "name": "cewidste"
+ }
+ ],
+ "kernelspec": {
+ "display_name": "Python 3.6",
+ "language": "python",
+ "name": "python36"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ },
+ "microsoft": {
+ "host": {
+ "AzureML": {
+ "notebookHasBeenCompleted": true
+ }
+ }
+ },
+ "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
+ "nteract": {
+ "version": "nteract-front-end@1.0.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
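The confusion matrix above is the only evaluation in the notebook. As a hedged sketch of additional scikit-learn metrics (it assumes the y_test_df and y_pred variables from the "Tests" cells are still in scope; for an imbalanced fraud dataset, precision and recall are usually more informative than raw accuracy):

# Sketch only: extra metrics for the fraud predictions from the notebook above.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = y_test_df.values.ravel()  # assumes y_test_df from the notebook cells above
print("accuracy :", accuracy_score(y_true, y_pred))
print("precision:", precision_score(y_true, y_pred))
print("recall   :", recall_score(y_true, y_pred))
print("f1       :", f1_score(y_true, y_pred))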
diff --git a/tutorials/quickstart-ci/ClassificationWithAutomatedML.yml b/tutorials/quickstart-ci/ClassificationWithAutomatedML.yml
new file mode 100644
index 000000000..c4f874529
--- /dev/null
+++ b/tutorials/quickstart-ci/ClassificationWithAutomatedML.yml
@@ -0,0 +1,4 @@
+name: ClassificationWithAutomatedML
+dependencies:
+- pip:
+ - azureml-sdk
diff --git a/tutorials/quickstart-ci/GettingStartedWithPythonSDK.ipynb b/tutorials/quickstart-ci/GettingStartedWithPythonSDK.ipynb
new file mode 100644
index 000000000..2fcd6e9d8
--- /dev/null
+++ b/tutorials/quickstart-ci/GettingStartedWithPythonSDK.ipynb
@@ -0,0 +1,710 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "# Quickstart: Learn how to get started with Azure ML Job Submission\n",
+ "\n",
+ "In this quickstart, you train a machine learning model by submitting a Job to a compute target. \n",
+ "When training, it is common to start on your local computer, and then later scale out to a cloud-based cluster. \n",
+ "\n",
+ "All you need to do is define the environment for each compute target within a script run configuration. Then, when you want to run your training experiment on a different compute target, specify the run configuration for that compute.\n",
+ "\n",
+ "This quickstart trains a simple logistic regression using the [MNIST](https://azure.microsoft.com/services/open-datasets/catalog/mnist/) dataset and [scikit-learn](http://scikit-learn.org) with Azure Machine Learning. MNIST is a popular dataset consisting of 70,000 grayscale images. Each image is a handwritten digit of 28x28 pixels, representing a number from 0 to 9. The goal is to create a multi-class classifier to identify the digit a given image represents. \n",
+ "\n",
+ "You will learn how to:\n",
+ "\n",
+ "> * Download a dataset and look at the data\n",
+ "> * Train an image classification model by submitting a batch job to a compute resource\n",
+ "> * Review training results, find and register the best model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Connect to your workspace and create an experiment\n",
+ "\n",
+ "You start with importing some libraries and creating an experiment to track the runs in your workspace. A workspace can have multiple experiments, and all the users that have access to the workspace can collaborate on them. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965838618
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import azureml.core\n",
+ "from azureml.core import Workspace\n",
+ "from azureml.core import Experiment\n",
+ "\n",
+ "# connect to your workspace\n",
+ "ws = Workspace.from_config()\n",
+ "\n",
+ "experiment_name = \"get-started-with-jobsubmission-tutorial\"\n",
+ "exp = Experiment(workspace=ws, name=experiment_name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Import Data\n",
+ "\n",
+ "Before you train a model, you need to understand the data that you are using to train it. In this section you will:\n",
+ "\n",
+ "* Download the MNIST dataset\n",
+ "* Display some sample images\n",
+ "\n",
+ "### Download the MNIST dataset\n",
+ "\n",
+ "Use Azure Open Datasets to get the raw MNIST data files. [Azure Open Datasets](https://docs.microsoft.com/azure/open-datasets/overview-what-are-open-datasets) are curated public datasets that you can use to add scenario-specific features to machine learning solutions for more accurate models. Each dataset has a corresponding class, `MNIST` in this case, to retrieve the data in different ways.\n",
+ "\n",
+ "Follow this [how-to](https://aka.ms/azureml/howto/createdatasets) if you want to learn more about Datasets and how to use them.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965850391
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from azureml.core import Dataset\n",
+ "from azureml.opendatasets import MNIST\n",
+ "\n",
+ "data_folder = os.path.join(os.getcwd(), \"data\")\n",
+ "os.makedirs(data_folder, exist_ok=True)\n",
+ "\n",
+ "mnist_file_dataset = MNIST.get_file_dataset()\n",
+ "mnist_file_dataset.download(data_folder, overwrite=True)\n",
+ "\n",
+ "mnist_file_dataset = mnist_file_dataset.register(\n",
+ " workspace=ws,\n",
+ " name=\"mnist_opendataset\",\n",
+ " description=\"training and test dataset\",\n",
+ " create_new_version=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Take a look at the data\n",
+ "You will load the compressed files into `numpy` arrays. Then use `matplotlib` to plot 30 random images from the dataset with their labels above them. Note this step requires a `load_data` function that's included in an `utils.py` file. This file is placed in the same folder as this notebook. The `load_data` function simply parses the compressed files into numpy arrays. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965857960
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# make sure utils.py is in the same directory as this code\n",
+ "from utils import load_data\n",
+ "import glob\n",
+ "\n",
+ "\n",
+ "# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the model converge faster.\n",
+ "X_train = (\n",
+ " load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/train-images-idx3-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " False,\n",
+ " )\n",
+ " / 255.0\n",
+ ")\n",
+ "X_test = (\n",
+ " load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/t10k-images-idx3-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " False,\n",
+ " )\n",
+ " / 255.0\n",
+ ")\n",
+ "y_train = load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/train-labels-idx1-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " True,\n",
+ ").reshape(-1)\n",
+ "y_test = load_data(\n",
+ " glob.glob(\n",
+ " os.path.join(data_folder, \"**/t10k-labels-idx1-ubyte.gz\"), recursive=True\n",
+ " )[0],\n",
+ " True,\n",
+ ").reshape(-1)\n",
+ "\n",
+ "\n",
+ "# now let's show some randomly chosen images from the training set.\n",
+ "count = 0\n",
+ "sample_size = 30\n",
+ "plt.figure(figsize=(16, 6))\n",
+ "for i in np.random.permutation(X_train.shape[0])[:sample_size]:\n",
+ " count = count + 1\n",
+ " plt.subplot(1, sample_size, count)\n",
+ " plt.axhline(\"\")\n",
+ " plt.axvline(\"\")\n",
+ " plt.text(x=10, y=-10, s=y_train[i], fontsize=18)\n",
+ " plt.imshow(X_train[i].reshape(28, 28), cmap=plt.cm.Greys)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Submit your training job\n",
+ "\n",
+ "In this quickstart you submit a job to run on the local compute, but you can use the same code to submit this training job to other compute targets. With Azure Machine Learning, you can run your script on various compute targets without having to change your training script. \n",
+ "\n",
+ "To submit a job you need:\n",
+ "* A directory\n",
+ "* A training script\n",
+ "* Create a script run configuration\n",
+ "* Submit the job \n",
+ "\n",
+ "\n",
+ "### Directory and training script \n",
+ "\n",
+ "You need a directory to deliver the necessary code from your computer to the remote resource. A directory with a training script has been created for you and can be found in the same folder as this notebook.\n",
+ "\n",
+ "Take a few minutes to examine the training script."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965865707
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "with open(\"sklearn-mnist-batch/train.py\", \"r\") as f:\n",
+ " print(f.read())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "Notice how the script gets data and saves models:\n",
+ "\n",
+ "+ The training script reads an argument to find the directory containing the data. When you submit the job later, you point to the dataset for this argument:\n",
+ "`parser.add_argument('--data-folder', type=str, dest='data_folder', help='data directory mounting point')`\n",
+ "\n",
+ "\n",
+ "+ The training script saves your model into a directory named outputs.
\n",
+ "`joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')`
\n",
+ "Anything written in this directory is automatically uploaded into your workspace. You'll access your model from this directory later in the tutorial.\n",
+ "\n",
+ "The file `utils.py` is referenced from the training script to load the dataset correctly. This script is also copied into the script folder so that it can be accessed along with the training script."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Configure the training job\n",
+ "\n",
+ "Create a [ScriptRunConfig]() object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on. Configure the ScriptRunConfig by specifying:\n",
+ "\n",
+ "* The directory that contains your scripts. All the files in this directory are uploaded into the cluster nodes for execution. \n",
+ "* The compute target. In this case you will point to local compute\n",
+ "* The training script name, train.py\n",
+ "* An environment that contains the libraries needed to run the script\n",
+ "* Arguments required from the training script. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "An Environment defines Python packages, environment variables, and Docker settings that are used in machine learning experiments. Here you will be using a curated environment that has already been made available through the workspace. \n",
+ "\n",
+ "Read [this article](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments) if you want to learn more about Environments and how to use them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965877458
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from azureml.core.environment import Environment\n",
+ "from azureml.core.conda_dependencies import CondaDependencies\n",
+ "\n",
+ "# use a curated environment that has already been built for you\n",
+ "\n",
+ "env = Environment.get(workspace=ws, name=\"AzureML-Scikit-learn-0.20.3\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "Create a [ScriptRunConfig](https://docs.microsoft.com/python/api/azureml-core/azureml.core.scriptrunconfig?preserve-view=true&view=azure-ml-py) object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on. A script run configuration is used to configure the information necessary for submitting a training run as part of an experiment. \n",
+ "\n",
+ "Read more about configuring and submitting training runs [here](https://docs.microsoft.com/azure/machine-learning/how-to-set-up-training-targets). "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965882781
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from azureml.core import ScriptRunConfig\n",
+ "\n",
+ "args = [\"--data-folder\", mnist_file_dataset.as_mount(), \"--regularization\", 0.5]\n",
+ "\n",
+ "script_folder = \"sklearn-mnist-batch\"\n",
+ "src = ScriptRunConfig(\n",
+ " source_directory=script_folder,\n",
+ " script=\"train.py\",\n",
+ " arguments=args,\n",
+ " compute_target=\"local\",\n",
+ " environment=env,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Submit the job\n",
+ "\n",
+ "Run the experiment by submitting the ScriptRunConfig object. After this there are many options for monitoring your run. You can either navigate to the experiment \"get-started-with-jobsubmission-tutorial\" in the left menu item Experiments to monitor the run (quick link to the run details page in the cell output below), or you can monitor the run inline in this notebook by using the Jupyter widget activated below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612965911435
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "run = exp.submit(config=src)\n",
+ "run"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Jupyter widget\n",
+ "\n",
+ "Watch the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612966026710
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from azureml.widgets import RunDetails\n",
+ "\n",
+ "RunDetails(run).show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "if you want to cancel a run, you can follow [these instructions](https://aka.ms/aml-docs-cancel-run)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Get log results upon completion\n",
+ "\n",
+ "Model training happens in the background. You can use `wait_for_completion` to block and wait until the model has completed training before running more code. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612966045110
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# specify show_output to True for a verbose log\n",
+ "run.wait_for_completion(show_output=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "### Display run results\n",
+ "\n",
+ "You now have a trained model. Retrieve all the metrics logged during the run, including the accuracy of the model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612966059052
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "print(run.get_metrics())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Register model\n",
+ "\n",
+ "The last step in the training script wrote the file `outputs/sklearn_mnist_model.pkl` in a directory named `outputs` on the compute where the job is executed. `outputs` is a special directory in that all content in this directory is automatically uploaded to your workspace. This content appears in the run record in the experiment under your workspace. Hence, the model file is now also available in your workspace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612966064041
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "print(run.get_file_names())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "Register the model in the workspace so that you (or your team members with access to the workspace) can later query, examine, and deploy this model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1612966068862
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# register model\n",
+ "model = run.register_model(\n",
+ " model_name=\"sklearn_mnist\", model_path=\"outputs/sklearn_mnist_model.pkl\"\n",
+ ")\n",
+ "print(model.name, model.id, model.version, sep=\"\\t\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ },
+ "source": [
+ "## Control Cost\n",
+ "\n",
+ "If you want to control cost you can stop the compute instance this notebook is running on by clicking the \"Stop compute\" button next to the status dropdown in the menu above.\n",
+ "\n",
+ " ## Next Steps\n",
+ "\n",
+ "In this quickstart, you have seen how to run jobs-based machine learning code in Azure Machine Learning. \n",
+ "\n",
+ "It is also possible to use automated machine learning in Azure Machine Learning service to find the best model in an automated fashion. To see how this works, we recommend that you follow the next quickstart in this series, [**Fraud Classification using Automated ML**](ClassificationWithAutomatedML.ipynb). This quickstart is focused on AutoML using the Python SDK."
+ ]
+ }
+ ],
+ "metadata": {
+ "authors": [
+ {
+ "name": "cewidste"
+ }
+ ],
+ "kernelspec": {
+ "display_name": "Python 3.6",
+ "language": "python",
+ "name": "python36"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ },
+ "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
+ "nteract": {
+ "version": "nteract-front-end@1.0.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
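The notebook above runs train.py against the local compute target and notes that the same code scales to other targets. As a hedged sketch of that scale-out path (the cluster name "cpu-cluster" and the VM size are illustrative assumptions; ws, env, args and script_folder are taken from the notebook):

# Sketch only: submit the same training job to a remote AmlCompute cluster.
from azureml.core import Experiment, ScriptRunConfig
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpu-cluster"  # placeholder name, not defined in the notebook
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # create the cluster if it does not exist yet
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_DS3_V2", max_nodes=2
    )
    compute_target = ComputeTarget.create(ws, cluster_name, provisioning_config)
    compute_target.wait_for_completion(show_output=True)

remote_src = ScriptRunConfig(
    source_directory=script_folder,
    script="train.py",
    arguments=args,
    compute_target=compute_target,
    environment=env,
)
remote_run = Experiment(ws, "get-started-with-jobsubmission-tutorial").submit(remote_src)
remote_run.wait_for_completion(show_output=True)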
diff --git a/tutorials/quickstart-ci/GettingStartedWithPythonSDK.yml b/tutorials/quickstart-ci/GettingStartedWithPythonSDK.yml
new file mode 100644
index 000000000..a0aa8c05d
--- /dev/null
+++ b/tutorials/quickstart-ci/GettingStartedWithPythonSDK.yml
@@ -0,0 +1,11 @@
+name: GettingStartedWithPythonSDK
+dependencies:
+- pip:
+ - azureml-sdk
+ - sklearn
+ - numpy
+ - matplotlib
+ - joblib
+ - uuid
+ - requests
+ - azureml-opendatasets
diff --git a/tutorials/quickstart-ci/score.py b/tutorials/quickstart-ci/score.py
new file mode 100644
index 000000000..d4c6fcd1b
--- /dev/null
+++ b/tutorials/quickstart-ci/score.py
@@ -0,0 +1,21 @@
+import json
+import numpy as np
+import os
+import joblib
+
+
+def init():
+ global model
+ # AZUREML_MODEL_DIR is an environment variable created during deployment.
+ # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
+ # For multiple models, it points to the folder containing all deployed models (./azureml-models)
+ model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), "sklearn_mnist_model.pkl")
+ model = joblib.load(model_path)
+
+
+def run(raw_data):
+ data = np.array(json.loads(raw_data)["data"])
+ # make prediction
+ y_hat = model.predict(data)
+ # you can return any data type as long as it is JSON-serializable
+ return y_hat.tolist()
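Before deploying, score.py can be smoke-tested outside Azure ML. A minimal sketch, assuming the registered sklearn_mnist_model.pkl has been copied into a local model/ folder (AZUREML_MODEL_DIR is normally set by the deployment, so it is faked here), and using an all-zeros payload purely as a placeholder:

# Hedged local smoke test for the scoring script above; paths and payload are assumptions.
import json
import os

os.environ["AZUREML_MODEL_DIR"] = os.path.abspath("model")  # normally set during deployment

import score  # the scoring script shown above

score.init()
payload = json.dumps({"data": [[0.0] * 784]})  # one flattened 28x28 image of zeros
print(score.run(payload))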
diff --git a/tutorials/quickstart-ci/sklearn-mnist-batch/train.py b/tutorials/quickstart-ci/sklearn-mnist-batch/train.py
new file mode 100644
index 000000000..328c19d31
--- /dev/null
+++ b/tutorials/quickstart-ci/sklearn-mnist-batch/train.py
@@ -0,0 +1,82 @@
+import argparse
+import os
+import numpy as np
+import glob
+
+from sklearn.linear_model import LogisticRegression
+import joblib
+
+from azureml.core import Run
+from utils import load_data
+
+# let user feed in 2 parameters, the dataset to mount or download,
+# and the regularization rate of the logistic regression model
+parser = argparse.ArgumentParser()
+parser.add_argument(
+ "--data-folder", type=str, dest="data_folder", help="data folder mounting point"
+)
+parser.add_argument(
+ "--regularization", type=float, dest="reg", default=0.01, help="regularization rate"
+)
+args = parser.parse_args()
+
+data_folder = args.data_folder
+print("Data folder:", data_folder)
+
+# load train and test set into numpy arrays
+# note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster.
+X_train = (
+ load_data(
+ glob.glob(
+ os.path.join(data_folder, "**/train-images-idx3-ubyte.gz"), recursive=True
+ )[0],
+ False,
+ ) /
+ 255.0
+)
+X_test = (
+ load_data(
+ glob.glob(
+ os.path.join(data_folder, "**/t10k-images-idx3-ubyte.gz"), recursive=True
+ )[0],
+ False,
+ ) /
+ 255.0
+)
+y_train = load_data(
+ glob.glob(
+ os.path.join(data_folder, "**/train-labels-idx1-ubyte.gz"), recursive=True
+ )[0],
+ True,
+).reshape(-1)
+y_test = load_data(
+ glob.glob(
+ os.path.join(data_folder, "**/t10k-labels-idx1-ubyte.gz"), recursive=True
+ )[0],
+ True,
+).reshape(-1)
+
+print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")
+
+# get hold of the current run
+run = Run.get_context()
+
+print("Train a logistic regression model with regularization rate of", args.reg)
+clf = LogisticRegression(
+ C=1.0 / args.reg, solver="liblinear", multi_class="auto", random_state=42
+)
+clf.fit(X_train, y_train)
+
+print("Predict the test set")
+y_hat = clf.predict(X_test)
+
+# calculate accuracy on the prediction
+acc = np.average(y_hat == y_test)
+print("Accuracy is", acc)
+
+run.log("regularization rate", np.float(args.reg))
+run.log("accuracy", np.float(acc))
+
+os.makedirs("outputs", exist_ok=True)
+# note file saved in the outputs folder is automatically uploaded into experiment record
+joblib.dump(value=clf, filename="outputs/sklearn_mnist_model.pkl")
diff --git a/tutorials/quickstart-ci/sklearn-mnist-batch/utils.py b/tutorials/quickstart-ci/sklearn-mnist-batch/utils.py
new file mode 100644
index 000000000..ba3a7ce33
--- /dev/null
+++ b/tutorials/quickstart-ci/sklearn-mnist-batch/utils.py
@@ -0,0 +1,24 @@
+import gzip
+import numpy as np
+import struct
+
+
+# load compressed MNIST gz files and return numpy arrays
+def load_data(filename, label=False):
+ with gzip.open(filename) as gz:
+ struct.unpack("I", gz.read(4))
+ n_items = struct.unpack(">I", gz.read(4))
+ if not label:
+ n_rows = struct.unpack(">I", gz.read(4))[0]
+ n_cols = struct.unpack(">I", gz.read(4))[0]
+ res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
+ res = res.reshape(n_items[0], n_rows * n_cols)
+ else:
+ res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
+ res = res.reshape(n_items[0], 1)
+ return res
+
+
+# one-hot encode a 1-D array
+def one_hot_encode(array, num_of_classes):
+ return np.eye(num_of_classes)[array.reshape(-1)]
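For reference, a small usage sketch of these helpers (the file paths are illustrative and assume the MNIST .gz files downloaded by the tutorials sit in a local data/ folder; one_hot_encode is not used by the tutorials but is provided for frameworks that expect one-hot targets):

# Sketch only: typical use of load_data and one_hot_encode from utils.py.
from utils import load_data, one_hot_encode

X = load_data("data/train-images-idx3-ubyte.gz", label=False) / 255.0    # shape (60000, 784), scaled to 0-1
y = load_data("data/train-labels-idx1-ubyte.gz", label=True).reshape(-1)  # shape (60000,)

y_onehot = one_hot_encode(y, num_of_classes=10)  # shape (60000, 10)
print(X.shape, y.shape, y_onehot.shape)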
diff --git a/tutorials/quickstart-ci/utils.py b/tutorials/quickstart-ci/utils.py
new file mode 100644
index 000000000..ba3a7ce33
--- /dev/null
+++ b/tutorials/quickstart-ci/utils.py
@@ -0,0 +1,24 @@
+import gzip
+import numpy as np
+import struct
+
+
+# load compressed MNIST gz files and return numpy arrays
+def load_data(filename, label=False):
+ with gzip.open(filename) as gz:
+ struct.unpack("I", gz.read(4))
+ n_items = struct.unpack(">I", gz.read(4))
+ if not label:
+ n_rows = struct.unpack(">I", gz.read(4))[0]
+ n_cols = struct.unpack(">I", gz.read(4))[0]
+ res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
+ res = res.reshape(n_items[0], n_rows * n_cols)
+ else:
+ res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
+ res = res.reshape(n_items[0], 1)
+ return res
+
+
+# one-hot encode a 1-D array
+def one_hot_encode(array, num_of_classes):
+ return np.eye(num_of_classes)[array.reshape(-1)]