
Commit 8ee7008

Add CMLE multi-gpu sample
1 parent c4b38a2 commit 8ee7008

7 files changed: +389 −0 lines changed
Experimental/distribution/multi-gpu/cmle/config.yaml

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
trainingInput:
  scaleTier: CUSTOM
  masterType: complex_model_l_gpu
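
This config selects the CUSTOM scale tier with a complex_model_l_gpu master, i.e. a single host with eight NVIDIA K80 GPUs, which is what the notebook's --num_gpus 8 flag below assumes. For orientation, here is a minimal sketch of submitting the same trainingInput through the CMLE REST API with the Google API client; the job ID, bucket, package URI, and region are placeholders, not values from the sample:

from googleapiclient import discovery

# Hypothetical job spec; only scaleTier and masterType come from config.yaml.
job = {
    'jobId': 'sample_model_manual',
    'trainingInput': {
        'scaleTier': 'CUSTOM',
        'masterType': 'complex_model_l_gpu',  # single host, 8x K80
        'packageUris': ['gs://YOUR-BUCKET-ID/staging/trainer-0.1.tar.gz'],
        'pythonModule': 'trainer.task',
        'region': 'us-central1',
        'runtimeVersion': '1.11',
    },
}
ml = discovery.build('ml', 'v1')
ml.projects().jobs().create(parent='projects/YOUR-PROJECT-ID', body=job).execute()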

Experimental/distribution/multi-gpu/cmle/project/__init__.py

Whitespace-only changes.
Experimental/distribution/multi-gpu/cmle/project/setup.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = []

setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Generic example trainer package.',
)
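
This setup.py is what lets gcloud build and stage the trainer package before the job starts. As a quick local check that the package builds, a sketch assuming it is run from the project/ directory:

import subprocess

# Build the source distribution; produces dist/trainer-0.1.tar.gz,
# which CMLE installs on each training node.
subprocess.check_call(['python', 'setup.py', 'sdist'])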
Experimental/distribution/multi-gpu/cmle/project/trainer/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
Experimental/distribution/multi-gpu/cmle/project/trainer/sample_model.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow as tf


def _conv(x, kernel, name, log=False):
    """Conv -> ReLU -> 2x2 max-pool block."""
    with tf.variable_scope(name):
        W = tf.get_variable(initializer=tf.truncated_normal(shape=kernel, stddev=0.01), name='W')
        b = tf.get_variable(initializer=tf.constant(0.0, shape=[kernel[3]]), name='b')
        conv = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
        activation = tf.nn.relu(tf.add(conv, b))
        pool = tf.nn.max_pool(activation, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        if log:
            tf.summary.histogram("weights", W)
            tf.summary.histogram("biases", b)
            tf.summary.histogram("activations", activation)
        return pool


def _dense(x, size_in, size_out, name, relu=False, log=False):
    """Fully connected layer; flattens its input first."""
    with tf.variable_scope(name):
        flat = tf.reshape(x, [-1, size_in])
        W = tf.get_variable(initializer=tf.truncated_normal([size_in, size_out], stddev=0.1), name='W')
        b = tf.get_variable(initializer=tf.constant(0.0, shape=[size_out]), name='b')
        activation = tf.add(tf.matmul(flat, W), b)
        if relu:
            activation = tf.nn.relu(activation)
        if log:
            tf.summary.histogram("weights", W)
            tf.summary.histogram("biases", b)
            tf.summary.histogram("activations", activation)
        return activation


def _model(features, mode, params):
    # Four conv/pool blocks reduce 32x32x3 inputs to 2x2x512.
    input_layer = tf.reshape(features, [-1, 32, 32, 3])
    conv1 = _conv(input_layer, kernel=[5, 5, 3, 128], name='conv1', log=params['log'])
    conv2 = _conv(conv1, kernel=[5, 5, 128, 128], name='conv2', log=params['log'])
    conv3 = _conv(conv2, kernel=[3, 3, 128, 256], name='conv3', log=params['log'])
    conv4 = _conv(conv3, kernel=[3, 3, 256, 512], name='conv4', log=params['log'])
    dense = _dense(conv4, size_in=2*2*512, size_out=params['dense_units'],
                   name='Dense', relu=True, log=params['log'])

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Note: in TF 1.x the second argument of tf.nn.dropout is keep_prob,
        # so params['drop_out'] is the probability of *keeping* a unit.
        dense = tf.nn.dropout(dense, params['drop_out'])

    logits = _dense(dense, size_in=params['dense_units'],
                    size_out=10, name='Output', relu=False, log=params['log'])
    return logits


def model_fn(features, labels, mode, params):
    logits = _model(features, mode, params)
    predictions = {"logits": logits,
                   "classes": tf.argmax(input=logits, axis=1),
                   "probabilities": tf.nn.softmax(logits, name='softmax')}
    export_outputs = {'predictions': tf.estimator.export.PredictOutput(predictions)}

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        learning_rate = tf.train.exponential_decay(params['learning_rate'],
                                                   tf.train.get_global_step(),
                                                   decay_steps=100000,
                                                   decay_rate=0.96)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        tf.summary.scalar('learning_rate', learning_rate)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(
            labels=labels, predictions=tf.argmax(logits, axis=1))
        metrics = {'accuracy': accuracy}
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions=predictions, export_outputs=export_outputs)
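
model_fn branches on the mode key: TRAIN builds an SGD train_op with an exponentially decaying learning rate, EVAL adds an accuracy metric, and PREDICT wires the predictions dict into an export signature. A minimal local smoke test of this model_fn, assuming the trainer package is importable and using made-up shapes and parameter values (not part of the sample):

import tensorflow as tf
import trainer.sample_model as sm

# Illustrative hyperparameters; 'drop_out' is a keep probability.
params = {'drop_out': 0.5, 'dense_units': 1024, 'learning_rate': 1e-3, 'log': False}
estimator = tf.estimator.Estimator(
    model_fn=sm.model_fn, model_dir='/tmp/sample_model_test', params=params)

def tiny_input_fn():
    # One batch of random 32x32x3 "images" with integer labels in [0, 10),
    # just enough to exercise the TRAIN branch of the graph.
    images = tf.random_uniform([8, 32, 32, 3])
    labels = tf.random_uniform([8], maxval=10, dtype=tf.int64)
    return tf.data.Dataset.from_tensors((images, labels)).repeat()

estimator.train(input_fn=tiny_input_fn, steps=5)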
Experimental/distribution/multi-gpu/cmle/project/trainer/task.py

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tensorflow as tf

import trainer.sample_model as sm

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer(
    'max_steps', 1000, 'max steps for training.')
tf.app.flags.DEFINE_string(
    'output_dir', '', 'GCS location of the root directory for checkpoints and exported models.')
tf.app.flags.DEFINE_string(
    'model_name', 'sample_model', 'model name.')
tf.app.flags.DEFINE_integer(
    'train_batch_size', 200, 'batch size for training.')
tf.app.flags.DEFINE_integer(
    'eval_batch_size', 200, 'batch size for evaluation.')
tf.app.flags.DEFINE_integer(
    'eval_steps', 50, 'number of steps used in the evaluation phase.')
tf.app.flags.DEFINE_integer(
    'tf_random_seed', 19851211, 'random seed for TensorFlow.')
tf.app.flags.DEFINE_integer(
    'save_checkpoints_steps', 500, 'steps between checkpoint saves.')
tf.app.flags.DEFINE_string(
    'train_data_pattern', 'cifar-10/train*.tfrecord', 'path to the training dataset on GCS.')
tf.app.flags.DEFINE_string(
    'eval_data_pattern', 'cifar-10/valid*.tfrecord', 'path to the eval dataset on GCS.')
tf.app.flags.DEFINE_float(
    'learning_rate', 1e-3, 'learning rate.')
tf.app.flags.DEFINE_integer(
    'num_gpus', 1, 'number of GPUs in the single-node multi-GPU setting.')
tf.app.flags.DEFINE_integer(
    'num_gpus_per_worker', 0, 'number of GPUs for each node.')
tf.app.flags.DEFINE_bool(
    'auto_shard_dataset', False,
    'whether to auto-shard the dataset when there are multiple workers.')
tf.app.flags.DEFINE_float(
    'drop_out_rate', 1e-2, 'keep probability passed to tf.nn.dropout.')
tf.app.flags.DEFINE_integer(
    'dense_units', 1024, 'units in the dense layer.')

tf.logging.set_verbosity(tf.logging.INFO)


def parse_tfrecord(example):
    """Decodes one serialized tf.train.Example into (image, label)."""
    feature = {'label': tf.FixedLenFeature((), tf.int64),
               'image': tf.FixedLenFeature((), tf.string, default_value="")}
    parsed = tf.parse_single_example(example, feature)
    # Images are stored as raw float64 bytes; decode, cast and reshape.
    image = tf.decode_raw(parsed['image'], tf.float64)
    image = tf.cast(image, tf.float32)
    image = tf.reshape(image, [32, 32, 3])
    return image, parsed['label']


def image_scaling(x):
    return tf.image.per_image_standardization(x)


def distort(x):
    # Pad-and-crop plus random horizontal flip, applied only in training.
    x = tf.image.resize_image_with_crop_or_pad(x, 40, 40)
    x = tf.random_crop(x, [32, 32, 3])
    x = tf.image.random_flip_left_right(x)
    return x


def dataset_input_fn(params):
    dataset = tf.data.TFRecordDataset(params['filenames'],
                                      num_parallel_reads=params['threads'])
    dataset = dataset.map(parse_tfrecord, num_parallel_calls=params['threads'])
    dataset = dataset.map(
        lambda x, y: (image_scaling(x), y), num_parallel_calls=params['threads'])
    if params['mode'] == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.map(
            lambda x, y: (distort(x), y), num_parallel_calls=params['threads'])
        dataset = dataset.shuffle(buffer_size=params['shuffle_buff'])
    dataset = dataset.repeat()
    dataset = dataset.batch(params['batch'])
    dataset = dataset.prefetch(8 * params['batch'])  # prefetch is in units of batches here
    return dataset


def train_dataset_input_fn(pattern):
    files = tf.gfile.Glob(pattern)
    params = {'filenames': files, 'mode': tf.estimator.ModeKeys.TRAIN,
              'threads': 16, 'shuffle_buff': 100000, 'batch': FLAGS.train_batch_size}
    return dataset_input_fn(params)


def eval_dataset_input_fn(pattern):
    files = tf.gfile.Glob(pattern)
    params = {'filenames': files, 'mode': tf.estimator.ModeKeys.EVAL,
              'threads': 16, 'batch': FLAGS.eval_batch_size}
    return dataset_input_fn(params)


def serving_input_fn():
    receiver_tensor = {'images': tf.placeholder(shape=[None, 32, 32, 3], dtype=tf.float32)}
    # Apply the same per-image standardization used at training time.
    features = tf.map_fn(image_scaling, receiver_tensor['images'])
    return tf.estimator.export.TensorServingInputReceiver(features, receiver_tensor)


def train_and_evaluate():
    model_dir = os.path.join(FLAGS.output_dir, FLAGS.model_name)

    # MirroredStrategy: multi-worker if num_gpus_per_worker is set,
    # otherwise single-node multi-GPU, otherwise no distribution.
    if FLAGS.num_gpus_per_worker > 0:
        distribution = tf.contrib.distribute.MirroredStrategy(
            num_gpus_per_worker=FLAGS.num_gpus_per_worker,
            auto_shard_dataset=FLAGS.auto_shard_dataset)
    elif FLAGS.num_gpus > 0:
        distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=FLAGS.num_gpus)
    else:
        distribution = None

    # Configuration for the Estimator; the flag holds a step count,
    # so it is passed as save_checkpoints_steps.
    config = tf.estimator.RunConfig(
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        session_config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),
        train_distribute=distribution,
        tf_random_seed=FLAGS.tf_random_seed)

    model_params = {
        'drop_out': FLAGS.drop_out_rate,
        'dense_units': FLAGS.dense_units,
        'learning_rate': FLAGS.learning_rate,
        'log': True}

    # Create the Estimator.
    estimator = tf.estimator.Estimator(
        model_fn=sm.model_fn,
        model_dir=model_dir,
        params=model_params,
        config=config)

    # Specify training data paths, batch size and max steps.
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: train_dataset_input_fn(FLAGS.train_data_pattern),
        max_steps=FLAGS.max_steps)

    # Configuration for model export.
    exporter = tf.estimator.LatestExporter(
        name='export',
        serving_input_receiver_fn=serving_input_fn,
        assets_extra=None, as_text=False, exports_to_keep=5)

    # Specify validation data paths, evaluation steps and the exporter.
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: eval_dataset_input_fn(FLAGS.eval_data_pattern),
        steps=FLAGS.eval_steps, exporters=exporter)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


def main(unused_argv=None):
    tf.logging.info(tf.__version__)
    train_and_evaluate()


if __name__ == '__main__':
    tf.app.run()
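
parse_tfrecord expects each record's 'image' feature to hold the raw bytes of a float64 [32, 32, 3] array next to an int64 'label'. The sample does not include a converter, but a matching writer might look like this sketch (file name and data are illustrative):

import numpy as np
import tensorflow as tf

def make_example(image, label):
    # Store the image exactly as parse_tfrecord reads it back:
    # raw float64 bytes decoded with tf.decode_raw(..., tf.float64).
    feature = {
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[image.astype(np.float64).tobytes()])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label)])),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

with tf.python_io.TFRecordWriter('train_000.tfrecord') as writer:
    image = np.random.rand(32, 32, 3)  # stand-in for one CIFAR-10 image
    writer.write(make_example(image, 7).SerializeToString())

On the export side, LatestExporter writes timestamped SavedModel directories under <output_dir>/<model_name>/export/export. A hedged sketch of loading one of those exports and running a prediction locally (the path is a placeholder for an actual timestamped directory):

import numpy as np
import tensorflow as tf

export_dir = '/tmp/sample_model/export/export/1545390000'  # placeholder
predict_fn = tf.contrib.predictor.from_saved_model(export_dir)
# The serving signature takes float32 images under the 'images' key and
# returns the logits/classes/probabilities defined in model_fn.
batch = {'images': np.random.rand(1, 32, 32, 3).astype(np.float32)}
print(predict_fn(batch)['probabilities'])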
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Single host, multiple GPUs (K80 * 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "jobId: sample_model_20181221_112757\n",
      "state: QUEUED\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Job [sample_model_20181221_112757] submitted successfully.\n",
      "Your job is still active. You may view the status of your job with the command\n",
      "\n",
      "  $ gcloud ml-engine jobs describe sample_model_20181221_112757\n",
      "\n",
      "or continue streaming the logs with the command\n",
      "\n",
      "  $ gcloud ml-engine jobs stream-logs sample_model_20181221_112757\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "PROJECT_ID=\"YOUR-PROJECT-ID\"\n",
    "BUCKET_ID=\"YOUR-BUCKET-ID\"\n",
    "REGION=\"YOUR-REGION\"\n",
    "# https://cloud.google.com/ml-engine/docs/tensorflow/regions\n",
    "\n",
    "TRAINER_PACKAGE_PATH=$(pwd)/project/trainer\n",
    "now=$(date +\"%Y%m%d_%H%M%S\")\n",
    "JOB_NAME=\"sample_model_$now\"\n",
    "MAIN_TRAINER_MODULE=trainer.task\n",
    "PACKAGE_STAGING_PATH=gs://$BUCKET_ID/staging\n",
    "JOB_DIR=gs://$BUCKET_ID/sample_model_job_dir\n",
    "RUNTIME_VERSION=\"1.11\"\n",
    "# https://cloud.google.com/ml-engine/docs/tensorflow/runtime-version-list\n",
    "\n",
    "gcloud ml-engine jobs submit training $JOB_NAME \\\n",
    "  --package-path $TRAINER_PACKAGE_PATH \\\n",
    "  --module-name $MAIN_TRAINER_MODULE \\\n",
    "  --job-dir $JOB_DIR \\\n",
    "  --project $PROJECT_ID \\\n",
    "  --region $REGION \\\n",
    "  --runtime-version $RUNTIME_VERSION \\\n",
    "  --config config.yaml \\\n",
    "  -- \\\n",
    "  --train_data_pattern \"gs://$BUCKET_ID/data/cifar10_data_00*\" \\\n",
    "  --eval_data_pattern \"gs://$BUCKET_ID/data/cifar10_data_01*\" \\\n",
    "  --max_steps 10000 \\\n",
    "  --num_gpus 8 \\\n",
    "  --output_dir \"gs://$BUCKET_ID/model\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
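
The notebook's stderr output shows the gcloud commands for describing the job and streaming its logs. The same state check can be scripted; a small sketch using the Google API client, with the project ID placeholder and the job ID from the output above:

from googleapiclient import discovery

ml = discovery.build('ml', 'v1')
name = 'projects/{}/jobs/{}'.format('YOUR-PROJECT-ID', 'sample_model_20181221_112757')
job = ml.projects().jobs().get(name=name).execute()
print(job['state'])  # e.g. QUEUED, PREPARING, RUNNING, SUCCEEDED, FAILED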
