Commit 17da6f8

Author: Juv Chan
Commit message: Initial commit
1 parent: 9b88c4e

File tree

8 files changed: +2059 −2 lines


README.md

Lines changed: 21 additions & 2 deletions

@@ -1,2 +1,21 @@
-# amazon-sagemaker-tensorflow-custom-containers
-This project shows step-by-step guide on how to build a real-world flower classifier of 102 flower types using TensorFlow, Amazon SageMaker, Docker and Python in a Jupyter Notebook.
+# **Build, Train and Deploy A Real-World Flower Classifier of 102 Flower Types**
+## *With TensorFlow 2.3, Amazon SageMaker Python SDK 2.5.x and Custom SageMaker Training & Serving Docker Containers*
+
+## **Introduction**
+This project is a step-by-step guide to building a **real-world flower classifier** of **102 flower types** using **TensorFlow**, **Amazon SageMaker**, **Docker** and **Python** in a **Jupyter Notebook**. It has been tested with the Python packages in **requirements.txt** on **Python 3.8.5**.
+
+## **Installation**
+Clone this project from GitHub.
+Create a new Python virtual environment targeting Python 3.6 or above.
+
+Install the required Python packages from **requirements.txt**, or install them by running the project's Jupyter notebook.
+Start and run the notebook with **Jupyter Lab**.
+
+Note that the external flower images used in the notebook are not provided as part of the project.
+You may use any freely available flower images, at your own discretion, to evaluate the flower classification model that you build and deploy.
+
+## **Contributing**
+Pull requests, suggestions and feedback are welcome. For major changes or issues, please open an issue to discuss them first.
+
+## **License**
+[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
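
The README's build–train–deploy flow corresponds to a short sequence of SageMaker Python SDK 2.x calls. The sketch below is illustrative only and is not taken from the project's notebook: the image URI, IAM role and instance types are placeholders, and because train.py downloads its own dataset via tensorflow_datasets, fit() needs no input channels.

# Illustrative sketch (SageMaker Python SDK 2.x); image_uri, role and
# instance types below are placeholders, not values from this project.
from sagemaker.estimator import Estimator

estimator = Estimator(
    image_uri="123456789012.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tf-flowers:latest",
    role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    hyperparameters={"epochs": 5, "batch_size": 32, "learning_rate": 0.001},
)

# train.py fetches oxford_flowers102 itself, so no input channels are passed
estimator.fit()

# host the trained model behind a real-time endpoint served by this
# project's nginx + TensorFlow Serving container
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.m5.large")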

container/Dockerfile

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
# Copyright 2020 Juv Chan. All Rights Reserved.
FROM tensorflow/tensorflow:2.3.0-gpu

LABEL maintainer="Juv Chan <[email protected]>"

RUN apt-get update && apt-get install -y --no-install-recommends nginx curl
RUN pip install --no-cache-dir --upgrade pip tensorflow-hub tensorflow-datasets sagemaker-tensorflow-training

RUN echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list
RUN curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
# -y keeps the install non-interactive so the docker build does not stall
RUN apt-get update && apt-get install -y tensorflow-model-server

ENV PATH="/opt/ml/code:${PATH}"

# /opt/ml and all subdirectories are utilized by SageMaker; we use the /code subdirectory to store our user code.
COPY /code /opt/ml/code
WORKDIR /opt/ml/code

RUN chmod 755 serve
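
SageMaker's custom-container contract is that training jobs run a `train` executable and hosting runs `serve`, both resolved via the PATH set above; `serve` is the script shown below, while the training entrypoint is supplied by the installed sagemaker-tensorflow-training toolkit. Getting the image into ECR is a prerequisite for both. A hedged build-and-push sketch, with the region and repository name as placeholders and the `docker login` step omitted:

# Hypothetical build-and-push helper; the region and repository name are
# placeholders, and authenticating docker to ECR is omitted here.
import subprocess
import boto3

region = "us-east-1"
repo = "sagemaker-tf-flowers"
account = boto3.client("sts").get_caller_identity()["Account"]
image_uri = f"{account}.dkr.ecr.{region}.amazonaws.com/{repo}:latest"

# create the ECR repository on first run
ecr = boto3.client("ecr", region_name=region)
try:
    ecr.create_repository(repositoryName=repo)
except ecr.exceptions.RepositoryAlreadyExistsException:
    pass

# build from the `container` directory so COPY /code resolves correctly
subprocess.check_call(["docker", "build", "-t", image_uri, "container"])
subprocess.check_call(["docker", "push", image_uri])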

container/code/nginx.conf

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
events {
    # determines how many requests can simultaneously be served
    # https://www.digitalocean.com/community/tutorials/how-to-optimize-nginx-configuration
    # for more information
    worker_connections 2048;
}

http {
    server {
        # configures the server to listen on port 8080
        # Amazon SageMaker sends inference requests to port 8080.
        # For more information: https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
        listen 8080 deferred;
        client_max_body_size 10M;

        # redirects requests from SageMaker to TF Serving
        location /invocations {
            proxy_pass http://localhost:8501/v1/models/flowers_model:predict;
        }

        # Used by SageMaker to confirm that the server is alive.
        # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
        location /ping {
            return 200 "OK";
        }
    }
}
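
Tracing one request through this config: SageMaker POSTs a JSON body to port 8080 at /invocations, and nginx hands it to TensorFlow Serving's REST API on port 8501. A minimal sketch of that proxied call, using a dummy all-zeros 299x299x3 image; the model name and ports come from the files in this commit, and the payload shape follows the TF Serving REST API:

import json
import requests

# dummy 299x299 RGB image with pixel values already scaled to [0, 1],
# matching the preprocessing in train.py's parse_image
image = [[[0.0, 0.0, 0.0]] * 299] * 299

# the request nginx forwards when SageMaker calls POST /invocations
resp = requests.post(
    "http://localhost:8501/v1/models/flowers_model:predict",
    data=json.dumps({"instances": [image]}),
)
print(len(resp.json()["predictions"][0]))  # 102 softmax probabilities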

container/code/serve

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
#!/usr/bin/env python

# This file implements the hosting solution, which just starts TensorFlow Model Serving.
import subprocess
import os

TF_SERVING_DEFAULT_PORT = 8501
MODEL_NAME = 'flowers_model'
MODEL_BASE_PATH = '/opt/ml/model'


def start_server():
    print('Starting TensorFlow Serving.')

    # link the log streams to stdout/err so they will be logged to the container logs
    subprocess.check_call(
        ['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log'])
    subprocess.check_call(
        ['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log'])

    # start nginx server
    nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf'])

    # start TensorFlow Serving
    # https://www.tensorflow.org/serving/api_rest#start_modelserver_with_the_rest_api_endpoint
    tf_model_server = subprocess.call(['tensorflow_model_server',
                                       '--rest_api_port=' +
                                       str(TF_SERVING_DEFAULT_PORT),
                                       '--model_name=' + MODEL_NAME,
                                       '--model_base_path=' + MODEL_BASE_PATH])


# The main routine just invokes the start function.
if __name__ == '__main__':
    start_server()
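
Note the design: subprocess.call blocks on tensorflow_model_server, which keeps the container in the foreground while nginx runs alongside as the reverse proxy. Once this container backs a SageMaker endpoint, clients go through the SageMaker runtime API rather than the container's ports. A hedged sketch, with the endpoint name as a placeholder:

import json
import boto3

runtime = boto3.client("sagemaker-runtime")

# "flowers-endpoint" is a placeholder name; the payload shape matches the
# TF Serving REST API that nginx proxies to inside the container
image = [[[0.0, 0.0, 0.0]] * 299] * 299
response = runtime.invoke_endpoint(
    EndpointName="flowers-endpoint",
    ContentType="application/json",
    Body=json.dumps({"instances": [image]}),
)
print(json.loads(response["Body"].read()))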

container/code/train.py

Lines changed: 156 additions & 0 deletions

@@ -0,0 +1,156 @@
import argparse
import numpy as np
import os
import logging
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds


EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.001
DROPOUT_RATE = 0.3
EARLY_STOPPING_TRAIN_ACCURACY = 0.995
TF_AUTOTUNE = tf.data.experimental.AUTOTUNE
TF_HUB_MODEL_URL = 'https://tfhub.dev/google/inaturalist/inception_v3/feature_vector/4'
TF_DATASET_NAME = 'oxford_flowers102'
IMAGE_SIZE = (299, 299)
SHUFFLE_BUFFER_SIZE = 473
MODEL_VERSION = '1'


class EarlyStoppingCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if logs.get('accuracy', 0) > EARLY_STOPPING_TRAIN_ACCURACY:
            print(
                f"\nEarly stopping at {logs.get('accuracy'):.4f} > {EARLY_STOPPING_TRAIN_ACCURACY}!\n")
            self.model.stop_training = True


def parse_args():
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument('--epochs', type=int, default=EPOCHS)
    parser.add_argument('--batch_size', type=int, default=BATCH_SIZE)
    parser.add_argument('--learning_rate', type=float, default=LEARNING_RATE)

    # model_dir is always passed in from SageMaker. By default it is an S3 path under the default bucket.
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm_model_dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--model_version', type=str, default=MODEL_VERSION)

    return parser.parse_known_args()


def set_gpu_memory_growth():
    gpus = tf.config.list_physical_devices('GPU')

    if gpus:
        print("\nGPU Available.")
        print(f"Number of GPU: {len(gpus)}")
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
                print(f"Enabled Memory Growth on {gpu.name}")
        except RuntimeError as e:
            print(e)

    print()


def get_datasets(dataset_name):
    tfds.disable_progress_bar()

    # note: the 'test' split (6,149 images) is the largest of oxford_flowers102,
    # so it is mapped to ds_train; the 1,020-image 'train' split becomes the
    # held-out test set
    splits = ['test', 'validation', 'train']
    splits, ds_info = tfds.load(dataset_name, split=splits, with_info=True)
    (ds_train, ds_validation, ds_test) = splits

    return (ds_train, ds_validation, ds_test), ds_info


def parse_image(features):
    image = features['image']
    image = tf.image.resize(image, IMAGE_SIZE) / 255.0
    return image, features['label']


def training_pipeline(train_raw, batch_size):
    train_preprocessed = train_raw.shuffle(SHUFFLE_BUFFER_SIZE).map(
        parse_image, num_parallel_calls=TF_AUTOTUNE).cache().batch(batch_size).prefetch(TF_AUTOTUNE)

    return train_preprocessed


def test_pipeline(test_raw, batch_size):
    test_preprocessed = test_raw.map(parse_image, num_parallel_calls=TF_AUTOTUNE).cache(
    ).batch(batch_size).prefetch(TF_AUTOTUNE)

    return test_preprocessed


def create_model(train_batches, val_batches, learning_rate):
    # NUM_CLASSES and args are module-level names set in the __main__ block
    # before this function is called
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    base_model = hub.KerasLayer(TF_HUB_MODEL_URL,
                                input_shape=IMAGE_SIZE + (3,), trainable=False)

    early_stop_callback = EarlyStoppingCallback()

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.Dropout(DROPOUT_RATE),
        tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.summary()

    model.fit(train_batches, epochs=args.epochs,
              validation_data=val_batches,
              callbacks=[early_stop_callback])

    return model


if __name__ == "__main__":
    args, _ = parse_args()
    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    print(
        f"\nBatch Size = {batch_size}, Epochs = {epochs}, Learning Rate = {learning_rate}\n")

    set_gpu_memory_growth()

    (ds_train, ds_validation, ds_test), ds_info = get_datasets(TF_DATASET_NAME)
    NUM_CLASSES = ds_info.features['label'].num_classes

    print(
        f"\nNumber of Training dataset samples: {tf.data.experimental.cardinality(ds_train)}")
    print(
        f"Number of Validation dataset samples: {tf.data.experimental.cardinality(ds_validation)}")
    print(
        f"Number of Test dataset samples: {tf.data.experimental.cardinality(ds_test)}")
    print(f"Number of Flower Categories: {NUM_CLASSES}\n")

    train_batches = training_pipeline(ds_train, batch_size)
    validation_batches = test_pipeline(ds_validation, batch_size)
    test_batches = test_pipeline(ds_test, batch_size)

    model = create_model(train_batches, validation_batches, learning_rate)
    eval_results = model.evaluate(test_batches)

    for metric, value in zip(model.metrics_names, eval_results):
        print(metric + ': {:.4f}'.format(value))

    export_path = os.path.join(args.sm_model_dir, args.model_version)
    print(
        f'\nModel version: {args.model_version} exported to: {export_path}\n')

    model.save(export_path)
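
The README leaves the evaluation images up to the reader; whatever photo you choose must be preprocessed the same way parse_image does before it reaches the endpoint. A minimal sketch, with the file path as a placeholder:

import json
import tensorflow as tf

raw = tf.io.read_file("my_flower.jpg")  # placeholder path to any flower photo
image = tf.io.decode_jpeg(raw, channels=3)
image = tf.image.resize(image, (299, 299)) / 255.0  # same as parse_image

# ready to send to /invocations or via invoke_endpoint as shown earlier
payload = json.dumps({"instances": [image.numpy().tolist()]})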

data/juvchan_flower.jpg

Binary file added: 274 KB