[AGENT-3690] Refactor DrumServerRun and improve shutdown (#706)

elatt · web-flow · commit 5bc10883af3d · 2022-11-01T09:27:07.000-04:00
diff --git a/custom_model_runner/CHANGELOG.md b/custom_model_runner/CHANGELOG.md
@@ -4,12 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-#### [1.9.12] - 2022-10-25
+#### [1.9.12] - 2022-10-31
 ##### Added
-- Add support for a new hook (`custom_flask.py`) in the model-dir to allow extending the Flask
-  application when drum is running in server mode.
-- Add a new model template sample (`flask_extension_httpauth`) to illustrate a potential
-  authentication use-case using the new `custom_flask.py` hook.
+- Add support for a new hook (`custom_flask.py`) in the model-dir to allow extending the Flask application when drum is running in server mode.
+- Add a new model template sample (`flask_extension_httpauth`) to illustrate a potential authentication use-case using the new `custom_flask.py` hook.
+##### Changed
+- Improve handling of SIGTERM to support cleaner shutdowns.
+- Use `--init` flag when running docker containers to improve how signals are propagated to child processes.
 
 #### [1.9.11] - 2022-10-24
 ##### Changed
diff --git a/custom_model_runner/datarobot_drum/drum/drum.py b/custom_model_runner/datarobot_drum/drum/drum.py
@@ -866,7 +866,7 @@ def _prepare_docker_command(self, options, run_mode, raw_arguments):
         in_docker_fit_target_filename = "/opt/fit_target.csv"
         in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"
 
-        docker_cmd = "docker run --rm --entrypoint '' --interactive --user {}:{}".format(
+        docker_cmd = "docker run --rm --init --entrypoint '' --interactive --user {}:{}".format(
             os.getuid(), os.getgid()
         )
         docker_cmd_args = ' -v "{}":{}'.format(options.code_dir, in_docker_model)
@@ -1016,6 +1016,7 @@ def _run_inside_docker(self, options, run_mode, raw_arguments):
         try:
             retcode = p.wait()
         except KeyboardInterrupt:
+            p.terminate()
             retcode = 0
 
         self._print_verbose("{bar} retcode: {retcode} {bar}".format(bar="-" * 10, retcode=retcode))
@@ -1180,7 +1181,7 @@ def output_in_code_dir(code_dir, output_dir):
 
 def create_custom_inference_model_folder(code_dir, output_dir):
     readme = """
-    This folder was generated by the DRUM tool. It provides functionality for making 
+    This folder was generated by the DRUM tool. It provides functionality for making
     predictions using the model trained by DRUM
     """
     files_in_output = set(glob.glob(output_dir + "/**"))
diff --git a/custom_model_runner/datarobot_drum/drum/main.py b/custom_model_runner/datarobot_drum/drum/main.py
@@ -87,7 +87,11 @@ def signal_handler(sig, frame):
         # mlpiper restful_component relies on SIGINT to shutdown nginx and uwsgi,
         # so we don't intercept it.
         if hasattr(runtime.options, "production") and runtime.options.production:
-            pass
+
+            def raise_keyboard_interrupt(sig, frame):
+                raise KeyboardInterrupt("Triggered from {}".format(sig))
+
+            signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
         else:
             signal.signal(signal.SIGINT, signal_handler)
             signal.signal(signal.SIGTERM, signal_handler)
diff --git a/custom_model_runner/datarobot_drum/resource/drum_server_utils.py b/custom_model_runner/datarobot_drum/resource/drum_server_utils.py
@@ -4,8 +4,8 @@
 This is proprietary source code of DataRobot, Inc. and its affiliates.
 Released under the terms of DataRobot Tool and Utility Agreement.
 """
+import logging
 import os
-import psutil
 import requests
 import signal
 import time
@@ -15,30 +15,24 @@
 from datarobot_drum.drum.enum import ArgumentsOptions, ArgumentOptionsEnvVars
 from datarobot_drum.resource.utils import _exec_shell_cmd, _cmd_add_class_labels
 
+logger = logging.getLogger(__name__)
 
-def _wait_for_server(url, timeout, process_holder):
+
+def _wait_for_server(url, timeout):
     # waiting for ping to succeed
     while True:
         try:
             response = requests.get(url)
             if response.ok:
                 break
+            logger.debug("server is not ready: %s\n%s", response, response.text)
         except Exception:
             pass
 
         time.sleep(1)
         timeout -= 1
         if timeout <= 0:
-            if process_holder is not None:
-                print("Killing subprocess: {}".format(process_holder.process.pid))
-                try:
-                    os.killpg(os.getpgid(process_holder.process.pid), signal.SIGTERM)
-                    time.sleep(0.25)
-                    os.killpg(os.getpgid(process_holder.process.pid), signal.SIGKILL)
-                except psutil.ProcessLookupError:
-                    assert False, "Server failed to start: url: {}".format(url)
-
-            assert timeout, "Server failed to start: url: {}".format(url)
+            raise TimeoutError("Server failed to start: url: {}".format(url))
 
 
 def _run_server_thread(cmd, process_obj_holder, verbose=True):
@@ -91,7 +85,8 @@ def __init__(
         else:
             self.url_server_address = "http://localhost:{}".format(self.port)
 
-        cmd = "{} server".format(ArgumentsOptions.MAIN_COMMAND)
+        log_level = logging.getLevelName(logging.root.level).lower()
+        cmd = "{} server --logging-level={}".format(ArgumentsOptions.MAIN_COMMAND, log_level)
 
         if pass_args_as_env_vars:
             os.environ[ArgumentOptionsEnvVars.CODE_DIR] = str(custom_model_dir)
@@ -141,21 +136,40 @@ def __init__(
 
     def __enter__(self):
         self._server_thread = Thread(
-            target=_run_server_thread, args=(self._cmd, self._process_object_holder, self._verbose)
+            name="DRUM Server",
+            target=_run_server_thread,
+            args=(self._cmd, self._process_object_holder, self._verbose),
         )
         self._server_thread.start()
         time.sleep(0.5)
-
-        _wait_for_server(
-            self.url_server_address, timeout=30, process_holder=self._process_object_holder
-        )
+        try:
+            _wait_for_server(self.url_server_address, timeout=30)
+        except TimeoutError:
+            try:
+                self._shutdown_server()
+            except TimeoutError as e:
+                logger.error("server shutdown failure: %s", e)
+            raise
 
         return self
 
     def _shutdown_server(self):
-        # Server has to be killed
-        os.killpg(os.getpgid(self._process_object_holder.process.pid), signal.SIGTERM)
-        self._server_thread.join(timeout=5)
+        pid = self._process_object_holder.process.pid
+        pgid = None
+        try:
+            pgid = os.getpgid(pid)
+            logger.info("Sending signal to ProcessGroup: %s", pgid)
+            os.killpg(pgid, signal.SIGTERM)
+        except ProcessLookupError:
+            logger.warning("server at pid=%s is already gone", pid)
+
+        self._server_thread.join(timeout=10)
+        if self._server_thread.is_alive():
+            if pgid is not None:
+                logger.warning("Forcefully killing process group: %s", pgid)
+                os.killpg(pgid, signal.SIGKILL)
+                self._server_thread.join(timeout=2)
+            raise TimeoutError("Server failed to shutdown gracefully in allotted time")
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         # shutdown server
@@ -165,7 +179,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
             try:
                 self._shutdown_server()
             except Exception:
-                pass
+                logger.warning("shutdown failure", exc_info=True)
 
     @property
     def process(self):
diff --git a/jenkins/test_drop_in_envs.groovy b/jenkins/test_drop_in_envs.groovy
@@ -22,14 +22,18 @@ node('release-dev && memory-intense'){
         popd
     '''.stripIndent()
 
-    withQuantum([
-        bash: '''\
-            set -exuo pipefail
-            ls -la jenkins_artifacts
-            ./jenkins/test_drop_in_envs.sh
-        '''.stripIndent(),
-        pythonVersion: '3',
-        venvName: "datarobot-user-models"
-    ])
+    try {
+      withQuantum([
+          bash: '''\
+              set -exuo pipefail
+              ls -la jenkins_artifacts
+              ./jenkins/test_drop_in_envs.sh
+          '''.stripIndent(),
+          pythonVersion: '3',
+          venvName: "datarobot-user-models"
+      ])
+    } finally {
+      junit allowEmptyResults: true, testResults: '**/results*.xml'
+    }
   }
 }
diff --git a/jenkins/test_inference_model_templates.groovy b/jenkins/test_inference_model_templates.groovy
@@ -22,14 +22,18 @@ node('release-dev && memory-intense'){
         popd
     '''.stripIndent()
 
-    withQuantum([
-        bash: '''\
-            set -exuo pipefail
-            ls -la jenkins_artifacts
-            ./jenkins/test_inference_model_templates.sh
-        '''.stripIndent(),
-        pythonVersion: '3',
-        venvName: "datarobot-user-models"
-    ])
+    try {
+      withQuantum([
+          bash: '''\
+              set -exuo pipefail
+              ls -la jenkins_artifacts
+              ./jenkins/test_inference_model_templates.sh
+          '''.stripIndent(),
+          pythonVersion: '3',
+          venvName: "datarobot-user-models"
+      ])
+    } finally {
+      junit allowEmptyResults: true, testResults: '**/results*.xml'
+    }
   }
 }
diff --git a/jenkins/test_integration_general.groovy b/jenkins/test_integration_general.groovy
@@ -6,14 +6,18 @@ node('multi-executor && ubuntu:focal'){
     dir('jenkins_artifacts'){
         unstash 'drum_wheel'
     }
-    withQuantum([
-        bash: '''\
-            set -exuo pipefail
-            ls -la jenkins_artifacts
-            jenkins/test_integration_general.sh
-        '''.stripIndent(),
-        pythonVersion: '3',
-        venvName: "datarobot-user-models"
-    ])
+    try {
+      withQuantum([
+          bash: '''\
+              set -exuo pipefail
+              ls -la jenkins_artifacts
+              jenkins/test_integration_general.sh
+          '''.stripIndent(),
+          pythonVersion: '3',
+          venvName: "datarobot-user-models"
+      ])
+    } finally {
+      junit allowEmptyResults: true, testResults: '**/results*.xml'
+    }
   }
-}
+}
diff --git a/jenkins/test_integration_general.sh b/jenkins/test_integration_general.sh
@@ -69,15 +69,15 @@ title "Running tests: sequential test cases Java Custom Predictor and MLOps Moni
 
 # only run here tests which were sequential historically
 pytest tests/drum/test_inference_custom_java_predictor.py tests/drum/test_mlops_monitoring.py \
-       --junit-xml="${GIT_ROOT}/results_integration.xml" \
+       --junit-xml="${GIT_ROOT}/results_integration_serial.xml" \
        -n 1
 TEST_RESULT_1=$?
 
 title "Running tests: all other cases in parallel"
 pytest tests/drum/ \
        -k "not test_inference_custom_java_predictor.py and not test_mlops_monitoring.py" \
        -m "not sequential" \
-       --junit-xml="${GIT_ROOT}/results_integration.xml" \
+       --junit-xml="${GIT_ROOT}/results_integration_parallel.xml" \
        -n auto
 TEST_RESULT_2=$?
 
diff --git a/jenkins/test_integration_per_framework.groovy b/jenkins/test_integration_per_framework.groovy
@@ -6,7 +6,11 @@ node('multi-executor && ubuntu:focal'){
   dir('jenkins_artifacts'){
       unstash 'drum_wheel'
   }
-  sh "ls -la jenkins_artifacts"
-  sh "echo $FRAMEWORK"
-  sh 'bash jenkins/test_integration_per_framework.sh $FRAMEWORK'
-}
+  try {
+    sh "ls -la jenkins_artifacts"
+    sh "echo $FRAMEWORK"
+    sh 'bash jenkins/test_integration_per_framework.sh $FRAMEWORK'
+  } finally {
+    junit allowEmptyResults: true, testResults: '**/results*.xml'
+  }
+}
diff --git a/jenkins/test_training_model_templates.groovy b/jenkins/test_training_model_templates.groovy
@@ -23,14 +23,18 @@ node('release-dev && memory-intense'){
         popd
     '''.stripIndent()
 
-    withQuantum([
-        bash: '''\
-            set -exuo pipefail
-            ls -la jenkins_artifacts
-            ./jenkins/test_training_model_templates.sh
-        '''.stripIndent(),
-        pythonVersion: '3',
-        venvName: "datarobot-user-models"
-    ])
+    try {
+      withQuantum([
+          bash: '''\
+              set -exuo pipefail
+              ls -la jenkins_artifacts
+              ./jenkins/test_training_model_templates.sh
+          '''.stripIndent(),
+          pythonVersion: '3',
+          venvName: "datarobot-user-models"
+      ])
+    } finally {
+      junit allowEmptyResults: true, testResults: '**/results*.xml'
+    }
   }
 }
diff --git a/model_templates/flask_extension_httpauth/README.md b/model_templates/flask_extension_httpauth/README.md
@@ -11,7 +11,9 @@ Note: it is **not** necessary (nor recommended) to add authentication to custom
 This example is simply to demonstration the flexibility of the `custom_flask.py` hook.
 
 ## Instructions
-Create a new custom model with these files and use the Python Drop-In Environment with it
+Create a new custom model with these files and use the Python Drop-In Environment with it.
+
+**Important:** extending the web server is only available when running **without** the `--production` flag (or `PRODUCTION=1` environment variable).
 
 ### To run locally using 'drum'
 Paths are relative to `./datarobot-user-models`:
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,3 +4,6 @@ line-length = 100
 [tool.pytest.ini_options]
 addopts = "--doctest-modules"
 markers = ["sequential: marks tests to be executed sequentially"]
+junit_family = "xunit2"
+junit_logging = "all"
+junit_log_passing_tests = false
diff --git a/tests/drum/run_integration_tests_in_framework_container.sh b/tests/drum/run_integration_tests_in_framework_container.sh
@@ -15,8 +15,6 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 echo "-- running drum tests - assuming running inside Docker"
 
-GIT_ROOT=$(git rev-parse --show-toplevel)
-echo "GIT_ROOT: $GIT_ROOT"
 echo
 echo "Running pytest:"
 
@@ -40,7 +38,7 @@ fi
 
 pytest ${TESTS_TO_RUN} \
        --framework-env $1 \
-       --junit-xml="$GIT_ROOT/results_integration.xml" \
+       --junit-xml="./results_integration.xml" \
        -n auto
 
 TEST_RESULT=$?
diff --git a/tests/drum/test_drum_server_failures.py b/tests/drum/test_drum_server_failures.py
@@ -50,7 +50,7 @@ def assert_drum_server_run_failure(
         self, server_run_args, with_error_server, error_message, with_nginx=False, docker=None
     ):
         drum_server_run = DrumServerRun(
-            **server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker
+            **server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker,
         )
 
         if with_error_server or with_nginx:
@@ -68,8 +68,8 @@ def assert_drum_server_run_failure(
                 assert error_message in response.json()["message"]
         else:
             # DrumServerRun tries to ping the server.
-            # if ping fails for timeout, AssertionError("Server failed to start") is risen
-            with pytest.raises(AssertionError, match="Server failed to start"), drum_server_run:
+            # if ping fails for timeout, TimeoutError("Server failed to start") is raised
+            with pytest.raises(TimeoutError, match="Server failed to start"), drum_server_run:
                 pass
 
             # If server is started with error server or with_nginx (in docker), it is killed in the end of test.
@@ -87,7 +87,7 @@ def test_ping_endpoints(self, params, with_error_server, with_nginx, docker):
         os.remove(os.path.join(custom_model_dir, "custom.py"))
 
         drum_server_run = DrumServerRun(
-            **server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker
+            **server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker,
         )
 
         with drum_server_run as run:
@@ -162,7 +162,7 @@ def test_e2e_predict_fails(self, resources, params, with_error_server, with_ngin
         os.remove(os.path.join(custom_model_dir, "custom.py"))
 
         drum_server_run = DrumServerRun(
-            **server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker
+            **server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker,
         )
 
         with drum_server_run as run: