Skip to content

Commit 5bc1088

Browse files
authored
[AGENT-3690] Refactor DrumServerRun and improve shutdown (#706)
1 parent 1542165 commit 5bc1088

14 files changed

+125
-82
lines changed

custom_model_runner/CHANGELOG.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7-
#### [1.9.12] - 2022-10-25
7+
#### [1.9.12] - 2022-10-31
88
##### Added
9-
- Add support for a new hook (`custom_flask.py`) in the model-dir to allow extending the Flask
10-
application when drum is running in server mode.
11-
- Add a new model template sample (`flask_extension_httpauth`) to illustrate a potential
12-
authentication use-case using the new `custom_flask.py` hook.
9+
- Add support for a new hook (`custom_flask.py`) in the model-dir to allow extending the Flask application when drum is running in server mode.
10+
- Add a new model template sample (`flask_extension_httpauth`) to illustrate a potential authentication use-case using the new `custom_flask.py` hook.
11+
##### Changed
12+
- Improve handling of SIGTERM to support cleaner shutdowns.
13+
- Use `--init` flag when running docker containers to improve how signals are propigated to child processes.
1314

1415
#### [1.9.11] - 2022-10-24
1516
##### Changed

custom_model_runner/datarobot_drum/drum/drum.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -866,7 +866,7 @@ def _prepare_docker_command(self, options, run_mode, raw_arguments):
866866
in_docker_fit_target_filename = "/opt/fit_target.csv"
867867
in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"
868868

869-
docker_cmd = "docker run --rm --entrypoint '' --interactive --user {}:{}".format(
869+
docker_cmd = "docker run --rm --init --entrypoint '' --interactive --user {}:{}".format(
870870
os.getuid(), os.getgid()
871871
)
872872
docker_cmd_args = ' -v "{}":{}'.format(options.code_dir, in_docker_model)
@@ -1016,6 +1016,7 @@ def _run_inside_docker(self, options, run_mode, raw_arguments):
10161016
try:
10171017
retcode = p.wait()
10181018
except KeyboardInterrupt:
1019+
p.terminate()
10191020
retcode = 0
10201021

10211022
self._print_verbose("{bar} retcode: {retcode} {bar}".format(bar="-" * 10, retcode=retcode))
@@ -1180,7 +1181,7 @@ def output_in_code_dir(code_dir, output_dir):
11801181

11811182
def create_custom_inference_model_folder(code_dir, output_dir):
11821183
readme = """
1183-
This folder was generated by the DRUM tool. It provides functionality for making
1184+
This folder was generated by the DRUM tool. It provides functionality for making
11841185
predictions using the model trained by DRUM
11851186
"""
11861187
files_in_output = set(glob.glob(output_dir + "/**"))

custom_model_runner/datarobot_drum/drum/main.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,11 @@ def signal_handler(sig, frame):
8787
# mlpiper restful_component relies on SIGINT to shutdown nginx and uwsgi,
8888
# so we don't intercept it.
8989
if hasattr(runtime.options, "production") and runtime.options.production:
90-
pass
90+
91+
def raise_keyboard_interrupt(sig, frame):
92+
raise KeyboardInterrupt("Triggered from {}".format(sig))
93+
94+
signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
9195
else:
9296
signal.signal(signal.SIGINT, signal_handler)
9397
signal.signal(signal.SIGTERM, signal_handler)

custom_model_runner/datarobot_drum/resource/drum_server_utils.py

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
This is proprietary source code of DataRobot, Inc. and its affiliates.
55
Released under the terms of DataRobot Tool and Utility Agreement.
66
"""
7+
import logging
78
import os
8-
import psutil
99
import requests
1010
import signal
1111
import time
@@ -15,30 +15,24 @@
1515
from datarobot_drum.drum.enum import ArgumentsOptions, ArgumentOptionsEnvVars
1616
from datarobot_drum.resource.utils import _exec_shell_cmd, _cmd_add_class_labels
1717

18+
logger = logging.getLogger(__name__)
1819

19-
def _wait_for_server(url, timeout, process_holder):
20+
21+
def _wait_for_server(url, timeout):
2022
# waiting for ping to succeed
2123
while True:
2224
try:
2325
response = requests.get(url)
2426
if response.ok:
2527
break
28+
logger.debug("server is not ready: %s\n%s", response, response.text)
2629
except Exception:
2730
pass
2831

2932
time.sleep(1)
3033
timeout -= 1
3134
if timeout <= 0:
32-
if process_holder is not None:
33-
print("Killing subprocess: {}".format(process_holder.process.pid))
34-
try:
35-
os.killpg(os.getpgid(process_holder.process.pid), signal.SIGTERM)
36-
time.sleep(0.25)
37-
os.killpg(os.getpgid(process_holder.process.pid), signal.SIGKILL)
38-
except psutil.ProcessLookupError:
39-
assert False, "Server failed to start: url: {}".format(url)
40-
41-
assert timeout, "Server failed to start: url: {}".format(url)
35+
raise TimeoutError("Server failed to start: url: {}".format(url))
4236

4337

4438
def _run_server_thread(cmd, process_obj_holder, verbose=True):
@@ -91,7 +85,8 @@ def __init__(
9185
else:
9286
self.url_server_address = "http://localhost:{}".format(self.port)
9387

94-
cmd = "{} server".format(ArgumentsOptions.MAIN_COMMAND)
88+
log_level = logging.getLevelName(logging.root.level).lower()
89+
cmd = "{} server --logging-level={}".format(ArgumentsOptions.MAIN_COMMAND, log_level)
9590

9691
if pass_args_as_env_vars:
9792
os.environ[ArgumentOptionsEnvVars.CODE_DIR] = str(custom_model_dir)
@@ -141,21 +136,40 @@ def __init__(
141136

142137
def __enter__(self):
143138
self._server_thread = Thread(
144-
target=_run_server_thread, args=(self._cmd, self._process_object_holder, self._verbose)
139+
name="DRUM Server",
140+
target=_run_server_thread,
141+
args=(self._cmd, self._process_object_holder, self._verbose),
145142
)
146143
self._server_thread.start()
147144
time.sleep(0.5)
148-
149-
_wait_for_server(
150-
self.url_server_address, timeout=30, process_holder=self._process_object_holder
151-
)
145+
try:
146+
_wait_for_server(self.url_server_address, timeout=30)
147+
except TimeoutError:
148+
try:
149+
self._shutdown_server()
150+
except TimeoutError as e:
151+
logger.error("server shutdown failure: %s", e)
152+
raise
152153

153154
return self
154155

155156
def _shutdown_server(self):
156-
# Server has to be killed
157-
os.killpg(os.getpgid(self._process_object_holder.process.pid), signal.SIGTERM)
158-
self._server_thread.join(timeout=5)
157+
pid = self._process_object_holder.process.pid
158+
pgid = None
159+
try:
160+
pgid = os.getpgid(pid)
161+
logger.info("Sending signal to ProcessGroup: %s", pgid)
162+
os.killpg(pgid, signal.SIGTERM)
163+
except ProcessLookupError:
164+
logger.warning("server at pid=%s is already gone", pid)
165+
166+
self._server_thread.join(timeout=10)
167+
if self._server_thread.is_alive():
168+
if pgid is not None:
169+
logger.warning("Forcefully killing process group: %s", pgid)
170+
os.killpg(pgid, signal.SIGKILL)
171+
self._server_thread.join(timeout=2)
172+
raise TimeoutError("Server failed to shutdown gracefully in allotted time")
159173

160174
def __exit__(self, exc_type, exc_val, exc_tb):
161175
# shutdown server
@@ -165,7 +179,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
165179
try:
166180
self._shutdown_server()
167181
except Exception:
168-
pass
182+
logger.warning("shutdown failure", exc_info=True)
169183

170184
@property
171185
def process(self):

jenkins/test_drop_in_envs.groovy

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,18 @@ node('release-dev && memory-intense'){
2222
popd
2323
'''.stripIndent()
2424

25-
withQuantum([
26-
bash: '''\
27-
set -exuo pipefail
28-
ls -la jenkins_artifacts
29-
./jenkins/test_drop_in_envs.sh
30-
'''.stripIndent(),
31-
pythonVersion: '3',
32-
venvName: "datarobot-user-models"
33-
])
25+
try {
26+
withQuantum([
27+
bash: '''\
28+
set -exuo pipefail
29+
ls -la jenkins_artifacts
30+
./jenkins/test_drop_in_envs.sh
31+
'''.stripIndent(),
32+
pythonVersion: '3',
33+
venvName: "datarobot-user-models"
34+
])
35+
} finally {
36+
junit allowEmptyResults: true, testResults: '**/results*.xml'
37+
}
3438
}
3539
}

jenkins/test_inference_model_templates.groovy

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,18 @@ node('release-dev && memory-intense'){
2222
popd
2323
'''.stripIndent()
2424

25-
withQuantum([
26-
bash: '''\
27-
set -exuo pipefail
28-
ls -la jenkins_artifacts
29-
./jenkins/test_inference_model_templates.sh
30-
'''.stripIndent(),
31-
pythonVersion: '3',
32-
venvName: "datarobot-user-models"
33-
])
25+
try {
26+
withQuantum([
27+
bash: '''\
28+
set -exuo pipefail
29+
ls -la jenkins_artifacts
30+
./jenkins/test_inference_model_templates.sh
31+
'''.stripIndent(),
32+
pythonVersion: '3',
33+
venvName: "datarobot-user-models"
34+
])
35+
} finally {
36+
junit allowEmptyResults: true, testResults: '**/results*.xml'
37+
}
3438
}
3539
}

jenkins/test_integration_general.groovy

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,18 @@ node('multi-executor && ubuntu:focal'){
66
dir('jenkins_artifacts'){
77
unstash 'drum_wheel'
88
}
9-
withQuantum([
10-
bash: '''\
11-
set -exuo pipefail
12-
ls -la jenkins_artifacts
13-
jenkins/test_integration_general.sh
14-
'''.stripIndent(),
15-
pythonVersion: '3',
16-
venvName: "datarobot-user-models"
17-
])
9+
try {
10+
withQuantum([
11+
bash: '''\
12+
set -exuo pipefail
13+
ls -la jenkins_artifacts
14+
jenkins/test_integration_general.sh
15+
'''.stripIndent(),
16+
pythonVersion: '3',
17+
venvName: "datarobot-user-models"
18+
])
19+
} finally {
20+
junit allowEmptyResults: true, testResults: '**/results*.xml'
21+
}
1822
}
19-
}
23+
}

jenkins/test_integration_general.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,15 @@ title "Running tests: sequential test cases Java Custom Predictor and MLOps Moni
6969

7070
# only run here tests which were sequential historically
7171
pytest tests/drum/test_inference_custom_java_predictor.py tests/drum/test_mlops_monitoring.py \
72-
--junit-xml="${GIT_ROOT}/results_integration.xml" \
72+
--junit-xml="${GIT_ROOT}/results_integration_serial.xml" \
7373
-n 1
7474
TEST_RESULT_1=$?
7575

7676
title "Running tests: all other cases in parallel"
7777
pytest tests/drum/ \
7878
-k "not test_inference_custom_java_predictor.py and not test_mlops_monitoring.py" \
7979
-m "not sequential" \
80-
--junit-xml="${GIT_ROOT}/results_integration.xml" \
80+
--junit-xml="${GIT_ROOT}/results_integration_parallel.xml" \
8181
-n auto
8282
TEST_RESULT_2=$?
8383

jenkins/test_integration_per_framework.groovy

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@ node('multi-executor && ubuntu:focal'){
66
dir('jenkins_artifacts'){
77
unstash 'drum_wheel'
88
}
9-
sh "ls -la jenkins_artifacts"
10-
sh "echo $FRAMEWORK"
11-
sh 'bash jenkins/test_integration_per_framework.sh $FRAMEWORK'
12-
}
9+
try {
10+
sh "ls -la jenkins_artifacts"
11+
sh "echo $FRAMEWORK"
12+
sh 'bash jenkins/test_integration_per_framework.sh $FRAMEWORK'
13+
} finally {
14+
junit allowEmptyResults: true, testResults: '**/results*.xml'
15+
}
16+
}

jenkins/test_training_model_templates.groovy

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,18 @@ node('release-dev && memory-intense'){
2323
popd
2424
'''.stripIndent()
2525

26-
withQuantum([
27-
bash: '''\
28-
set -exuo pipefail
29-
ls -la jenkins_artifacts
30-
./jenkins/test_training_model_templates.sh
31-
'''.stripIndent(),
32-
pythonVersion: '3',
33-
venvName: "datarobot-user-models"
34-
])
26+
try {
27+
withQuantum([
28+
bash: '''\
29+
set -exuo pipefail
30+
ls -la jenkins_artifacts
31+
./jenkins/test_training_model_templates.sh
32+
'''.stripIndent(),
33+
pythonVersion: '3',
34+
venvName: "datarobot-user-models"
35+
])
36+
} finally {
37+
junit allowEmptyResults: true, testResults: '**/results*.xml'
38+
}
3539
}
3640
}

model_templates/flask_extension_httpauth/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ Note: it is **not** necessary (nor recommended) to add authentication to custom
1111
This example is simply to demonstration the flexibility of the `custom_flask.py` hook.
1212

1313
## Instructions
14-
Create a new custom model with these files and use the Python Drop-In Environment with it
14+
Create a new custom model with these files and use the Python Drop-In Environment with it.
15+
16+
**Important:** extending the web server is only available when running **without** the `--production` flag (or `PRODUCTION=1` environment variable).
1517

1618
### To run locally using 'drum'
1719
Paths are relative to `./datarobot-user-models`:

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ line-length = 100
44
[tool.pytest.ini_options]
55
addopts = "--doctest-modules"
66
markers = ["sequential: marks tests to be executed sequentially"]
7+
junit_family = "xunit2"
8+
junit_logging = "all"
9+
junit_log_passing_tests = false

tests/drum/run_integration_tests_in_framework_container.sh

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1515

1616
echo "-- running drum tests - assuming running inside Docker"
1717

18-
GIT_ROOT=$(git rev-parse --show-toplevel)
19-
echo "GIT_ROOT: $GIT_ROOT"
2018
echo
2119
echo "Running pytest:"
2220

@@ -40,7 +38,7 @@ fi
4038

4139
pytest ${TESTS_TO_RUN} \
4240
--framework-env $1 \
43-
--junit-xml="$GIT_ROOT/results_integration.xml" \
41+
--junit-xml="./results_integration.xml" \
4442
-n auto
4543

4644
TEST_RESULT=$?

tests/drum/test_drum_server_failures.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def assert_drum_server_run_failure(
5050
self, server_run_args, with_error_server, error_message, with_nginx=False, docker=None
5151
):
5252
drum_server_run = DrumServerRun(
53-
**server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker
53+
**server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker,
5454
)
5555

5656
if with_error_server or with_nginx:
@@ -68,8 +68,8 @@ def assert_drum_server_run_failure(
6868
assert error_message in response.json()["message"]
6969
else:
7070
# DrumServerRun tries to ping the server.
71-
# if ping fails for timeout, AssertionError("Server failed to start") is risen
72-
with pytest.raises(AssertionError, match="Server failed to start"), drum_server_run:
71+
# if ping fails for timeout, TimeoutError("Server failed to start") is risen
72+
with pytest.raises(TimeoutError, match="Server failed to start"), drum_server_run:
7373
pass
7474

7575
# If server is started with error server or with_nginx (in docker), it is killed in the end of test.
@@ -87,7 +87,7 @@ def test_ping_endpoints(self, params, with_error_server, with_nginx, docker):
8787
os.remove(os.path.join(custom_model_dir, "custom.py"))
8888

8989
drum_server_run = DrumServerRun(
90-
**server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker
90+
**server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker,
9191
)
9292

9393
with drum_server_run as run:
@@ -162,7 +162,7 @@ def test_e2e_predict_fails(self, resources, params, with_error_server, with_ngin
162162
os.remove(os.path.join(custom_model_dir, "custom.py"))
163163

164164
drum_server_run = DrumServerRun(
165-
**server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker
165+
**server_run_args, with_error_server=with_error_server, nginx=with_nginx, docker=docker,
166166
)
167167

168168
with drum_server_run as run:

0 commit comments

Comments
 (0)