-
Notifications
You must be signed in to change notification settings - Fork 9
Description
SageMaker batch transform fails with the following error:
2022-08-30T09:01:17.792:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=50, BatchStrategy=MULTI_RECORD
2022-08-30T09:01:17.883:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: ClientError: 400
2022-08-30T09:01:17.883:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg:
2022-08-30T09:01:17.883:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: Message:
2022-08-30T09:01:17.883:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: {
2022-08-30T09:01:17.883:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: "code": 400,
2022-08-30T09:01:17.884:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: "type": "BadRequestException",
2022-08-30T09:01:17.884:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: "message": "Parameter model_name is required."
2022-08-30T09:01:17.884:[sagemaker logs]: st-s3/trainingPlatform/model/ba0ba70ebb2c48f69c61240a199f7a24/inference/dataset/21f9bb9e2a39407691bdb18e04e1b672/202208170843490.jpg: }
------------------------------------------------- My Dockerfile is: -------------------------------------------------
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04

# Library versions bundled with this base image (informational):
# NCCL_VERSION=2.4.7, CUDNN_VERSION=7.6.2.24

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

# Build arguments: Python interpreter/version, Open MPI version, TorchServe
# version, and URLs of the prebuilt AWS PyTorch/torchvision wheels.
ARG PYTHON=python3
ARG PYTHON_VERSION=3.7.3
ARG OPEN_MPI_VERSION=4.0.1
ARG TS_VERSION="0.3.1"
ARG PT_INFERENCE_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.6.0_inference/20200727-223446/b0251e7e070e57f34ee08ac59ab4710081b41918/gpu/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl
ARG PT_VISION_URL=https://torchvision-build.s3.amazonaws.com/1.6.0/gpu/torchvision-0.7.0-cp36-cp36m-linux_x86_64.whl

# See http://bugs.python.org/issue19846 — a UTF-8 locale keeps Python 3 I/O sane.
ENV LANG=C.UTF-8
ENV LD_LIBRARY_PATH=/opt/conda/lib/:$LD_LIBRARY_PATH
ENV PATH=/opt/conda/bin:$PATH
ENV SAGEMAKER_SERVING_MODULE=sagemaker_pytorch_serving_container.serving:main
ENV TEMP=/home/model-server/tmp
# Install OS-level build and runtime dependencies in a single layer and
# clean the apt cache in the same layer so it does not bloat the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:openjdk-r/ppa \
 && apt-get update \
 && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    jq \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    libibverbs-dev \
    libnuma1 \
    libnuma-dev \
    libsm6 \
    libxext6 \
    libxrender-dev \
    openjdk-11-jdk \
    unzip \
    vim \
    wget \
    zlib1g-dev \
 && rm -rf /var/lib/apt/lists/*
# Convert the Java cacerts keystore to JKS format so OpenJDK 11 can read it.
# See docker-library/openjdk#261 and
# https://github.com/docker-library/openjdk/pull/263/files
RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt \
 && mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts \
 && /var/lib/dpkg/info/ca-certificates-java.postinst configure
# Build Open MPI from source and install it under /home/.openmpi;
# remove the tarball and build tree in the same layer.
RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
 && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \
 && cd openmpi-$OPEN_MPI_VERSION \
 && ./configure --prefix=/home/.openmpi \
 && make all install \
 && cd .. \
 && rm openmpi-$OPEN_MPI_VERSION.tar.gz \
 && rm -rf openmpi-$OPEN_MPI_VERSION

# Make the Open MPI binaries and shared libraries visible at runtime.
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
# Install OpenSSH and allow it to talk to containers without asking for
# host-key confirmation (needed for MPI multi-node communication).
# NOTE: the rewritten file must be moved back to /etc/ssh/ssh_config —
# the original wrote to "ssh_configs", leaving StrictHostKeyChecking enabled.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
    openssh-client \
    openssh-server \
 && rm -rf /var/lib/apt/lists/* \
 && mkdir -p /var/run/sshd \
 && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
 && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
# Install Miniconda into /opt/conda (already prepended to PATH above)
# and delete the installer in the same layer.
RUN curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh \
 && chmod +x ~/miniconda.sh \
 && ~/miniconda.sh -b -p /opt/conda \
 && rm ~/miniconda.sh
# Create the base conda environment with the pinned Python and core
# scientific packages. -y is required on every conda command — without it
# conda prompts for confirmation and the non-interactive build fails.
RUN /opt/conda/bin/conda update -y conda \
 && /opt/conda/bin/conda install -y -c conda-forge \
    python=$PYTHON_VERSION \
 && /opt/conda/bin/conda install -y \
    cython==0.29.12 \
    ipython==7.7.0 \
    mkl-include==2019.4 \
    mkl==2019.4 \
    numpy==1.19.1 \
    scipy==1.3.0 \
    typing==3.6.4 \
 && /opt/conda/bin/conda clean -ya
# Install CUDA-specific and general ML packages; -y added so conda does not
# prompt during the build. pip gets --no-cache-dir to keep the layer small.
# SECURITY NOTE: `ssl_verify False` disables TLS certificate verification for
# conda — kept for behavior parity, but consider removing it.
RUN conda install -y -c \
    pytorch magma-cuda101 \
 && conda install -y -c \
    conda-forge \
    opencv==4.0.1 \
 && conda install -y \
    scikit-learn==0.21.2 \
    pandas==0.25.0 \
    h5py==2.9.0 \
    requests==2.22.0 \
 && conda clean -ya \
 && /opt/conda/bin/conda config --set ssl_verify False \
 && pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
 && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
 && pip install --no-cache-dir packaging==20.4 \
    enum-compat==0.0.3 \
    ruamel-yaml
# Uninstall and re-install torch and torchvision from prebuilt cp37 wheels
# (mirrors of the official PyTorch 1.6.0 / torchvision 0.7.0 CUDA builds),
# then replace multi-model-server with TorchServe at the pinned version.
RUN pip install --no-cache-dir -U https://pypi.tuna.tsinghua.edu.cn/packages/5d/5e/35140615fc1f925023f489e71086a9ecc188053d263d3594237281284d82/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl#sha256=87d65c01d1b70bb46070824f28bfd93c86d3c5c56b90cbbe836a3f2491d91c76
RUN pip uninstall -y torchvision \
 && pip install --no-deps --no-cache-dir -U https://mirrors.aliyun.com/pypi/packages/4d/b5/60d5eb61f1880707a5749fea43e0ec76f27dfe69391cdec953ab5da5e676/torchvision-0.7.0-cp37-cp37m-manylinux1_x86_64.whl#sha256=0d1a5adfef4387659c7a0af3b72e16caa0c67224a422050ab65184d13ac9fb13
RUN pip uninstall -y model-archiver multi-model-server \
 && pip install --no-cache-dir captum \
 && pip install --no-cache-dir torchserve==$TS_VERSION \
 && pip install --no-cache-dir torch-model-archiver==$TS_VERSION
# Create the unprivileged model-server user and the directories TorchServe
# writes to (tmp dir configured via ENV TEMP above; /opt/ml/model is where
# SageMaker mounts the model artifacts).
RUN useradd -m model-server \
 && mkdir -p /home/model-server/tmp /opt/ml/model \
 && chown -R model-server /home/model-server /opt/ml/model

# Container entrypoint script and TorchServe configuration.
COPY torchserve-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY config.properties /home/model-server

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py

# NOTE(review): bare ADD of a remote URL is unverified and tracks `master`;
# prefer fetching a pinned revision with curl and checking its checksum.
ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py
RUN chmod +x /usr/local/bin/deep_learning_container.py

# SageMaker inference toolkit (provides the serving module set in
# SAGEMAKER_SERVING_MODULE above).
RUN pip install --no-cache-dir "sagemaker-pytorch-inference>=2"

RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.6.0/license.txt -o /license.txt

RUN conda install -y -c conda-forge "pyyaml>5.4,<5.5"
RUN pip install --no-cache-dir pillow==8.2.0 "awscli<2"

# Detectron2 wheel matching torch 1.6 / CUDA 10.1.
RUN python3 -m pip install --no-cache-dir detectron2==0.4 \
    -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.6/index.html
# Run the AWS OSS-compliance tooling and remove it in the same layer.
# Fixes vs. the pasted original: the unzip call was missing the `$` and the
# archive/destination arguments, and the generate_oss_compliance.sh
# invocation itself had been lost (only its arguments remained).
RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*
# TorchServe inference (8080) and management (8081) ports — documentation
# only; EXPOSE does not publish them.
EXPOSE 8080 8081
# Exec-form entrypoint: the SageMaker wrapper script decides whether to run
# `serve` or pass through the CMD below.
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["torchserve", "--start", "--ts-config", "/home/model-server/config.properties", "--model-store", "/home/model-server/"]