Skip to content

Commit eb32c00

Browse files
authored
Added scripts for BioNeMo 2.5 (#596)
Added scripts for BioNeMo 2.5
1 parent b2af25d commit eb32c00

File tree

7 files changed

+197
-4
lines changed

7 files changed

+197
-4
lines changed

3.test_cases/14.bionemo/0.Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/clara/bionemo-framework:2.5
1+
FROM nvcr.io/nvidia/clara/bionemo-framework:1.2
22

33
ARG EFA_INSTALLER_VERSION=1.37.0
44
ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
@@ -90,6 +90,6 @@ SHELL ["/bin/sh", "-c"]
9090
COPY requirements.txt /workspace/
9191
RUN pip3 install -r /workspace/requirements.txt
9292

93-
COPY prepare_uniref50.py /workspace/bionemo2
93+
COPY prepare_uniref50.py /workspace/bionemo
9494

95-
WORKDIR /workspace/bionemo2
95+
WORKDIR /workspace/bionemo

3.test_cases/14.bionemo/1.uniref50.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ declare -a ARGS=(
2121
--container-mounts $FSX_MOUNT
2222
)
2323

24-
srun -l "${ARGS[@]}" python3 /workspace/bionemo2/prepare_uniref50.py
24+
srun -l "${ARGS[@]}" python3 /workspace/bionemo/prepare_uniref50.py
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
FROM nvcr.io/nvidia/clara/bionemo-framework:2.5
2+
3+
ARG GDRCOPY_VERSION=v2.4.1
4+
ARG EFA_INSTALLER_VERSION=1.37.0
5+
ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
6+
ARG NCCL_VERSION=v2.23.4-1
7+
8+
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
9+
10+
######################
11+
# Update and remove the IB libverbs
12+
######################
13+
RUN apt-get update -y && apt-get upgrade -y
14+
RUN apt-get remove -y --allow-change-held-packages \
15+
ibverbs-utils \
16+
libibverbs-dev \
17+
libibverbs1 \
18+
libmlx5-1
19+
20+
RUN rm -rf /opt/hpcx/ompi \
21+
&& rm -rf /usr/local/mpi \
22+
&& rm -rf /usr/local/ucx \
23+
&& ldconfig
24+
25+
# Refresh the package index in the same layer as the install so a cached
# `apt-get update` layer cannot serve stale lists; use apt-get (stable CLI).
RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
26+
apt-utils \
27+
autoconf \
28+
automake \
29+
build-essential \
30+
cmake \
31+
curl \
32+
gcc \
33+
gdb \
34+
git \
35+
kmod \
36+
libtool \
37+
openssh-client \
38+
openssh-server \
39+
vim \
40+
&& apt-get autoremove -y
41+
42+
RUN mkdir -p /var/run/sshd && \
43+
sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
44+
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
45+
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
46+
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
47+
48+
RUN rm -rf /root/.ssh/ \
49+
&& mkdir -p /root/.ssh/ \
50+
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
51+
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
52+
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
53+
54+
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
55+
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
56+
57+
#################################################
58+
## Install NVIDIA GDRCopy
59+
##
60+
## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure
61+
## that the cuda-compat-xx-x package is the latest.
62+
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
63+
&& cd /tmp/gdrcopy \
64+
&& make prefix=/opt/gdrcopy install
65+
66+
# Put the GDRCopy install (and the CUDA compat directory) on the runtime
# linker, build-time linker, include and executable search paths.
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
67+
ENV LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
68+
ENV CPATH=/opt/gdrcopy/include:$CPATH
69+
ENV PATH=/opt/gdrcopy/bin:$PATH
70+
71+
#################################################
72+
## Install EFA installer
73+
RUN cd $HOME \
74+
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
75+
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
76+
&& cd aws-efa-installer \
77+
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
78+
&& rm -rf $HOME/aws-efa-installer $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
79+
80+
81+
###################################################
82+
## Install AWS-OFI-NCCL plugin
83+
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
84+
#Switch from sh to bash to allow parameter expansion
85+
SHELL ["/bin/bash", "-c"]
86+
RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
87+
&& tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
88+
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
89+
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
90+
--with-mpi=/opt/amazon/openmpi \
91+
--with-libfabric=/opt/amazon/efa \
92+
--with-cuda=/usr/local/cuda \
93+
--enable-platform-aws \
94+
&& make -j $(nproc) \
95+
&& make install \
96+
&& cd .. \
97+
&& rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
98+
&& rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz
99+
100+
SHELL ["/bin/sh", "-c"]
101+
102+
###################################################
103+
RUN rm -rf /var/lib/apt/lists/*
104+
105+
RUN echo "hwloc_base_binding_policy = none" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf \
106+
&& echo "rmaps_base_mapping_policy = slot" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf
107+
108+
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir awscli pynvml wandb
109+
110+
RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
111+
&& echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \
112+
&& echo '/opt/amazon/openmpi/bin/mpirun.real "$@"' >> $OPEN_MPI_PATH/bin/mpirun \
113+
&& chmod a+x $OPEN_MPI_PATH/bin/mpirun
114+
115+
116+
## Set Open MPI variables to exclude network interface and conduit.
117+
ENV OMPI_MCA_pml=^cm,ucx \
118+
OMPI_MCA_btl=tcp,self \
119+
OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
120+
OPAL_PREFIX=/opt/amazon/openmpi \
121+
NCCL_SOCKET_IFNAME=^docker,lo,veth
122+
123+
## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
124+
ENV PMIX_MCA_gds=hash
125+
## BioNeMo dependencies
126+
WORKDIR /workspace/bionemo2/sub-packages/bionemo-esm2
127+
128+
RUN pip install -e .
129+
130+
WORKDIR /workspace
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
3+
docker build -t bionemo:aws .
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
2+
3+
# Remove any previous image at the exact path the import below writes to.
# -f keeps the first run (no file yet) from reporting an error.
rm -f /fsxl/awsankur/bionemo/bionemo.sqsh
4+
5+
enroot import -o /fsxl/awsankur/bionemo/bionemo.sqsh dockerd://bionemo:aws
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
3+
docker run --rm -v /fsxl/awsankur/bionemo:/root/.cache/bionemo bionemo:aws download_bionemo_data esm2/testdata_esm2_pretrain:2.0
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
2+
#SBATCH --nodes=2 # number of nodes
3+
#SBATCH --ntasks-per-node=8 # n tasks per machine (one task per gpu) <required>
4+
#SBATCH --gpus-per-node=8
5+
#SBATCH --exclusive # exclusive node access
6+
#SBATCH --output slurm-esm2-train-%j.out
7+
8+
#export FI_EFA_USE_HUGE_PAGE=0 #Uncomment if you get os.fork() memory error
9+
export FI_PROVIDER=efa
10+
export NCCL_DEBUG=INFO
11+
12+
#Path to store data and checkpoints
13+
export DATA_HOME_DIR=/fsxl/awsankur/bionemo
14+
15+
###########################
16+
###### User Variables #####
17+
###########################
18+
19+
# default variables for Enroot
20+
: "${IMAGE:=${DATA_HOME_DIR}/bionemo.sqsh}"
21+
: "${DATA_PATH:=/fsxl}"
22+
: "${FSX_MOUNT:=$DATA_PATH:$DATA_PATH}"
23+
24+
declare -a ARGS=(
25+
--container-image $IMAGE
26+
--container-mount-home
27+
--container-mounts ${DATA_HOME_DIR}:${DATA_HOME_DIR}
28+
)
29+
30+
# Locate the first directory under DATA_HOME_DIR whose name contains "untar"; the dataset paths below are resolved relative to it
31+
DATA_DIR=$(find $DATA_HOME_DIR -type d -name "*untar*" -print -quit)
32+
33+
srun -l "${ARGS[@]}" python3 /workspace/bionemo2/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/train_esm2.py \
34+
--train-cluster-path ${DATA_DIR}/2024_03_sanity/train_clusters_sanity.parquet \
35+
--train-database-path ${DATA_DIR}/2024_03_sanity/train_sanity.db \
36+
--valid-cluster-path ${DATA_DIR}/2024_03_sanity/valid_clusters.parquet \
37+
--valid-database-path ${DATA_DIR}/2024_03_sanity/validation.db \
38+
--precision="bf16-mixed" \
39+
--num-gpus 8 \
40+
--num-nodes 2 \
41+
--num-steps 100 \
42+
--val-check-interval 25 \
43+
--max-seq-length 1024 \
44+
--limit-val-batches 2 \
45+
--micro-batch-size 2 \
46+
--num-layers 33 \
47+
--hidden-size 1280 \
48+
--num-attention-heads 20 \
49+
--ffn-hidden-size 5120 \
50+
--tensor-model-parallel-size 1 \
51+
--create-tensorboard-logger \
52+
--result-dir ${DATA_HOME_DIR}

0 commit comments

Comments
 (0)