New CUDA CI+development Docker container (#1162)

BenWibking · pgrete · web-flow · commit fa9ea74bea80 · 2025-06-09T18:58:30.000Z
* update CI Docker container

Update to CUDA 12.6, Ubuntu 24.04, and Clang 19.

* update OpenMPI

* revert to OpenMPI 4.1.4

* add ADIOS2+openPMD

* add c-blosc ubuntu package

* fix Dockerfile.nvcc

* fix dockerfile

* install python headers

* install cmake from apt (required for aarch64)

* remove duplicate cmake dep

* disable ascent build

* add newer ascent version

* disable ascent build; fix openpmd build

* downgrade to CUDA 12.0

* fix ascent build path

* fix bug in build_ascent.sh

* remove unneeded patches

* ascent complains if MFEM is not built

* control cuda support for ascent with env var

* add MAKEOPTS=--output-sync=target

* add comment to Dockerfile

* Downgrade numpy

* Fix ADIOS2 and OpenPMD versions

* Directly use Ascent script with small patch

* Use Cuda12.1 container and drop to local user

* add emacs and vi

* set build_jobs=`nproc` to avoid OOM kill

* add developer tools for Codespaces/VSCode

* add devcontainer.json

* update to CUDA 12.8 and VTK-m 2.3

* update image ref

* fetch BLT

* extract BLT into correct dir

* avoid uid 1000

* build and publish Docker image based on Dockerfiles in repo

* Update CI image to be used

* Fix python version used for linting

* Bump opmd to stable release

* Add changelog

* Use updated clang for compiler check

* remove docker-publish action

* Fix C++20 build

* Attempt to fix Parthenon Ascent dep

* Try Ben's BLT build fix

* Fix Ascent build

* Include opmd in rocm image

* Use actual image

---------

Co-authored-by: Philipp Grete <pgrete@hs.uni-hamburg.de>
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,35 @@
+// devcontainer.json -- dev container definition for Parthenon (e.g., GitHub Codespaces or VSCode)
+    {
+      "name": "parthenon-dev",
+      "image": "ghcr.io/parthenon-hpc-lab/cuda12.8-mpi-hdf5-ascent", // same image ref as used by the CUDA CI workflows
+      "hostRequirements": {
+        "cpus": 4 // NOTE(review): presumably the minimum CPU count requested from the host -- confirm
+      },
+      "customizations": {
+        "vscode": {
+          "settings": {},
+          "extensions": [
+            "-ms-vscode.cpptools", // NOTE(review): leading "-" presumably opts out of this extension (clangd is listed below) -- confirm
+            "llvm-vs-code-extensions.vscode-clangd",
+            "github.vscode-pull-request-github",
+            "ms-python.python",
+            "ms-toolsai.jupyter",
+            "ms-vscode.live-server",
+            "ms-azuretools.vscode-docker",
+            "swyddfa.esbonio",
+            "tomoki1207.pdf",
+            "ms-vscode.cmake-tools",
+            "ms-vsliveshare.vsliveshare"
+        ]
+        }
+      },
+      "remoteEnv": {
+        "PATH": "${containerEnv:PATH}:/usr/local/hdf5/parallel/bin", // append the parallel HDF5 bin directory to the container PATH
+        "OMPI_MCA_opal_warn_on_missing_libcuda": "0" // NOTE(review): presumably silences Open MPI's missing-libcuda warning -- confirm
+      },
+      //"remoteUser": "ubuntu",
+      // The submodules have to be checked out manually after the container is created,
+      // but VSCode may try to configure CMake before that checkout has finished.
+      // Workaround TBD.
+      "postCreateCommand": "git submodule update --init"
+    }
diff --git a/.github/workflows/check-compilers.yml b/.github/workflows/check-compilers.yml
@@ -21,20 +21,13 @@ jobs:
     continue-on-error: true
     strategy:
       matrix:
-        cxx: ['g++', 'clang++-15']
+        cxx: ['g++', 'clang++-20']
         cmake_build_type: ['Release', 'DbgNoSym']
         device: ['cuda', 'host']
         parallel: ['serial', 'mpi']
-        exclude:
-          # Debug cuda clang build fail for the unit test.
-          # Exclude for now until we figure out what's going on.
-          # https://github.com/lanl/parthenon/issues/630
-          - cxx: clang++-15
-            device: cuda
-            cmake_build_type: DbgNoSym
     runs-on: ubuntu-latest
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda12.8-mpi-hdf5-ascent
     env:
       CMAKE_GENERATOR: Ninja
     steps:
diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
@@ -34,7 +34,7 @@ jobs:
         parallel: ['serial', 'mpi']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda12.8-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
       options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
@@ -100,7 +100,8 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
             -DPARTHENON_ENABLE_ASCENT=ON \
-            -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
+            -DCMAKE_CUDA_HOST_COMPILER=g++ \
+            -DAscent_DIR=/usr/local/ascent-checkout/lib/cmake/ascent
           cmake --build build-ascent
           cd example/advection/
           # Pick GPU with most available memory
@@ -131,7 +132,7 @@ jobs:
         parallel: ['serial', 'mpi']
     runs-on: [self-hosted, navi1030]
     container:
-      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
+      image: ghcr.io/parthenon-hpc-lab/rocm6.2-mpi-hdf5
       # Map to local user id on CI  machine to allow writing to build cache and
       # forward device handles to access AMD GPU within container
       options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
@@ -22,15 +22,15 @@ jobs:
   style:
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda12.8-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
       options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
           submodules: 'true'
       - name: cpplint
-        run: python ./tst/style/cpplint.py --counting=detailed --recursive src example tst
+        run: python3 ./tst/style/cpplint.py --counting=detailed --recursive src example tst
       - name: copyright
         run: |
           cmake -DCMAKE_CXX_FLAGS=-Werror -Bbuild-copyright-check
@@ -47,7 +47,7 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda12.8-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
       options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
@@ -79,7 +79,7 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda12.8-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
       options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
@@ -137,7 +137,7 @@ jobs:
   integration-amdgpu:
     runs-on: [self-hosted, navi1030]
     container:
-      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
+      image: ghcr.io/parthenon-hpc-lab/rocm6.2-mpi-hdf5
       # Map to local user id on CI  machine to allow writing to build cache and
       # forward device handles to access AMD GPU within container
       options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Current develop
 
 ### Added (new features/APIs/variables/...)
+- [[PR 1162]](https://github.com/parthenon-hpc-lab/parthenon/pull/1162) Add dev container (e.g., GitHub Codespaces or VSCode)
 
 
 ### Changed (changing behavior/API/variables/...)
@@ -13,6 +14,7 @@
 
 
 ### Infrastructure (changes irrelevant to downstream codes)
+- [[PR 1162]](https://github.com/parthenon-hpc-lab/parthenon/pull/1162) Update CI container to Cuda 12.8
 
 
 ### Removed (removing behavior/API/varaibles/...)
diff --git a/scripts/docker/Dockerfile.hip-rocm b/scripts/docker/Dockerfile.hip-rocm
@@ -1,7 +1,7 @@
 FROM rocm/dev-ubuntu-24.04:6.2
 
 RUN apt-get clean && apt-get update -y && \
-    DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib lcov curl cmake ninja-build openmpi-bin libopenmpi-dev && \
+    DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib lcov curl cmake ninja-build openmpi-bin libopenmpi-dev adios2-mpi-bin adios2-serial-bin libadios2-mpi-c++11-dev libadios2-mpi-core-dev libadios2-serial-core-dev libadios2-serial-c++11-dev && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 RUN cd /tmp && \
@@ -16,6 +16,20 @@ RUN cd /tmp && \
 
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
 
+# pinned commit 3a60e77 corresponds to openPMD-api release 0.16.1
+RUN mkdir /tmp/build-openpmd && cd /tmp/build-openpmd && \
+    wget https://github.com/openPMD/openPMD-api/archive/3a60e77.tar.gz && \
+    tar xzf 3a60e77.tar.gz && \
+    mkdir openPMD-api-build && cd openPMD-api-build && \
+    cmake ../openPMD-api-3a60e7714f6143c8fc7bf89809f2167d058359ee -DopenPMD_USE_PYTHON=ON -DPython_EXECUTABLE=$(which python3) -DopenPMD_USE_ADIOS2=ON && \
+    cmake --build . -j 16 && \
+    cmake --build . --target install && \
+    cd / && \
+    rm -rf /tmp/build-openpmd
+
+# Not strictly necessary (the API was already built and installed above), but the pip install makes package discovery easier
+RUN env openPMD_USE_MPI=ON python3 -m pip install openpmd-api --no-binary openpmd-api --break-system-packages
+
 # Latest image has default user with uid 1000 (which maps to the one running the container on the CI host
 # Need to add user to the group that can access the GPU
 RUN usermod -a -G render ubuntu
diff --git a/scripts/docker/Dockerfile.nvcc b/scripts/docker/Dockerfile.nvcc
@@ -1,23 +1,36 @@
-FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
+FROM nvidia/cuda:12.8.0-devel-ubuntu24.04
 
 RUN apt-get clean && apt-get update -y && \
-    DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib python3-scipy python3-pip lcov curl cuda-nsight-systems-11-6 cmake ninja-build
+    DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib python3-scipy python3-pip lcov curl cuda-nsight-systems-12-6 cmake ninja-build libpython3-dev gcc-11 g++-11 emacs nvi sphinx-doc python3-sphinx-rtd-theme python3-sphinxcontrib.bibtex python3-sphinx-copybutton && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10
 
-RUN pip3 install unyt
+RUN g++ --version
+
+RUN pip3 install unyt --break-system-packages
+
+RUN pip3 install blosc2 --break-system-packages
+
+# for Codespaces/VSCode Sphinx support
+RUN pip3 install esbonio --break-system-packages
+
+# h5py from the repo is incompatible with the default numpy 2.1.0
+# Downgrading is not the cleanest solution, but it works...
+# see https://stackoverflow.com/questions/78634235/numpy-dtype-size-changed-may-indicate-binary-incompatibility-expected-96-from
+RUN pip3 install numpy==1.26.4 --break-system-packages
 
 RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key| apt-key add - && \
-    echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" > /etc/apt/sources.list.d/llvm.list
+    echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main" > /etc/apt/sources.list.d/llvm.list
 
 RUN apt-get clean && apt-get update -y && \
-    DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends clang-15 llvm-15 libomp-15-dev && \
+    DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends clang-20 llvm-20 libomp-20-dev clangd-20 libstdc++-14-dev && \
     rm -rf /var/lib/apt/lists/*
 
-
 RUN cd /tmp && \
     wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.bz2 && \
     tar xjf openmpi-4.1.4.tar.bz2 && \
     cd openmpi-4.1.4 && \
-    ./configure --prefix=/opt/openmpi --enable-mpi-cxx --with-cuda && \
+    ./configure --prefix=/opt/openmpi --disable-mpi-fortran --disable-oshmem --with-cuda && \
     make -j16 && \
     make install && \
     cd / && \
@@ -36,19 +49,51 @@ RUN cd /tmp && \
     cd / && \
     rm -rf /tmp/hdf5-1.12.2*
 
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
+RUN mkdir /tmp/build-adios2 && cd /tmp/build-adios2 && \
+    wget https://github.com/ornladios/ADIOS2/archive/refs/tags/v2.10.1.tar.gz && \
+    tar xzf v2.10.1.tar.gz && \
+    mkdir adios2-build && cd adios2-build && \
+    cmake ../ADIOS2-2.10.1 -DADIOS2_USE_Blosc2=ON -DADIOS2_USE_Fortran=OFF && \
+    make -j 16 && make install && \
+    cd / && \
+    rm -rf /tmp/build-adios2
 
-RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.23.2/cmake-3.23.2-linux-x86_64.tar.gz -o cmake-3.23.2-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.23.2-linux-x86_64.tar.gz -C /opt
+# pinned commit 3a60e77 corresponds to openPMD-api release 0.16.1
+RUN mkdir /tmp/build-openpmd && cd /tmp/build-openpmd && \
+    wget https://github.com/openPMD/openPMD-api/archive/3a60e77.tar.gz && \
+    tar xzf 3a60e77.tar.gz && \
+    mkdir openPMD-api-build && cd openPMD-api-build && \
+    cmake ../openPMD-api-3a60e7714f6143c8fc7bf89809f2167d058359ee -DopenPMD_USE_PYTHON=ON -DPython_EXECUTABLE=$(which python3) -DopenPMD_USE_ADIOS2=ON && \
+    cmake --build . -j 16 && \
+    cmake --build . --target install && \
+    cd / && \
+    rm -rf /tmp/build-openpmd
+
+RUN mkdir /tmp/build-ascent
 
-ENV PATH=/opt/cmake-3.23.2-linux-x86_64/bin:$PATH
+COPY ascent_build.patch /tmp/build-ascent
 
-COPY build_ascent_cuda.sh /tmp/build-ascent/build_ascent_cuda.sh
+## NOTE: with enable_cuda=ON, you need a Docker VM with a LARGE amount of RAM (at least 15 GB RAM, 4 GB swap)
 
+# pinned commit 4da1379 is the state of the Ascent dev branch on 2025-04-10
 RUN cd /tmp/build-ascent && \
-    bash build_ascent_cuda.sh && \
+    wget https://github.com/Alpine-DAV/ascent/archive/4da1379.tar.gz && \
+    tar xzf 4da1379.tar.gz -C . --strip-components=1 && \
+    wget https://github.com/LLNL/blt/archive/refs/tags/v0.6.2.tar.gz && \
+    tar xzf v0.6.2.tar.gz -C ./src/blt --strip-components=1 && \
+    cd ./scripts/build_ascent && \
+    patch -p1 build_ascent.sh /tmp/build-ascent/ascent_build.patch && \
+    env enable_cuda=ON enable_mpi=ON build_hdf5=false build_silo=false bash build_ascent.sh && \
     cd / && \
     rm -rf /tmp/build-ascent
 
-# manually downgrade numpy as deprecated `typeDict` is still used by h5py
-RUN pip install numpy==1.21
+# Not strictly necessary (the API was already built and installed above), but the pip install makes package discovery easier
+RUN env openPMD_USE_MPI=ON python3 -m pip install openpmd-api --no-binary openpmd-api --break-system-packages
+
+# create a non-root "ci" user (member of the render and sudo groups) and run the container as that user
+RUN groupadd -g 109 render
+RUN useradd --create-home --shell /bin/bash -G render,sudo ci
+
+USER ci
+
+WORKDIR /home/ci
diff --git a/scripts/docker/ascent_build.patch b/scripts/docker/ascent_build.patch
@@ -0,0 +1,50 @@
+--- build_ascent.sh	2024-08-29 21:00:24.000000000 +0000
++++ build_ascent_parthenon.sh	2024-08-30 09:55:58.976365723 +0000
+@@ -21,6 +21,8 @@
+ # Build Options
+ ##############################################################################
+ 
++export MAKEFLAGS="--output-sync=target"
++
+ # shared options
+ enable_cuda="${enable_cuda:=OFF}"
+ enable_hip="${enable_hip:=OFF}"
+ 
+@@ -126,8 +128,8 @@
+ root_dir=$(ospath ${root_dir})
+ root_dir=$(abs_path ${root_dir})
+ script_dir=$(abs_path "$(dirname "${BASH_SOURCE[0]}")")
+-build_dir=$(ospath ${root_dir}/build)
+-source_dir=$(ospath ${root_dir}/source)
++build_dir=$(ospath build)
++source_dir=$(ospath source)
+ 
+ 
+ # root_dir is where we will build and install
+@@ -140,7 +142,7 @@
+ 
+ # install_dir is where we will install
+ # override with `prefix` env var
+-install_dir="${install_dir:=$root_dir/install}"
++install_dir=/usr/local
+ 
+ echo "*** prefix:       ${root_dir}" 
+ echo "*** build root:   ${build_dir}"
+@@ -231,7 +233,7 @@
+ hdf5_short_version=1.14
+ hdf5_src_dir=$(ospath ${source_dir}/hdf5-${hdf5_version})
+ hdf5_build_dir=$(ospath ${build_dir}/hdf5-${hdf5_version}/)
+-hdf5_install_dir=$(ospath ${install_dir}/hdf5-${hdf5_version}/)
++hdf5_install_dir=/usr/local/hdf5/parallel
+ hdf5_tarball=$(ospath ${source_dir}/hdf5-${hdf5_version}.tar.gz)
+ 
+ # build only if install doesn't exist
+@@ -650,7 +650,7 @@ fi # if enable_hip || enable_sycl
+ ################
+ # VTK-m
+ ################
+-vtkm_version=v2.2.0
++vtkm_version=v2.3.0
+ vtkm_src_dir=$(ospath ${source_dir}/vtk-m-${vtkm_version})
+ vtkm_build_dir=$(ospath ${build_dir}/vtk-m-${vtkm_version})
+ vtkm_install_dir=$(ospath ${install_dir}/vtk-m-${vtkm_version}/)
diff --git a/scripts/docker/build_ascent_cuda.sh b/scripts/docker/build_ascent_cuda.sh