diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml
index 3d742289a..23611d6a2 100644
--- a/.ci/jenkins/lib/test-matrix.yaml
+++ b/.ci/jenkins/lib/test-matrix.yaml
@@ -38,6 +38,62 @@ env:
   UCX_VERSION: v1.19.x
 
 steps:
+  - name: Pre-flight Check
+    parallel: false
+    run: |
+      # Check for hanging Docker containers
+      HANGING_CONTAINERS=$(docker ps -q --filter "name=nixl-ci-test")
+      if [ -n "$HANGING_CONTAINERS" ]; then
+        echo "ERROR: Found hanging Docker containers:"
+        docker ps --filter "name=nixl-ci-test"
+        exit 1
+      fi
+      echo "Pre-flight check: No hanging containers found"
+
+      # Initial workspace state
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=preflight"
+
+  - name: File Count Debug
+    parallel: false
+    run: |
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=initial"
+
+      # Fail if required files are missing
+      if [ "$GITLAB_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab directory is missing!"
+        ls -la "${WORKSPACE}"
+        exit 1
+      fi
+      if [ "$BUILD_SH_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab/build.sh is missing!"
+        ls -la "${WORKSPACE}/.gitlab"
+        exit 1
+      fi
+
+  - name: Stash Marker Debug
+    parallel: false
+    run: |
+      STASH_MARKER=$(test -e "${WORKSPACE}/scm-repo.tar" && echo 1 || echo 0)
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      echo "DEBUG_FILES: count=$FILE_COUNT stash_marker=$STASH_MARKER stage=after_initial"
+
+      # Stash marker status explanation:
+      # stash_marker=0: Normal workspace (no tarball) - files extracted properly
+      # stash_marker=1: Tarball present - possible unstash failure, workspace not extracted
+      if [ "$STASH_MARKER" -eq 1 ]; then
+        echo "WARNING: scm-repo.tar found - possible Jenkins unstash failure"
+        echo "Workspace contents:"
+        ls -la "${WORKSPACE}"
+      else
+        echo "INFO: No stash tarball found - workspace appears normal"
+      fi
+
   - name: Get Environment Info
     parallel: false
     run: |
@@ -61,16 +117,83 @@ steps:
       ibv_devinfo
       #ib_write_bw
 
+  - name: Start File Monitoring
+    parallel: false
+    run: |
+      # Start background file monitoring for deletions (after workspace is stable)
+      if command -v inotifywait >/dev/null 2>&1; then
+        nohup inotifywait -m -r -e delete,delete_self,moved_from \
+          --format '%T DELETED: %w%f (event: %e)' --timefmt '%Y-%m-%d %H:%M:%S' \
+          "${WORKSPACE}/" > /tmp/file-monitor-${BUILD_ID}-${axis_index}.log 2>&1 &
+        echo $! > /tmp/monitor-pid-${BUILD_ID}-${axis_index}
+        echo "File monitoring started, PID: $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index})"
+      else
+        echo "inotifywait not available - skipping file monitoring"
+        touch /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
+      fi
+
+  - name: Post Environment Info Debug
+    parallel: false
+    run: |
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_env"
+
+      # Fail if required files are missing
+      if [ "$GITLAB_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab directory disappeared after environment info!"
+        exit 1
+      fi
+      if [ "$BUILD_SH_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab/build.sh disappeared after environment info!"
+        exit 1
+      fi
 
   - name: Build GPU Test Environment
     parallel: false
     run: |
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_build"
+
       docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} .
-    onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_build"
+
+      # Fail if required files are missing
+      if [ "$GITLAB_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab directory disappeared after Docker build!"
+        exit 1
+      fi
+      if [ "$BUILD_SH_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab/build.sh disappeared after Docker build!"
+        exit 1
+      fi
+    onfail: |
+      # Show what files were deleted
+      if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
+        echo "=== FILES DELETED DURING BUILD ==="
+        cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
+        echo "================================="
+      fi
+      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      # Comprehensive Docker cleanup
+      chmod +x .ci/scripts/docker-cleanup.sh
+      .ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
 
   - name: Run GPU Test Environment
     parallel: false
     run: |
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_run"
+
       docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
         --ulimit memlock=-1:-1 \
         --network=host \
@@ -82,33 +205,110 @@ steps:
         -v ${WORKSPACE}:${WORKSPACE} \
         -w ${WORKSPACE} \
         "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_run"
+
+      # Fail if required files are missing
+      if [ "$GITLAB_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab directory disappeared after Docker run!"
+        exit 1
+      fi
+      if [ "$BUILD_SH_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab/build.sh disappeared after Docker run!"
+        exit 1
+      fi
     onfail: |
+      # Show what files were deleted
+      if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
+        echo "=== FILES DELETED DURING BUILD ==="
+        cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
+        echo "================================="
+      fi
       docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
       docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      # Comprehensive Docker cleanup
+      chmod +x .ci/scripts/docker-cleanup.sh
+      .ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
 
   - name: Build
     parallel: false
     run: |
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+      BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_build"
+
+      # Final check before attempting build
+      if [ "$GITLAB_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab directory is missing before build!"
+        exit 1
+      fi
+      if [ "$BUILD_SH_EXISTS" = "NO" ]; then
+        echo "ERROR: .gitlab/build.sh is missing before build!"
+        exit 1
+      fi
+
       docker exec -w ${WORKSPACE} -e UCX_VERSION=${UCX_VERSION} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/build.sh ${INSTALL_DIR}"
-    onfail: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
 
-  - name: Test CPP
-    parallel: false
-    run: |
-      docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
-    onfail: |
+      FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+      echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$(test -d "${WORKSPACE}/.gitlab" && echo YES || echo NO) build_sh=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo YES || echo NO) stage=post_build"
+    always: |
+      # Show what files were deleted
+      if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
+        echo "=== FILES DELETED DURING BUILD ==="
+        cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
+        echo "================================="
+      fi
+      # Stop monitoring
+      if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
+        kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
+        rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
+      fi
       docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
       docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      # Comprehensive Docker cleanup (on success or failure)
+      chmod +x .ci/scripts/docker-cleanup.sh
+      .ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
 
-  - name: Test Python
+  - name: Show File Monitoring Results
     parallel: false
     run: |
-      docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
-    always: |
-      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
-      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+      echo "=== COMPLETE FILE MONITORING LOG ==="
+      if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
+        echo "Log file exists, size: $(wc -l < /tmp/file-monitor-${BUILD_ID}-${axis_index}.log) lines"
+        if [ -s /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
+          echo "File deletions detected:"
+          cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
+        else
+          echo "No file deletions detected during build"
+        fi
+      else
+        echo "Monitoring log file not found"
+      fi
+      echo "==================================="
+
+      # Final cleanup
+      rm -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
+      rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
+
+# Tests commented out - problem occurs before tests
+#  - name: Test CPP
+#    parallel: false
+#    run: |
+#      docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
+#    onfail: |
+#      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+#      docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
+
+#  - name: Test Python
+#    parallel: false
+#    run: |
+#      docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
+#    always: |
+#      docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
 
 # once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
 # https://github.com/Mellanox/ci-demo/pull/111
diff --git a/.ci/scripts/docker-cleanup.sh b/.ci/scripts/docker-cleanup.sh
new file mode 100755
index 000000000..bec999874
--- /dev/null
+++ b/.ci/scripts/docker-cleanup.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+#
+# docker-cleanup.sh - detect and remove Docker residue left behind by CI runs.
+#
+# Usage: docker-cleanup.sh [BUILD_ID] [AXIS_INDEX]
+#
+# Both arguments are only echoed into the log header so the output can be
+# matched to a build/axis; they do not narrow down what is removed.
+#
+# Phases: detection (list and count residue), cleanup (remove what was
+# detected) and verification (print what is left). Cleanup is best-effort:
+# a failing removal is logged as a WARNING and the script carries on.
+set -euo pipefail
+
+BUILD_ID="${1:-unknown}"
+AXIS_INDEX="${2:-unknown}"
+SCRIPT_NAME="docker-cleanup.sh"
+
+echo "=== $SCRIPT_NAME: Starting comprehensive Docker cleanup ==="
+echo "Build ID: $BUILD_ID, Axis: $AXIS_INDEX"
+echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
+
+# Print the number of non-empty lines in $1 as a single integer.
+# grep -c prints "0" itself when nothing matches but exits 1; that status is
+# swallowed here so the caller gets exactly one number and `set -e` with
+# `pipefail` does not abort the script.
+count_lines() {
+    grep -c . <<<"$1" || true
+}
+
+# Report one detection result.
+#   $1 - category label
+#   $2 - number of items found
+#   $3 - listing of the items, printed only when the count is positive
+report_finding() {
+    local category="$1"
+    local count="$2"
+    local details="$3"
+
+    if [ "$count" -gt 0 ]; then
+        echo "FOUND: $category - $count items"
+        echo "$details"
+        echo "---"
+    else
+        echo "CLEAN: $category - no residues found"
+    fi
+}
+
+# Run one cleanup command without aborting the script when it fails.
+#   $1 - command line; run through eval so it may contain a pipeline
+#   $2 - human-readable description used in the log lines
+safe_cleanup() {
+    local cmd="$1"
+    local desc="$2"
+    local rc=0
+
+    echo "Executing: $desc"
+    # stderr stays visible so the reason for a failure lands in the CI log
+    eval "$cmd" || rc=$?
+    if [ "$rc" -eq 0 ]; then
+        echo "SUCCESS: $desc"
+    else
+        echo "WARNING: Failed to execute: $desc (exit code: $rc)"
+    fi
+}
+
+echo
+echo "=== PHASE 1: DETECTION ==="
+
+# 1. Check for nixl-ci-test containers (all states)
+echo "1. Checking for nixl-ci-test containers..."
+CONTAINERS=$(docker ps -a --filter "name=nixl-ci-test" --format "{{.ID}} {{.Names}} {{.Status}}" 2>/dev/null || echo "")
+CONTAINER_COUNT=$(count_lines "$CONTAINERS")
+report_finding "nixl-ci-test containers" "$CONTAINER_COUNT" "$CONTAINERS"
+
+# 2. Check for nixl-ci-test images
+echo "2. Checking for nixl-ci-test images..."
+IMAGES=$(docker images --filter "reference=nixl-ci-test*" --format "{{.Repository}}:{{.Tag}} {{.ID}} {{.Size}}" 2>/dev/null || echo "")
+IMAGE_COUNT=$(count_lines "$IMAGES")
+report_finding "nixl-ci-test images" "$IMAGE_COUNT" "$IMAGES"
+
+# 3. Check for dangling images
+echo "3. Checking for dangling images..."
+DANGLING=$(docker images -f "dangling=true" --format "{{.ID}} {{.CreatedAt}}" 2>/dev/null || echo "")
+DANGLING_COUNT=$(count_lines "$DANGLING")
+report_finding "dangling images" "$DANGLING_COUNT" "$DANGLING"
+
+# 4. Check for orphaned volumes
+echo "4. Checking for orphaned volumes..."
+VOLUMES=$(docker volume ls -f "dangling=true" --format "{{.Name}} {{.Driver}}" 2>/dev/null || echo "")
+VOLUME_COUNT=$(count_lines "$VOLUMES")
+report_finding "orphaned volumes" "$VOLUME_COUNT" "$VOLUMES"
+
+# 5. Check for dead containers
+echo "5. Checking for dead containers..."
+DEAD_CONTAINERS=$(docker ps -a --filter "status=dead" --format "{{.ID}} {{.Names}} {{.CreatedAt}}" 2>/dev/null || echo "")
+DEAD_COUNT=$(count_lines "$DEAD_CONTAINERS")
+report_finding "dead containers" "$DEAD_COUNT" "$DEAD_CONTAINERS"
+
+# 6. Check for exited containers (potential zombies)
+echo "6. Checking for old exited containers..."
+EXITED_CONTAINERS=$(docker ps -a --filter "status=exited" --filter "name=nixl-ci-test" --format "{{.ID}} {{.Names}} {{.Status}}" 2>/dev/null || echo "")
+EXITED_COUNT=$(count_lines "$EXITED_CONTAINERS")
+report_finding "exited nixl-ci-test containers" "$EXITED_COUNT" "$EXITED_CONTAINERS"
+
+# 7. Check Docker daemon health
+echo "7. Checking Docker daemon status..."
+DOCKER_INFO=$(docker info --format "{{.Containers}} containers, {{.Images}} images" 2>/dev/null || echo "DOCKER_DAEMON_ERROR")
+echo "Docker status: $DOCKER_INFO"
+
+# 8. Check for build cache
+echo "8. Checking Docker build cache..."
+BUILD_CACHE=$(docker system df --format "{{.Type}}: {{.Size}}" 2>/dev/null | grep -i cache || echo "")
+BUILD_CACHE_COUNT=$(count_lines "$BUILD_CACHE")
+report_finding "build cache" "$BUILD_CACHE_COUNT" "$BUILD_CACHE"
+
+echo
+echo "=== PHASE 2: CLEANUP ==="
+
+# Cleanup Phase 1: Specific nixl-ci-test resources
+if [ "$CONTAINER_COUNT" -gt 0 ]; then
+    echo "Cleaning up nixl-ci-test containers..."
+    safe_cleanup "docker ps -aq --filter 'name=nixl-ci-test' | xargs -r docker rm -f" "Remove all nixl-ci-test containers"
+fi
+
+if [ "$IMAGE_COUNT" -gt 0 ]; then
+    echo "Cleaning up nixl-ci-test images..."
+    safe_cleanup "docker images -q --filter 'reference=nixl-ci-test*' | xargs -r docker rmi -f" "Remove all nixl-ci-test images"
+fi
+
+# Cleanup Phase 2: General Docker cleanup
+# Only containers in the "dead" state are removed; `docker container prune`
+# would delete every stopped container on the host.
+if [ "$DEAD_COUNT" -gt 0 ]; then
+    echo "Cleaning up dead containers..."
+    safe_cleanup "docker ps -aq --filter 'status=dead' | xargs -r docker rm -f" "Remove dead containers"
+fi
+
+if [ "$DANGLING_COUNT" -gt 0 ]; then
+    echo "Cleaning up dangling images..."
+    safe_cleanup "docker image prune -f" "Remove dangling images"
+fi
+
+if [ "$VOLUME_COUNT" -gt 0 ]; then
+    echo "Cleaning up orphaned volumes..."
+    safe_cleanup "docker volume prune -f" "Remove orphaned volumes"
+fi
+
+# Cleanup Phase 3: Network cleanup (if needed)
+echo "Cleaning up unused networks..."
+safe_cleanup "docker network prune -f" "Remove unused networks"
+
+# Final verification
+echo
+echo "=== PHASE 3: VERIFICATION ==="
+echo "Final container count:"
+docker ps -a --filter "name=nixl-ci-test" --format "table {{.Names}}\t{{.Status}}" 2>/dev/null || echo "No containers found"
+
+echo "Final image count:"
+docker images --filter "reference=nixl-ci-test*" --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}" 2>/dev/null || echo "No images found"
+
+# System summary
+echo
+echo "=== SYSTEM SUMMARY ==="
+docker system df 2>/dev/null || echo "Could not get system usage"
+
+echo
+echo "=== $SCRIPT_NAME: Cleanup completed at $(date '+%Y-%m-%d %H:%M:%S') ==="