Skip to content

Commit db8f70e

Browse files
committed
CI/TEST/DNR: Debug prints
1 parent 67a12b2 commit db8f70e

File tree

2 files changed

+356
-14
lines changed

2 files changed

+356
-14
lines changed

.ci/jenkins/lib/test-matrix.yaml

Lines changed: 214 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,62 @@ env:
3838
UCX_VERSION: v1.19.x
3939

4040
steps:
41+
- name: Pre-flight Check
42+
parallel: false
43+
run: |
44+
# Check for hanging Docker containers
45+
HANGING_CONTAINERS=$(docker ps -q --filter "name=nixl-ci-test")
46+
if [ -n "$HANGING_CONTAINERS" ]; then
47+
echo "ERROR: Found hanging Docker containers:"
48+
docker ps --filter "name=nixl-ci-test"
49+
exit 1
50+
fi
51+
echo "Pre-flight check: No hanging containers found"
52+
53+
# Initial workspace state
54+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
55+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
56+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
57+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=preflight"
58+
59+
- name: File Count Debug
60+
parallel: false
61+
run: |
62+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
63+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
64+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
65+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=initial"
66+
67+
# Fail if required files are missing
68+
if [ "$GITLAB_EXISTS" = "NO" ]; then
69+
echo "ERROR: .gitlab directory is missing!"
70+
ls -la ${WORKSPACE}
71+
exit 1
72+
fi
73+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
74+
echo "ERROR: .gitlab/build.sh is missing!"
75+
ls -la ${WORKSPACE}/.gitlab
76+
exit 1
77+
fi
78+
79+
- name: Stash Marker Debug
80+
parallel: false
81+
run: |
82+
# Count workspace entries named exactly scm-repo.tar (0 or 1).
# grep -c prints "0" itself when nothing matches but exits 1, so the fallback
# must not print anything: "|| echo 0" would yield a two-line value and break
# the numeric comparison below. "|| true" only neutralises the exit status.
STASH_MARKER=$(ls -1 "${WORKSPACE}" | grep -c '^scm-repo\.tar$' || true)
83+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
84+
echo "DEBUG_FILES: count=$FILE_COUNT stash_marker=$STASH_MARKER stage=after_initial"
85+
86+
# Stash marker status explanation:
87+
# stash_marker=0: Normal workspace (no tarball) - files extracted properly
88+
# stash_marker=1: Tarball present - possible unstash failure, workspace not extracted
89+
if [ "$STASH_MARKER" -eq 1 ]; then
90+
echo "WARNING: scm-repo.tar found - possible Jenkins unstash failure"
91+
echo "Workspace contents:"
92+
ls -la ${WORKSPACE}
93+
else
94+
echo "INFO: No stash tarball found - workspace appears normal"
95+
fi
96+
4197
- name: Get Environment Info
4298
parallel: false
4399
run: |
@@ -61,16 +117,83 @@ steps:
61117
ibv_devinfo
62118
#ib_write_bw
63119
120+
- name: Start File Monitoring
121+
parallel: false
122+
run: |
123+
# Start background file monitoring for deletions (after workspace is stable)
124+
if command -v inotifywait >/dev/null 2>&1; then
125+
nohup inotifywait -m -r -e delete,delete_self,moved_from \
126+
--format '%T DELETED: %w%f (event: %e)' --timefmt '%Y-%m-%d %H:%M:%S' \
127+
${WORKSPACE}/ > /tmp/file-monitor-${BUILD_ID}-${axis_index}.log 2>&1 &
128+
echo $! > /tmp/monitor-pid-${BUILD_ID}-${axis_index}
129+
echo "File monitoring started, PID: $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index})"
130+
else
131+
echo "inotifywait not available - skipping file monitoring"
132+
touch /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
133+
fi
134+
135+
- name: Post Environment Info Debug
136+
parallel: false
137+
run: |
138+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
139+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
140+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
141+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_env"
142+
143+
# Fail if required files are missing
144+
if [ "$GITLAB_EXISTS" = "NO" ]; then
145+
echo "ERROR: .gitlab directory disappeared after environment info!"
146+
exit 1
147+
fi
148+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
149+
echo "ERROR: .gitlab/build.sh disappeared after environment info!"
150+
exit 1
151+
fi
64152
65153
- name: Build GPU Test Environment
66154
parallel: false
67155
run: |
156+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
157+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
158+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
159+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_build"
160+
68161
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} .
69-
onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
162+
163+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
164+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
165+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
166+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_build"
167+
168+
# Fail if required files are missing
169+
if [ "$GITLAB_EXISTS" = "NO" ]; then
170+
echo "ERROR: .gitlab directory disappeared after Docker build!"
171+
exit 1
172+
fi
173+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
174+
echo "ERROR: .gitlab/build.sh disappeared after Docker build!"
175+
exit 1
176+
fi
177+
onfail: |
178+
# Show what files were deleted
179+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
180+
echo "=== FILES DELETED DURING BUILD ==="
181+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
182+
echo "================================="
183+
fi
184+
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
185+
# Comprehensive Docker cleanup
186+
chmod +x .ci/scripts/docker-cleanup.sh
187+
.ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
70188
71189
- name: Run GPU Test Environment
72190
parallel: false
73191
run: |
192+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
193+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
194+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
195+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_run"
196+
74197
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
75198
--ulimit memlock=-1:-1 \
76199
--network=host \
@@ -82,33 +205,110 @@ steps:
82205
-v ${WORKSPACE}:${WORKSPACE} \
83206
-w ${WORKSPACE} \
84207
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
208+
209+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
210+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
211+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
212+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_run"
213+
214+
# Fail if required files are missing
215+
if [ "$GITLAB_EXISTS" = "NO" ]; then
216+
echo "ERROR: .gitlab directory disappeared after Docker run!"
217+
exit 1
218+
fi
219+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
220+
echo "ERROR: .gitlab/build.sh disappeared after Docker run!"
221+
exit 1
222+
fi
85223
onfail: |
224+
# Show what files were deleted
225+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
226+
echo "=== FILES DELETED DURING BUILD ==="
227+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
228+
echo "================================="
229+
fi
86230
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
87231
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
232+
# Comprehensive Docker cleanup
233+
chmod +x .ci/scripts/docker-cleanup.sh
234+
.ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
88235
89236
- name: Build
90237
parallel: false
91238
run: |
239+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
240+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
241+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
242+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_build"
243+
244+
# Final check before attempting build
245+
if [ "$GITLAB_EXISTS" = "NO" ]; then
246+
echo "ERROR: .gitlab directory is missing before build!"
247+
exit 1
248+
fi
249+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
250+
echo "ERROR: .gitlab/build.sh is missing before build!"
251+
exit 1
252+
fi
253+
92254
docker exec -w ${WORKSPACE} -e UCX_VERSION=${UCX_VERSION} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/build.sh ${INSTALL_DIR}"
93-
onfail: |
94-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
95-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
96255
97-
- name: Test CPP
98-
parallel: false
99-
run: |
100-
docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
101-
onfail: |
256+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
257+
# Re-probe the workspace after the build instead of reporting a fixed "YES":
# the purpose of this trace is to detect files disappearing during the build.
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_build"
258+
always: |
259+
# Show what files were deleted
260+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
261+
echo "=== FILES DELETED DURING BUILD ==="
262+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
263+
echo "================================="
264+
fi
265+
# Stop monitoring
266+
if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
267+
kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
268+
rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
269+
fi
102270
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
103271
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
272+
# Comprehensive Docker cleanup (on success or failure)
273+
chmod +x .ci/scripts/docker-cleanup.sh
274+
.ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
104275
105-
- name: Test Python
276+
- name: Show File Monitoring Results
106277
parallel: false
107278
run: |
108-
docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
109-
always: |
110-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
111-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
279+
echo "=== COMPLETE FILE MONITORING LOG ==="
280+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
281+
echo "Log file exists, size: $(wc -l < /tmp/file-monitor-${BUILD_ID}-${axis_index}.log) lines"
282+
if [ -s /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
283+
echo "File deletions detected:"
284+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
285+
else
286+
echo "No file deletions detected during build"
287+
fi
288+
else
289+
echo "Monitoring log file not found"
290+
fi
291+
echo "==================================="
292+
293+
# Final cleanup
294+
rm -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
295+
rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
296+
297+
# Tests commented out - problem occurs before tests
298+
# - name: Test CPP
299+
# parallel: false
300+
# run: |
301+
# docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
302+
# onfail: |
303+
# docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
304+
# docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
305+
306+
# - name: Test Python
307+
# parallel: false
308+
# run: |
309+
# docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
310+
# always: |
311+
# docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
112312

113313
# once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
114314
# https://github.com/Mellanox/ci-demo/pull/111

0 commit comments

Comments
 (0)