Skip to content

Commit d5ade59

Browse files
committed
CI/TEST/DNR: Debug prints
1 parent 67a12b2 commit d5ade59

File tree

1 file changed

+225
-34
lines changed

1 file changed

+225
-34
lines changed

.ci/jenkins/lib/test-matrix.yaml

Lines changed: 225 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -38,39 +38,160 @@ env:
3838
UCX_VERSION: v1.19.x
3939

4040
steps:
41-
- name: Get Environment Info
41+
- name: Pre-flight Check
4242
parallel: false
4343
run: |
44-
set +ex
45-
# print kernel version
46-
uname -r
47-
# print ofed info
48-
ofed_info -s
49-
# print nvidia drivers info
50-
lsmod | grep nvidia_peermem
51-
lsmod | grep gdrdrv
52-
lsmod | grep nvidia_fs
53-
# print nvidia-smi
54-
nvidia-smi
55-
nvidia-smi topo -m
56-
# print MPS info
57-
pgrep -a mps
58-
# print compute mode
59-
nvidia-smi -q | grep -i "compute mode"
60-
# check rdma status
61-
ibv_devinfo
62-
#ib_write_bw
44+
# Check for hanging Docker containers
45+
HANGING_CONTAINERS=$(docker ps -q --filter "name=nixl-ci-test")
46+
if [ -n "$HANGING_CONTAINERS" ]; then
47+
echo "ERROR: Found hanging Docker containers:"
48+
docker ps --filter "name=nixl-ci-test"
49+
exit 1
50+
fi
51+
echo "Pre-flight check: No hanging containers found"
6352
53+
# Initial workspace state
54+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
55+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
56+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
57+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=preflight"
58+
59+
- name: Start File Monitoring
60+
parallel: false
61+
run: |
62+
# Start background file monitoring for deletions
63+
if command -v inotifywait >/dev/null 2>&1; then
64+
nohup inotifywait -m -r -e delete,delete_self,moved_from \
65+
--format '%T DELETED: %w%f (event: %e)' --timefmt '%Y-%m-%d %H:%M:%S' \
66+
${WORKSPACE}/ > /tmp/file-monitor-${BUILD_ID}-${axis_index}.log 2>&1 &
67+
echo $! > /tmp/monitor-pid-${BUILD_ID}-${axis_index}
68+
echo "File monitoring started, PID: $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index})"
69+
else
70+
echo "inotifywait not available - skipping file monitoring"
71+
touch /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
72+
fi
73+
74+
- name: File Count Debug
75+
parallel: false
76+
run: |
77+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
78+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
79+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
80+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=initial"
81+
82+
# Fail if required files are missing
83+
if [ "$GITLAB_EXISTS" = "NO" ]; then
84+
echo "ERROR: .gitlab directory is missing!"
85+
ls -la ${WORKSPACE}
86+
exit 1
87+
fi
88+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
89+
echo "ERROR: .gitlab/build.sh is missing!"
90+
ls -la ${WORKSPACE}/.gitlab
91+
exit 1
92+
fi
93+
94+
- name: Stash Marker Debug
95+
parallel: false
96+
run: |
97+
# Set STASH_MARKER to exactly 1 when the stash tarball exists in the workspace, else 0.
# The previous `ls | grep -c ... || echo 0` produced "0<newline>0" when nothing matched
# (grep -c prints 0 AND exits non-zero), which broke the numeric -eq test further down.
if [ -e "${WORKSPACE}/scm-repo.tar" ]; then STASH_MARKER=1; else STASH_MARKER=0; fi
98+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
99+
echo "DEBUG_FILES: count=$FILE_COUNT stash_marker=$STASH_MARKER stage=after_initial"
100+
101+
# Stash marker status explanation:
102+
# stash_marker=0: Normal workspace (no tarball) - files extracted properly
103+
# stash_marker=1: Tarball present - possible unstash failure, workspace not extracted
104+
if [ "$STASH_MARKER" -eq 1 ]; then
105+
echo "WARNING: scm-repo.tar found - possible Jenkins unstash failure"
106+
echo "Workspace contents:"
107+
ls -la ${WORKSPACE}
108+
else
109+
echo "INFO: No stash tarball found - workspace appears normal"
110+
fi
111+
112+
# Environment info stage disabled - suspected of causing file disappearance
113+
# - name: Get Environment Info
114+
# parallel: false
115+
# run: |
116+
# set +ex
117+
# # print kernel version
118+
# uname -r
119+
# # print ofed info
120+
# ofed_info -s
121+
# # print nvidia drivers info
122+
# lsmod | grep nvidia_peermem
123+
# lsmod | grep gdrdrv
124+
# lsmod | grep nvidia_fs
125+
# # print nvidia-smi
126+
# nvidia-smi
127+
# nvidia-smi topo -m
128+
# # print MPS info
129+
# pgrep -a mps
130+
# # print compute mode
131+
# nvidia-smi -q | grep -i "compute mode"
132+
# # check rdma status
133+
# ibv_devinfo
134+
# #ib_write_bw
135+
136+
# - name: Post Environment Info Debug
137+
# parallel: false
138+
# run: |
139+
# FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
140+
# GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
141+
# BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
142+
# echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_env"
143+
#
144+
# # Fail if required files are missing
145+
# if [ "$GITLAB_EXISTS" = "NO" ]; then
146+
# echo "ERROR: .gitlab directory disappeared after environment info!"
147+
# exit 1
148+
# fi
149+
# if [ "$BUILD_SH_EXISTS" = "NO" ]; then
150+
# echo "ERROR: .gitlab/build.sh disappeared after environment info!"
151+
# exit 1
152+
# fi
64153

65154
- name: Build GPU Test Environment
66155
parallel: false
67156
run: |
157+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
158+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
159+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
160+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_build"
161+
68162
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} .
69-
onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
163+
164+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
165+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
166+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
167+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_build"
168+
169+
# Fail if required files are missing
170+
if [ "$GITLAB_EXISTS" = "NO" ]; then
171+
echo "ERROR: .gitlab directory disappeared after Docker build!"
172+
exit 1
173+
fi
174+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
175+
echo "ERROR: .gitlab/build.sh disappeared after Docker build!"
176+
exit 1
177+
fi
178+
onfail: |
179+
# Show what files were deleted
180+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
181+
echo "=== FILES DELETED DURING BUILD ==="
182+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
183+
echo "================================="
184+
fi
185+
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
70186
71187
- name: Run GPU Test Environment
72188
parallel: false
73189
run: |
190+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
191+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
192+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
193+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_run"
194+
74195
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
75196
--ulimit memlock=-1:-1 \
76197
--network=host \
@@ -82,33 +203,103 @@ steps:
82203
-v ${WORKSPACE}:${WORKSPACE} \
83204
-w ${WORKSPACE} \
84205
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
206+
207+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
208+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
209+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
210+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_run"
211+
212+
# Fail if required files are missing
213+
if [ "$GITLAB_EXISTS" = "NO" ]; then
214+
echo "ERROR: .gitlab directory disappeared after Docker run!"
215+
exit 1
216+
fi
217+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
218+
echo "ERROR: .gitlab/build.sh disappeared after Docker run!"
219+
exit 1
220+
fi
85221
onfail: |
222+
# Show what files were deleted
223+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
224+
echo "=== FILES DELETED DURING BUILD ==="
225+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
226+
echo "================================="
227+
fi
86228
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
87229
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
88230
89231
- name: Build
90232
parallel: false
91233
run: |
234+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
235+
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
236+
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
237+
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_build"
238+
239+
# Final check before attempting build
240+
if [ "$GITLAB_EXISTS" = "NO" ]; then
241+
echo "ERROR: .gitlab directory is missing before build!"
242+
exit 1
243+
fi
244+
if [ "$BUILD_SH_EXISTS" = "NO" ]; then
245+
echo "ERROR: .gitlab/build.sh is missing before build!"
246+
exit 1
247+
fi
248+
92249
docker exec -w ${WORKSPACE} -e UCX_VERSION=${UCX_VERSION} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/build.sh ${INSTALL_DIR}"
93-
onfail: |
250+
251+
FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
252+
# Re-check the required paths after the build; a hard-coded YES here would hide
# exactly the file disappearance this debug output is meant to expose.
GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_build"
253+
always: |
254+
# Show what files were deleted
255+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
256+
echo "=== FILES DELETED DURING BUILD ==="
257+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
258+
echo "================================="
259+
fi
260+
# Stop monitoring
261+
if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
262+
kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
263+
rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
264+
fi
94265
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
95266
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
96267
97-
- name: Test CPP
268+
- name: Show File Monitoring Results
98269
parallel: false
99270
run: |
100-
docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
101-
onfail: |
102-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
103-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
271+
echo "=== COMPLETE FILE MONITORING LOG ==="
272+
if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
273+
echo "Log file exists, size: $(wc -l < /tmp/file-monitor-${BUILD_ID}-${axis_index}.log) lines"
274+
if [ -s /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
275+
echo "File deletions detected:"
276+
cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
277+
else
278+
echo "No file deletions detected during build"
279+
fi
280+
else
281+
echo "Monitoring log file not found"
282+
fi
283+
echo "==================================="
104284
105-
- name: Test Python
106-
parallel: false
107-
run: |
108-
docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
109-
always: |
110-
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
111-
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
285+
# Final cleanup
286+
rm -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
287+
rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
288+
289+
# - name: Test CPP
290+
# parallel: false
291+
# run: |
292+
# docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
293+
# onfail: |
294+
# docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
295+
# docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
296+
#
297+
# - name: Test Python
298+
# parallel: false
299+
# run: |
300+
# docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
301+
# always: |
302+
# docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
112303

113304
# once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
114305
# https://github.com/Mellanox/ci-demo/pull/111

0 commit comments

Comments
 (0)