@@ -38,39 +38,160 @@ env:
38
38
UCX_VERSION : v1.19.x
39
39
40
40
steps :
41
- - name : Get Environment Info
41
+ - name : Pre-flight Check
42
42
parallel : false
43
43
run : |
44
- set +ex
45
- # print kernel version
46
- uname -r
47
- # print ofed info
48
- ofed_info -s
49
- # print nvidia drivers info
50
- lsmod | grep nvidia_peermem
51
- lsmod | grep gdrdrv
52
- lsmod | grep nvidia_fs
53
- # print nvidia-smi
54
- nvidia-smi
55
- nvidia-smi topo -m
56
- # print MPS info
57
- pgrep -a mps
58
- # print compute mode
59
- nvidia-smi -q | grep -i "compute mode"
60
- # check rdma status
61
- ibv_devinfo
62
- #ib_write_bw
44
+ # Check for hanging Docker containers: list the IDs of running containers whose name matches "nixl-ci-test"; if any are found, print them and abort the step with exit 1
45
+ HANGING_CONTAINERS=$(docker ps -q --filter "name=nixl-ci-test")
46
+ if [ -n "$HANGING_CONTAINERS" ]; then
47
+ echo "ERROR: Found hanging Docker containers:"
48
+ docker ps --filter "name=nixl-ci-test"
49
+ exit 1
50
+ fi
51
+ echo "Pre-flight check: No hanging containers found"
63
52
53
+ # Initial workspace state: log the number of regular files under WORKSPACE and whether .gitlab/ and .gitlab/build.sh exist (logged only - nothing is enforced in this step)
54
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
55
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
56
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
57
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=preflight"
58
+
59
+ - name : Start File Monitoring
60
+ parallel : false
61
+ run : |
62
+ # Start background file monitoring for deletions: inotifywait logs delete/move-out events under WORKSPACE to a per-build log and its PID is saved so a later step can stop it
+ # -q suppresses the startup banners on stderr so that the log (which also captures stderr) stays empty until a real event or error occurs
63
+ if command -v inotifywait >/dev/null 2>&1; then
64
+ nohup inotifywait -q -m -r -e delete,delete_self,moved_from \
65
+ --format '%T DELETED: %w%f (event: %e)' --timefmt '%Y-%m-%d %H:%M:%S' \
66
+ "${WORKSPACE}/" > "/tmp/file-monitor-${BUILD_ID}-${axis_index}.log" 2>&1 &
67
+ echo $! > "/tmp/monitor-pid-${BUILD_ID}-${axis_index}"
68
+ echo "File monitoring started, PID: $(cat "/tmp/monitor-pid-${BUILD_ID}-${axis_index}")"
69
+ else
70
+ echo "inotifywait not available - skipping file monitoring"
71
+ touch "/tmp/file-monitor-${BUILD_ID}-${axis_index}.log"
72
+ fi
73
+
74
+ - name : File Count Debug
75
+ parallel : false
76
+ run : |
77
+ FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
78
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
79
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
80
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=initial"
81
+
82
+ # Fail if required files are missing: list the relevant directory for diagnosis, then abort the step (paths are quoted so a workspace path containing spaces or glob characters is not split)
83
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
84
+ echo "ERROR: .gitlab directory is missing!"
85
+ ls -la "${WORKSPACE}"
86
+ exit 1
87
+ fi
88
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
89
+ echo "ERROR: .gitlab/build.sh is missing!"
90
+ ls -la "${WORKSPACE}/.gitlab"
91
+ exit 1
92
+ fi
93
+
94
+ - name : Stash Marker Debug
95
+ parallel : false
96
+ run : |
+ # grep -c already prints 0 when nothing matches but exits 1; "|| true" only neutralises that exit status (an "|| echo 0" would append a second 0 and break the numeric test below)
97
+ STASH_MARKER=$(ls -1 "${WORKSPACE}" | grep -c '^scm-repo\.tar$' || true)
98
+ FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
99
+ echo "DEBUG_FILES: count=$FILE_COUNT stash_marker=$STASH_MARKER stage=after_initial"
100
+
101
+ # Stash marker status explanation (count of top-level workspace entries named exactly scm-repo.tar):
102
+ # stash_marker=0: Normal workspace (no tarball) - files extracted properly
103
+ # stash_marker=1: Tarball present - possible unstash failure, workspace not extracted
104
+ if [ "$STASH_MARKER" -eq 1 ]; then
105
+ echo "WARNING: scm-repo.tar found - possible Jenkins unstash failure"
106
+ echo "Workspace contents:"
107
+ ls -la "${WORKSPACE}"
108
+ else
109
+ echo "INFO: No stash tarball found - workspace appears normal"
110
+ fi
111
+
112
+ # Environment info stage disabled - suspected of causing file disappearance
113
+ # - name: Get Environment Info
114
+ # parallel: false
115
+ # run: |
116
+ # set +ex
117
+ # # print kernel version
118
+ # uname -r
119
+ # # print ofed info
120
+ # ofed_info -s
121
+ # # print nvidia drivers info
122
+ # lsmod | grep nvidia_peermem
123
+ # lsmod | grep gdrdrv
124
+ # lsmod | grep nvidia_fs
125
+ # # print nvidia-smi
126
+ # nvidia-smi
127
+ # nvidia-smi topo -m
128
+ # # print MPS info
129
+ # pgrep -a mps
130
+ # # print compute mode
131
+ # nvidia-smi -q | grep -i "compute mode"
132
+ # # check rdma status
133
+ # ibv_devinfo
134
+ # #ib_write_bw
135
+
136
+ # - name: Post Environment Info Debug
137
+ # parallel: false
138
+ # run: |
139
+ # FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
140
+ # GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
141
+ # BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
142
+ # echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_env"
143
+ #
144
+ # # Fail if required files are missing
145
+ # if [ "$GITLAB_EXISTS" = "NO" ]; then
146
+ # echo "ERROR: .gitlab directory disappeared after environment info!"
147
+ # exit 1
148
+ # fi
149
+ # if [ "$BUILD_SH_EXISTS" = "NO" ]; then
150
+ # echo "ERROR: .gitlab/build.sh disappeared after environment info!"
151
+ # exit 1
152
+ # fi
64
153
65
154
- name : Build GPU Test Environment
66
155
parallel : false
67
156
run : |
157
+ FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
158
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
159
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
160
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_build"
161
+
68
162
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} .
69
- onfail : docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
163
+
164
+ FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
165
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
166
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
167
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_build"
168
+
169
+ # Fail if required files are missing: the workspace state is re-sampled after the image build so a disappearance can be attributed to this step
170
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
171
+ echo "ERROR: .gitlab directory disappeared after Docker build!"
172
+ exit 1
173
+ fi
174
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
175
+ echo "ERROR: .gitlab/build.sh disappeared after Docker build!"
176
+ exit 1
177
+ fi
178
+ onfail : |
179
+ # Show what files were deleted
180
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
181
+ echo "=== FILES DELETED DURING BUILD ==="
182
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
183
+ echo "================================="
184
+ fi
+ # Stop monitoring here as well, mirroring the Build step's cleanup, so the background inotifywait is not left running when this step fails
+ # NOTE(review): presumably no later step's cleanup runs after a failure here - confirm pipeline semantics; this is harmless either way because it is guarded by the pid file
+ if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
+ kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
+ fi
185
+ docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
70
186
71
187
- name : Run GPU Test Environment
72
188
parallel : false
73
189
run : |
190
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
191
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
192
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
193
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_run"
194
+
74
195
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
75
196
--ulimit memlock=-1:-1 \
76
197
--network=host \
@@ -82,33 +203,103 @@ steps:
82
203
-v ${WORKSPACE}:${WORKSPACE} \
83
204
-w ${WORKSPACE} \
84
205
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
206
+
207
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
208
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
209
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
210
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_run"
211
+
212
+ # Fail if required files are missing
213
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
214
+ echo "ERROR: .gitlab directory disappeared after Docker run!"
215
+ exit 1
216
+ fi
217
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
218
+ echo "ERROR: .gitlab/build.sh disappeared after Docker run!"
219
+ exit 1
220
+ fi
85
221
onfail : |
222
+ # Show what files were deleted
223
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
224
+ echo "=== FILES DELETED DURING BUILD ==="
225
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
226
+ echo "================================="
227
+ fi
86
228
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
87
229
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
88
230
89
231
- name : Build
90
232
parallel : false
91
233
run : |
234
+ FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
235
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
236
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
237
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_build"
238
+
239
+ # Final check before attempting build: abort early with a clear message instead of letting docker exec fail on a missing script
240
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
241
+ echo "ERROR: .gitlab directory is missing before build!"
242
+ exit 1
243
+ fi
244
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
245
+ echo "ERROR: .gitlab/build.sh is missing before build!"
246
+ exit 1
247
+ fi
248
+
92
249
docker exec -w ${WORKSPACE} -e UCX_VERSION=${UCX_VERSION} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/build.sh ${INSTALL_DIR}"
93
- onfail : |
250
+
251
+ FILE_COUNT=$(find "${WORKSPACE}" -type f | wc -l)
+ # Re-sample the workspace after the build so the post_build line reports the observed state rather than assumed values
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
252
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_build"
253
+ always : |
254
+ # Show what files were deleted
255
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
256
+ echo "=== FILES DELETED DURING BUILD ==="
257
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
258
+ echo "================================="
259
+ fi
260
+ # Stop monitoring: kill the saved inotifywait PID (errors ignored if it has already exited) and remove the pid file
261
+ if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
262
+ kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
263
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
264
+ fi
94
265
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
95
266
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
96
267
97
- - name : Test CPP
268
+ - name : Show File Monitoring Results
98
269
parallel : false
99
270
run : |
100
- docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
101
- onfail : |
102
- docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
103
- docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
271
+ echo "=== COMPLETE FILE MONITORING LOG ==="
272
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
273
+ echo "Log file exists, size: $(wc -l < /tmp/file-monitor-${BUILD_ID}-${axis_index}.log) lines"
+ # The log also receives the monitor's stderr, so a non-empty file does not by itself mean deletions; look for the "DELETED:" tag that the monitor's event format puts on every event line
274
+ if grep -q 'DELETED:' /tmp/file-monitor-${BUILD_ID}-${axis_index}.log; then
275
+ echo "File deletions detected:"
276
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
277
+ else
278
+ echo "No file deletions detected during build"
279
+ fi
280
+ else
281
+ echo "Monitoring log file not found"
282
+ fi
283
+ echo "==================================="
104
284
105
- - name : Test Python
106
- parallel : false
107
- run : |
108
- docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
109
- always : |
110
- docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
111
- docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
285
+ # Final cleanup: remove the per-build monitoring log and pid file from /tmp
286
+ rm -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
287
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
288
+
289
+ # - name: Test CPP
290
+ # parallel: false
291
+ # run: |
292
+ # docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
293
+ # onfail: |
294
+ # docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
295
+ # docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
296
+ #
297
+ # - name: Test Python
298
+ # parallel: false
299
+ # run: |
300
+ # docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
301
+ # always: |
302
+ # docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
112
303
113
304
# once this fix is merged we can use the following to stop/kill/rm the container instead of the cleanup command in each step
114
305
# https://github.com/Mellanox/ci-demo/pull/111
0 commit comments