38
38
UCX_VERSION : v1.19.x
39
39
40
40
steps :
41
+ - name : Pre-flight Check
42
+ parallel : false
43
+ run : |
44
+ # Check for hanging Docker containers
45
+ HANGING_CONTAINERS=$(docker ps -q --filter "name=nixl-ci-test")
46
+ if [ -n "$HANGING_CONTAINERS" ]; then
47
+ echo "ERROR: Found hanging Docker containers:"
48
+ docker ps --filter "name=nixl-ci-test"
49
+ exit 1
50
+ fi
51
+ echo "Pre-flight check: No hanging containers found"
52
+
53
+ # Initial workspace state
54
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
55
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
56
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
57
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=preflight"
58
+
59
+ - name : File Count Debug
60
+ parallel : false
61
+ run : |
62
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
63
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
64
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
65
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=initial"
66
+
67
+ # Fail if required files are missing
68
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
69
+ echo "ERROR: .gitlab directory is missing!"
70
+ ls -la ${WORKSPACE}
71
+ exit 1
72
+ fi
73
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
74
+ echo "ERROR: .gitlab/build.sh is missing!"
75
+ ls -la ${WORKSPACE}/.gitlab
76
+ exit 1
77
+ fi
78
+
79
+ - name : Stash Marker Debug
80
+ parallel : false
81
+ run : |
82
+ # Emit exactly one integer. `grep -c` already prints 0 (and exits 1) when
+ # nothing matches, so the previous `|| echo 0` fallback produced "0<newline>0",
+ # which corrupted the debug line and broke the numeric `-eq` test below.
+ STASH_MARKER=$(test -f "${WORKSPACE}/scm-repo.tar" && echo 1 || echo 0)
83
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
84
+ echo "DEBUG_FILES: count=$FILE_COUNT stash_marker=$STASH_MARKER stage=after_initial"
85
+
86
+ # Stash marker status explanation:
87
+ # stash_marker=0: Normal workspace (no tarball) - files extracted properly
88
+ # stash_marker=1: Tarball present - possible unstash failure, workspace not extracted
89
+ if [ "$STASH_MARKER" -eq 1 ]; then
90
+ echo "WARNING: scm-repo.tar found - possible Jenkins unstash failure"
91
+ echo "Workspace contents:"
92
+ ls -la ${WORKSPACE}
93
+ else
94
+ echo "INFO: No stash tarball found - workspace appears normal"
95
+ fi
96
+
41
97
- name : Get Environment Info
42
98
parallel : false
43
99
run : |
@@ -61,16 +117,83 @@ steps:
61
117
ibv_devinfo
62
118
#ib_write_bw
63
119
120
+ - name : Start File Monitoring
121
+ parallel : false
122
+ run : |
123
+ # Start background file monitoring for deletions (after workspace is stable)
124
+ if command -v inotifywait >/dev/null 2>&1; then
125
+ nohup inotifywait -m -r -e delete,delete_self,moved_from \
126
+ --format '%T DELETED: %w%f (event: %e)' --timefmt '%Y-%m-%d %H:%M:%S' \
127
+ ${WORKSPACE}/ > /tmp/file-monitor-${BUILD_ID}-${axis_index}.log 2>&1 &
128
+ echo $! > /tmp/monitor-pid-${BUILD_ID}-${axis_index}
129
+ echo "File monitoring started, PID: $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index})"
130
+ else
131
+ echo "inotifywait not available - skipping file monitoring"
132
+ touch /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
133
+ fi
134
+
135
+ - name : Post Environment Info Debug
136
+ parallel : false
137
+ run : |
138
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
139
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
140
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
141
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_env"
142
+
143
+ # Fail if required files are missing
144
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
145
+ echo "ERROR: .gitlab directory disappeared after environment info!"
146
+ exit 1
147
+ fi
148
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
149
+ echo "ERROR: .gitlab/build.sh disappeared after environment info!"
150
+ exit 1
151
+ fi
64
152
65
153
- name : Build GPU Test Environment
66
154
parallel : false
67
155
run : |
156
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
157
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
158
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
159
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_build"
160
+
68
161
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} .
69
- onfail : docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
162
+
163
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
164
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
165
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
166
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_build"
167
+
168
+ # Fail if required files are missing
169
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
170
+ echo "ERROR: .gitlab directory disappeared after Docker build!"
171
+ exit 1
172
+ fi
173
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
174
+ echo "ERROR: .gitlab/build.sh disappeared after Docker build!"
175
+ exit 1
176
+ fi
177
+ onfail : |
178
+ # Show what files were deleted
179
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
180
+ echo "=== FILES DELETED DURING BUILD ==="
181
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
182
+ echo "================================="
183
+ fi
+ # Stop the background inotifywait monitor here too; otherwise it is only
+ # stopped by the Build step's `always` handler, which a failure in this
+ # step may never reach. Done before the Docker cleanup so it still happens
+ # if the cleanup script is missing or fails.
+ if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
+ kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
+ fi
+ # The log was printed above; remove both temp files so they do not pile up in /tmp
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
184
+ docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
185
+ # Comprehensive Docker cleanup
186
+ chmod +x .ci/scripts/docker-cleanup.sh
187
+ .ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
70
188
71
189
- name : Run GPU Test Environment
72
190
parallel : false
73
191
run : |
192
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
193
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
194
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
195
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_docker_run"
196
+
74
197
docker run -dt --name "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" \
75
198
--ulimit memlock=-1:-1 \
76
199
--network=host \
@@ -82,33 +205,110 @@ steps:
82
205
-v ${WORKSPACE}:${WORKSPACE} \
83
206
-w ${WORKSPACE} \
84
207
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
208
+
209
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
210
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
211
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
212
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_docker_run"
213
+
214
+ # Fail if required files are missing
215
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
216
+ echo "ERROR: .gitlab directory disappeared after Docker run!"
217
+ exit 1
218
+ fi
219
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
220
+ echo "ERROR: .gitlab/build.sh disappeared after Docker run!"
221
+ exit 1
222
+ fi
85
223
onfail : |
224
+ # Show what files were deleted
225
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
226
+ echo "=== FILES DELETED DURING BUILD ==="
227
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
228
+ echo "================================="
229
+ fi
+ # Stop the background inotifywait monitor here too; otherwise it is only
+ # stopped by the Build step's `always` handler, which a failure in this
+ # step may never reach. Done before the Docker cleanup so it still happens
+ # if the cleanup script is missing or fails.
+ if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
+ kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
+ fi
+ # The log was printed above; remove both temp files so they do not pile up in /tmp
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
86
230
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
87
231
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
232
+ # Comprehensive Docker cleanup
233
+ chmod +x .ci/scripts/docker-cleanup.sh
234
+ .ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
88
235
89
236
- name : Build
90
237
parallel : false
91
238
run : |
239
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
240
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
241
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
242
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=pre_build"
243
+
244
+ # Final check before attempting build
245
+ if [ "$GITLAB_EXISTS" = "NO" ]; then
246
+ echo "ERROR: .gitlab directory is missing before build!"
247
+ exit 1
248
+ fi
249
+ if [ "$BUILD_SH_EXISTS" = "NO" ]; then
250
+ echo "ERROR: .gitlab/build.sh is missing before build!"
251
+ exit 1
252
+ fi
253
+
92
254
docker exec -w ${WORKSPACE} -e UCX_VERSION=${UCX_VERSION} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/build.sh ${INSTALL_DIR}"
93
- onfail : |
94
- docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
95
- docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
96
255
97
- - name : Test CPP
98
- parallel : false
99
- run : |
100
- docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
101
- onfail : |
256
+ FILE_COUNT=$(find ${WORKSPACE} -type f | wc -l)
257
+ # Re-probe the workspace instead of printing a hard-coded YES: the build itself
+ # may remove files, and this line exists to report the real post-build state.
+ GITLAB_EXISTS=$(test -d "${WORKSPACE}/.gitlab" && echo "YES" || echo "NO")
+ BUILD_SH_EXISTS=$(test -f "${WORKSPACE}/.gitlab/build.sh" && echo "YES" || echo "NO")
+ echo "DEBUG_FILES: count=$FILE_COUNT gitlab_dir=$GITLAB_EXISTS build_sh=$BUILD_SH_EXISTS stage=post_build"
258
+ always : |
259
+ # Show what files were deleted
260
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
261
+ echo "=== FILES DELETED DURING BUILD ==="
262
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
263
+ echo "================================="
264
+ fi
265
+ # Stop monitoring
266
+ if [ -f /tmp/monitor-pid-${BUILD_ID}-${axis_index} ]; then
267
+ kill $(cat /tmp/monitor-pid-${BUILD_ID}-${axis_index} 2>/dev/null) 2>/dev/null || true
268
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
269
+ fi
102
270
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
103
271
docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
272
+ # Comprehensive Docker cleanup (on success or failure)
273
+ chmod +x .ci/scripts/docker-cleanup.sh
274
+ .ci/scripts/docker-cleanup.sh "${BUILD_ID}" "${axis_index}"
104
275
105
- - name : Test Python
276
+ - name : Show File Monitoring Results
106
277
parallel : false
107
278
run : |
108
- docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
109
- always : |
110
- docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
111
- docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
279
+ echo "=== COMPLETE FILE MONITORING LOG ==="
280
+ if [ -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
281
+ echo "Log file exists, size: $(wc -l < /tmp/file-monitor-${BUILD_ID}-${axis_index}.log) lines"
282
+ if [ -s /tmp/file-monitor-${BUILD_ID}-${axis_index}.log ]; then
283
+ echo "File deletions detected:"
284
+ cat /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
285
+ else
286
+ echo "No file deletions detected during build"
287
+ fi
288
+ else
289
+ echo "Monitoring log file not found"
290
+ fi
291
+ echo "==================================="
292
+
293
+ # Final cleanup
294
+ rm -f /tmp/file-monitor-${BUILD_ID}-${axis_index}.log
295
+ rm -f /tmp/monitor-pid-${BUILD_ID}-${axis_index}
296
+
297
+ # Tests commented out - problem occurs before tests
298
+ # - name: Test CPP
299
+ # parallel: false
300
+ # run: |
301
+ # docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_cpp.sh ${INSTALL_DIR}"
302
+ # onfail: |
303
+ # docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
304
+ # docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
305
+
306
+ # - name: Test Python
307
+ # parallel: false
308
+ # run: |
309
+ # docker exec -w ${WORKSPACE} "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" /bin/bash -c ".gitlab/test_python.sh ${INSTALL_DIR}"
310
+ # always: |
311
+ # docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
112
312
113
313
# Once the fix linked below is merged, we can use it to stop/kill/rm the container instead of repeating the cleanup command in each step:
114
314
# https://github.com/Mellanox/ci-demo/pull/111
0 commit comments