Skip to content

Commit fcf36d9

Browse files
ci: add diagnostic instrumentation for npm-ci hang
`npm ci` is hanging silently for ~8 minutes in protected-runner CI and dying with "Exit handler never called", which skips bin-linking and breaks downstream `prettier`/`nyc` calls. Previous fix attempts have not surfaced the root cause; this commit adds enough signal that the next run will reveal exactly what's going wrong. Three diagnostic blocks added to lint / unit-test / e2e-test jobs: 1. pre-npm-ci: captures effective npm config, ~/.npmrc contents, cache state, lockfile resolved-URL distribution, and reachability probes to both registries plus a sample fetch of a known new package (basic-ftp). Runs always; cheap. 2. npm ci flags: --loglevel=http --no-progress --foreground-scripts so the CI log itself shows every HTTP request npm makes and any lifecycle-script output that would otherwise be buffered. 3. post-npm-ci on failure: bundles ~/.npm/_logs (npm's own debug log — definitive proof of what hung), cacache state, node_modules bin state, process tree, dmesg tail, and the in-CI lockfile. Uploaded as artifact npm-diag-{job}{-node<v>}. To be reverted in one commit once the hang is fixed. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala <vikrant.puppala@databricks.com>
1 parent 439de47 commit fcf36d9

1 file changed

Lines changed: 163 additions & 3 deletions

File tree

.github/workflows/main.yml

Lines changed: 163 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,69 @@ jobs:
3434
${{ runner.os }}-build-${{ env.cache-name }}-
3535
${{ runner.os }}-build-
3636
${{ runner.os }}-
37+
# DIAGNOSTIC (temporary — remove once npm-hang root cause identified).
# Logs the effective npm config, the lockfile's per-registry URL counts, and
# registry reachability BEFORE npm ci runs. Every command is best-effort:
# `set +e` plus the trailing `true` keep this step from failing the job.
# NOTE(review): JFROG_ACCESS_TOKEN is presumably exported by job/workflow
# `env:` — not visible in this hunk; confirm.
- name: Diag — pre-npm-ci
  run: |
    set +e
    echo "=== effective npm config ==="
    npm config list -l 2>&1 | grep -E '^(registry|fetch-|cache|loglevel|prefer-|@databricks)' || true
    echo "=== ~/.npmrc (auth masked) ==="
    # Mask every credential-bearing key (_authToken, _auth, _password), not
    # only _authToken, so legacy basic-auth entries never reach the log.
    sed -E 's/(_authToken|_auth|_password)[[:space:]]*=.*/\1=***/' ~/.npmrc 2>/dev/null || echo "(no ~/.npmrc)"
    echo "=== npm cache dir ==="
    cache_dir="$(npm config get cache)"
    echo "$cache_dir"
    # A `ls | head` pipeline reports head's exit status, so a fallback hung
    # off it never fires; test for a populated directory explicitly.
    if [ -d "$cache_dir" ] && [ -n "$(ls -A "$cache_dir" 2>/dev/null)" ]; then
      ls -la "$cache_dir" | head -10
    else
      echo "(empty)"
    fi
    echo "=== package-lock.json resolved URLs by registry ==="
    grep -oE '"resolved": "https://[^/]+' package-lock.json | sort | uniq -c
    echo "=== reachability probes ==="
    for url in https://registry.npmjs.org/ https://databricks.jfrog.io/artifactory/api/npm/db-npm/; do
      echo "--- $url ---"
      # Attach the JFrog bearer token ONLY to JFrog requests; it must never
      # be sent to the public npm registry. Positional parameters are
      # otherwise unused in this script, so they carry the optional header.
      set --
      case "$url" in
        https://databricks.jfrog.io/*)
          set -- -H "Authorization: Bearer $JFROG_ACCESS_TOKEN"
          ;;
      esac
      curl -sS -o /dev/null -w "HTTP=%{http_code} connect=%{time_connect}s total=%{time_total}s\n" \
        --max-time 10 --connect-timeout 5 \
        "$@" \
        "$url" || echo "FAIL (curl exit $?)"
    done
    echo "=== sample package metadata fetch (basic-ftp, new in this PR) ==="
    # Drop any probe file left by an earlier run so a failed fetch cannot
    # print stale content below.
    rm -f /tmp/probe.json
    curl -sS -o /tmp/probe.json -w "basic-ftp: HTTP=%{http_code} size=%{size_download}b time=%{time_total}s\n" \
      --max-time 10 -H "Authorization: Bearer $JFROG_ACCESS_TOKEN" \
      "https://databricks.jfrog.io/artifactory/api/npm/db-npm/basic-ftp" || echo "FAIL"
    head -c 200 /tmp/probe.json 2>/dev/null; echo
    echo "=== /probe ==="
    true
3767
- name: Check code style
3868
run: |
39-
npm ci
69+
npm ci --loglevel=http --no-progress --foreground-scripts
4070
npm run prettier
4171
npm run lint
72+
# DIAGNOSTIC (temporary — remove once npm-hang root cause identified).
# Runs only when an earlier step failed. Collects npm's debug logs, the
# cacache size, node_modules state, a process snapshot, the dmesg tail and
# the lockfile into /tmp/npm-diag so the silent 8-minute hang can be
# inspected afterwards.
- name: Diag — post-npm-ci on failure
  if: failure()
  run: |
    # Best-effort collection: no single failing command may abort the step.
    set +e
    bundle=/tmp/npm-diag
    mkdir -p "$bundle"
    if ! cp -r ~/.npm/_logs "$bundle/npm_logs" 2>/dev/null; then
      echo "no _logs dir"
    fi
    du -sh ~/.npm/_cacache > "$bundle/cacache_size.txt" 2>/dev/null
    if ! ls -la node_modules/.bin/ > "$bundle/node_modules_bin.txt" 2>/dev/null; then
      echo "(no .bin)" > "$bundle/node_modules_bin.txt"
    fi
    ls node_modules/ 2>/dev/null | wc -l > "$bundle/node_modules_pkg_count.txt"
    ps auxf > "$bundle/ps_snapshot.txt" 2>&1
    dmesg 2>&1 | tail -50 > "$bundle/dmesg_tail.txt"
    cp package-lock.json "$bundle/package-lock.json.in-ci"
    echo "=== diag bundle contents ==="
    ls -la "$bundle"
    # Always report success, whatever the commands above returned.
    exit 0
92+
# Publishes /tmp/npm-diag as the `npm-diag-lint` artifact, only when an
# earlier step failed. A missing bundle produces a warning, not an error.
- name: Diag — upload bundle
  if: failure()
  # Action pinned by commit SHA; the trailing comment records the release line.
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
  with:
    path: /tmp/npm-diag
    name: npm-diag-lint
    retention-days: 7
    if-no-files-found: warn
42100

43101
unit-test:
44102
runs-on:
@@ -72,10 +130,61 @@ jobs:
72130
${{ runner.os }}-${{ matrix.node-version }}-build-${{ env.cache-name }}-
73131
${{ runner.os }}-${{ matrix.node-version }}-build-
74132
${{ runner.os }}-${{ matrix.node-version }}-
133+
# DIAGNOSTIC (temporary): logs the effective npm config, the lockfile's
# per-registry URL counts, and registry reachability before npm ci runs.
# Every command is best-effort: `set +e` plus the trailing `true` keep this
# step from failing the job.
# NOTE(review): JFROG_ACCESS_TOKEN is presumably exported by job/workflow
# `env:` — not visible in this hunk; confirm.
- name: Diag — pre-npm-ci
  run: |
    set +e
    echo "=== effective npm config ==="
    npm config list -l 2>&1 | grep -E '^(registry|fetch-|cache|loglevel|prefer-|@databricks)' || true
    echo "=== ~/.npmrc (auth masked) ==="
    # Mask every credential-bearing key (_authToken, _auth, _password), not
    # only _authToken, so legacy basic-auth entries never reach the log.
    sed -E 's/(_authToken|_auth|_password)[[:space:]]*=.*/\1=***/' ~/.npmrc 2>/dev/null || echo "(no ~/.npmrc)"
    echo "=== npm cache dir ==="
    cache_dir="$(npm config get cache)"
    echo "$cache_dir"
    # A `ls | head` pipeline reports head's exit status, so a fallback hung
    # off it never fires; test for a populated directory explicitly.
    if [ -d "$cache_dir" ] && [ -n "$(ls -A "$cache_dir" 2>/dev/null)" ]; then
      ls -la "$cache_dir" | head -10
    else
      echo "(empty)"
    fi
    echo "=== package-lock.json resolved URLs by registry ==="
    grep -oE '"resolved": "https://[^/]+' package-lock.json | sort | uniq -c
    echo "=== reachability probes ==="
    for url in https://registry.npmjs.org/ https://databricks.jfrog.io/artifactory/api/npm/db-npm/; do
      echo "--- $url ---"
      # Attach the JFrog bearer token ONLY to JFrog requests; it must never
      # be sent to the public npm registry. Positional parameters are
      # otherwise unused in this script, so they carry the optional header.
      set --
      case "$url" in
        https://databricks.jfrog.io/*)
          set -- -H "Authorization: Bearer $JFROG_ACCESS_TOKEN"
          ;;
      esac
      curl -sS -o /dev/null -w "HTTP=%{http_code} connect=%{time_connect}s total=%{time_total}s\n" \
        --max-time 10 --connect-timeout 5 \
        "$@" \
        "$url" || echo "FAIL (curl exit $?)"
    done
    echo "=== sample package metadata fetch (basic-ftp, new in this PR) ==="
    # Drop any probe file left by an earlier run so a failed fetch cannot
    # print stale content below.
    rm -f /tmp/probe.json
    curl -sS -o /tmp/probe.json -w "basic-ftp: HTTP=%{http_code} size=%{size_download}b time=%{time_total}s\n" \
      --max-time 10 -H "Authorization: Bearer $JFROG_ACCESS_TOKEN" \
      "https://databricks.jfrog.io/artifactory/api/npm/db-npm/basic-ftp" || echo "FAIL"
    head -c 200 /tmp/probe.json 2>/dev/null; echo
    echo "=== /probe ==="
    true
75160
- name: Run unit tests
76161
run: |
77-
npm ci
162+
npm ci --loglevel=http --no-progress --foreground-scripts
78163
npm run test
164+
# DIAGNOSTIC (temporary): runs only when an earlier step failed. Collects
# npm's debug logs, the cacache size, node_modules state, a process
# snapshot, the dmesg tail and the lockfile into /tmp/npm-diag.
- name: Diag — post-npm-ci on failure
  if: failure()
  run: |
    # Best-effort collection: no single failing command may abort the step.
    set +e
    bundle=/tmp/npm-diag
    mkdir -p "$bundle"
    if ! cp -r ~/.npm/_logs "$bundle/npm_logs" 2>/dev/null; then
      echo "no _logs dir"
    fi
    du -sh ~/.npm/_cacache > "$bundle/cacache_size.txt" 2>/dev/null
    if ! ls -la node_modules/.bin/ > "$bundle/node_modules_bin.txt" 2>/dev/null; then
      echo "(no .bin)" > "$bundle/node_modules_bin.txt"
    fi
    ls node_modules/ 2>/dev/null | wc -l > "$bundle/node_modules_pkg_count.txt"
    ps auxf > "$bundle/ps_snapshot.txt" 2>&1
    dmesg 2>&1 | tail -50 > "$bundle/dmesg_tail.txt"
    cp package-lock.json "$bundle/package-lock.json.in-ci"
    echo "=== diag bundle contents ==="
    ls -la "$bundle"
    # Always report success, whatever the commands above returned.
    exit 0
180+
# Publishes /tmp/npm-diag as a per-Node-version artifact, only when an
# earlier step failed. A missing bundle produces a warning, not an error.
- name: Diag — upload bundle
  if: failure()
  # Action pinned by commit SHA; the trailing comment records the release line.
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
  with:
    path: /tmp/npm-diag
    # The matrix value keeps artifact names distinct across matrix legs.
    name: npm-diag-unit-test-node${{ matrix.node-version }}
    retention-days: 7
    if-no-files-found: warn
79188
- run: tar -cvf ${{ env.NYC_REPORT_DIR }}.tar ${{ env.NYC_REPORT_DIR }}
80189
- name: Store coverage report
81190
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
@@ -115,10 +224,61 @@ jobs:
115224
${{ runner.os }}-build-${{ env.cache-name }}-
116225
${{ runner.os }}-build-
117226
${{ runner.os }}-
227+
# DIAGNOSTIC (temporary): logs the effective npm config, the lockfile's
# per-registry URL counts, and registry reachability before npm ci runs.
# Every command is best-effort: `set +e` plus the trailing `true` keep this
# step from failing the job.
# NOTE(review): JFROG_ACCESS_TOKEN is presumably exported by job/workflow
# `env:` — not visible in this hunk; confirm.
- name: Diag — pre-npm-ci
  run: |
    set +e
    echo "=== effective npm config ==="
    npm config list -l 2>&1 | grep -E '^(registry|fetch-|cache|loglevel|prefer-|@databricks)' || true
    echo "=== ~/.npmrc (auth masked) ==="
    # Mask every credential-bearing key (_authToken, _auth, _password), not
    # only _authToken, so legacy basic-auth entries never reach the log.
    sed -E 's/(_authToken|_auth|_password)[[:space:]]*=.*/\1=***/' ~/.npmrc 2>/dev/null || echo "(no ~/.npmrc)"
    echo "=== npm cache dir ==="
    cache_dir="$(npm config get cache)"
    echo "$cache_dir"
    # A `ls | head` pipeline reports head's exit status, so a fallback hung
    # off it never fires; test for a populated directory explicitly.
    if [ -d "$cache_dir" ] && [ -n "$(ls -A "$cache_dir" 2>/dev/null)" ]; then
      ls -la "$cache_dir" | head -10
    else
      echo "(empty)"
    fi
    echo "=== package-lock.json resolved URLs by registry ==="
    grep -oE '"resolved": "https://[^/]+' package-lock.json | sort | uniq -c
    echo "=== reachability probes ==="
    for url in https://registry.npmjs.org/ https://databricks.jfrog.io/artifactory/api/npm/db-npm/; do
      echo "--- $url ---"
      # Attach the JFrog bearer token ONLY to JFrog requests; it must never
      # be sent to the public npm registry. Positional parameters are
      # otherwise unused in this script, so they carry the optional header.
      set --
      case "$url" in
        https://databricks.jfrog.io/*)
          set -- -H "Authorization: Bearer $JFROG_ACCESS_TOKEN"
          ;;
      esac
      curl -sS -o /dev/null -w "HTTP=%{http_code} connect=%{time_connect}s total=%{time_total}s\n" \
        --max-time 10 --connect-timeout 5 \
        "$@" \
        "$url" || echo "FAIL (curl exit $?)"
    done
    echo "=== sample package metadata fetch (basic-ftp, new in this PR) ==="
    # Drop any probe file left by an earlier run so a failed fetch cannot
    # print stale content below.
    rm -f /tmp/probe.json
    curl -sS -o /tmp/probe.json -w "basic-ftp: HTTP=%{http_code} size=%{size_download}b time=%{time_total}s\n" \
      --max-time 10 -H "Authorization: Bearer $JFROG_ACCESS_TOKEN" \
      "https://databricks.jfrog.io/artifactory/api/npm/db-npm/basic-ftp" || echo "FAIL"
    head -c 200 /tmp/probe.json 2>/dev/null; echo
    echo "=== /probe ==="
    true
118254
- name: Run e2e tests
119255
run: |
120-
npm ci
256+
npm ci --loglevel=http --no-progress --foreground-scripts
121257
NODE_OPTIONS="--max-old-space-size=4096" npm run e2e
258+
# DIAGNOSTIC (temporary): runs only when an earlier step failed. Collects
# npm's debug logs, the cacache size, node_modules state, a process
# snapshot, the dmesg tail and the lockfile into /tmp/npm-diag.
- name: Diag — post-npm-ci on failure
  if: failure()
  run: |
    # Best-effort collection: no single failing command may abort the step.
    set +e
    bundle=/tmp/npm-diag
    mkdir -p "$bundle"
    if ! cp -r ~/.npm/_logs "$bundle/npm_logs" 2>/dev/null; then
      echo "no _logs dir"
    fi
    du -sh ~/.npm/_cacache > "$bundle/cacache_size.txt" 2>/dev/null
    if ! ls -la node_modules/.bin/ > "$bundle/node_modules_bin.txt" 2>/dev/null; then
      echo "(no .bin)" > "$bundle/node_modules_bin.txt"
    fi
    ls node_modules/ 2>/dev/null | wc -l > "$bundle/node_modules_pkg_count.txt"
    ps auxf > "$bundle/ps_snapshot.txt" 2>&1
    dmesg 2>&1 | tail -50 > "$bundle/dmesg_tail.txt"
    cp package-lock.json "$bundle/package-lock.json.in-ci"
    echo "=== diag bundle contents ==="
    ls -la "$bundle"
    # Always report success, whatever the commands above returned.
    exit 0
274+
# Publishes /tmp/npm-diag as the `npm-diag-e2e-test` artifact, only when an
# earlier step failed. A missing bundle produces a warning, not an error.
- name: Diag — upload bundle
  if: failure()
  # Action pinned by commit SHA; the trailing comment records the release line.
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
  with:
    path: /tmp/npm-diag
    name: npm-diag-e2e-test
    retention-days: 7
    if-no-files-found: warn
122282
- run: tar -cvf ${{ env.NYC_REPORT_DIR }}.tar ${{ env.NYC_REPORT_DIR }}
123283
- name: Store coverage report
124284
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4

0 commit comments

Comments
 (0)