All checks were successful
Deploy Internet for Kids / Build & Push (push) Successful in 12s
Deploy Internet for Kids / Deploy (push) Successful in 5s
Deploy Internet for Kids / Health Check (push) Successful in 1s
Deploy Internet for Kids / Smoke Tests (push) Successful in 2s
Deploy Internet for Kids / IndexNow Ping (push) Successful in 8s
Deploy Internet for Kids / Promote to Latest (push) Successful in 1s
Deploy Internet for Kids / Rollback (push) Has been skipped
Deploy Internet for Kids / Audit (push) Successful in 2s
Verify external source URLs in content/*.md are reachable. HEAD request with GET fallback, treats known bot-blocker 403s (hhs.gov, jamanetwork, consilium.europa.eu etc.) as OK. Usage: scripts/check-links.sh # all content files scripts/check-links.sh --fast # stop at first failure scripts/check-links.sh --verbose Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
154 lines
4.9 KiB
Bash
Executable File
#!/usr/bin/env bash
# check-links.sh — Verify all external URLs in content/*.md are reachable.
#
# Extracts https?:// URLs from content/*.md, sends a HEAD request (fallback to GET),
# and reports 404s / connection errors. Known bot-blockers (403 responses from
# hhs.gov, jamanetwork.com etc.) are treated as OK.
#
# Exit codes:
#   0 — all links OK or only bot-blocked
#   1 — one or more broken links found
#   2 — usage error
#
# Usage:
#   scripts/check-links.sh               # check all content files
#   scripts/check-links.sh --fast        # stop at first failure
#   scripts/check-links.sh --verbose     # show every URL being checked
#   scripts/check-links.sh path/to.md    # check a specific file

set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
|
|
FAST=0
|
|
VERBOSE=0
|
|
FILES=()
|
|
|
|
for arg in "$@"; do
|
|
case "$arg" in
|
|
--fast) FAST=1 ;;
|
|
--verbose|-v) VERBOSE=1 ;;
|
|
--help|-h)
|
|
sed -n '2,18p' "$0" | sed 's/^# \{0,1\}//'
|
|
exit 0 ;;
|
|
-*) echo "Unknown option: $arg" >&2; exit 2 ;;
|
|
*) FILES+=("$arg") ;;
|
|
esac
|
|
done
|
|
|
|
# Default: all .md files under content/
|
|
if [ "${#FILES[@]}" -eq 0 ]; then
|
|
mapfile -t FILES < <(find "$PROJECT_ROOT/content" -type f -name '*.md')
|
|
fi
|
|
|
|
# Domains known to return 403 for automated requests but are valid sources.
|
|
# Keep this list short — anything that doesn't load in a browser should fail.
|
|
ALLOW_403='^(www\.)?(hhs\.gov|jamanetwork\.com|commission\.europa\.eu|digital-strategy\.ec\.europa\.eu|pubmed\.ncbi\.nlm\.nih\.gov|who\.int|oecd\.org|unesco\.org|unicef\.org|amnesty\.org|linkedin\.com|sciencedirect\.com|nature\.com|tandfonline\.com|springer\.com|wiley\.com|bmj\.com|nytimes\.com|ft\.com|wsj\.com|spiegel\.de|zeit\.de|lemonde\.fr|lefigaro\.fr|cnn\.com|worldhappiness\.report|danish-presidency\.consilium\.europa\.eu|consilium\.europa\.eu|europarl\.europa\.eu|ec\.europa\.eu)$'
|
|
|
|
# Extract URLs from markdown. Strip trailing punctuation/brackets.
|
|
extract_urls() {
|
|
local file="$1"
|
|
grep -Eo 'https?://[A-Za-z0-9._~:/?#@!$&+,;=%-]+' "$file" \
|
|
| sed -E 's/[,.;:!?)\]]+$//' \
|
|
| sort -u
|
|
}
|
|
|
|
# Returns status code (HTTP number or "000" if fetch failed)
|
|
check_url() {
|
|
local url="$1"
|
|
local status
|
|
status=$(curl -sS -o /dev/null -w "%{http_code}" \
|
|
--max-time 10 \
|
|
--connect-timeout 5 \
|
|
--retry 1 \
|
|
--user-agent "Mozilla/5.0 (compatible; IFK-LinkChecker/1.0; +https://internetforkids.org)" \
|
|
-I -L "$url" 2>/dev/null || echo "000")
|
|
# Many servers reject HEAD → retry with GET
|
|
if [ "$status" = "000" ] || [ "$status" = "405" ] || [ "$status" = "403" ]; then
|
|
status=$(curl -sS -o /dev/null -w "%{http_code}" \
|
|
--max-time 10 \
|
|
--connect-timeout 5 \
|
|
--retry 1 \
|
|
--user-agent "Mozilla/5.0 (compatible; IFK-LinkChecker/1.0; +https://internetforkids.org)" \
|
|
-L "$url" 2>/dev/null || echo "000")
|
|
fi
|
|
echo "$status"
|
|
}
|
|
|
|
# Classify status code as OK / bot-blocked / broken
|
|
classify() {
|
|
local status="$1" url="$2"
|
|
case "$status" in
|
|
2[0-9][0-9]|3[0-9][0-9]) echo "ok" ;;
|
|
403)
|
|
local host
|
|
host=$(echo "$url" | awk -F/ '{print $3}')
|
|
if [[ "$host" =~ $ALLOW_403 ]]; then
|
|
echo "ok-blocked"
|
|
else
|
|
echo "broken"
|
|
fi ;;
|
|
404|410|5[0-9][0-9]|000) echo "broken" ;;
|
|
*) echo "broken" ;;
|
|
esac
|
|
}
|
|
|
|
# Collect URL → files mapping
|
|
declare -A URL_FILES=()
|
|
for file in "${FILES[@]}"; do
|
|
while IFS= read -r url; do
|
|
[ -z "$url" ] && continue
|
|
rel="${file#$PROJECT_ROOT/}"
|
|
if [ -z "${URL_FILES[$url]+x}" ]; then
|
|
URL_FILES[$url]="$rel"
|
|
else
|
|
URL_FILES[$url]="${URL_FILES[$url]}, $rel"
|
|
fi
|
|
done < <(extract_urls "$file")
|
|
done
|
|
|
|
TOTAL=${#URL_FILES[@]}
|
|
echo "Checking $TOTAL unique external URLs across ${#FILES[@]} files..."
|
|
|
|
if [ "$TOTAL" -eq 0 ]; then
|
|
echo "No external URLs found."
|
|
exit 0
|
|
fi
|
|
|
|
BROKEN=0
|
|
OK=0
|
|
BLOCKED=0
|
|
|
|
for url in "${!URL_FILES[@]}"; do
|
|
status=$(check_url "$url")
|
|
result=$(classify "$status" "$url")
|
|
|
|
case "$result" in
|
|
ok)
|
|
OK=$((OK + 1))
|
|
[ "$VERBOSE" -eq 1 ] && printf " OK %s %s\n" "$status" "$url"
|
|
;;
|
|
ok-blocked)
|
|
BLOCKED=$((BLOCKED + 1))
|
|
[ "$VERBOSE" -eq 1 ] && printf " 403* %s (bot-blocked, allowlisted)\n" "$url"
|
|
;;
|
|
broken)
|
|
BROKEN=$((BROKEN + 1))
|
|
printf " FAIL %s %s\n in: %s\n" "$status" "$url" "${URL_FILES[$url]}"
|
|
if [ "$FAST" -eq 1 ]; then
|
|
echo ""
|
|
echo "Aborting early (--fast). $BROKEN broken, $OK ok, $BLOCKED allowlisted."
|
|
exit 1
|
|
fi
|
|
;;
|
|
esac
|
|
done
|
|
|
|
echo ""
|
|
echo "Summary: $OK OK, $BLOCKED allowlisted (403), $BROKEN broken"
|
|
|
|
if [ "$BROKEN" -gt 0 ]; then
|
|
exit 1
|
|
fi
|