#!/usr/bin/env bash
# check-links.sh — Verify all external URLs in content/*.md are reachable.
#
# Extracts https?:// URLs from content/*.md, sends a HEAD request (fallback to GET),
# and reports 404s / connection errors. Known bot-blockers (403 responses from
# hhs.gov, jamanetwork.com etc.) are treated as OK.
#
# Exit codes:
#   0 — all links OK or only bot-blocked
#   1 — one or more broken links found
#   2 — usage error (unknown option or missing file)
#
# Usage:
#   scripts/check-links.sh              # check all content files
#   scripts/check-links.sh --fast       # stop at first failure
#   scripts/check-links.sh --verbose    # show every URL being checked
#   scripts/check-links.sh path/to.md   # check a specific file

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

FAST=0
VERBOSE=0
FILES=()

# Parse command-line arguments.
# Globals:   FAST, VERBOSE (set to 1 when the matching flag is given),
#            FILES (each non-option argument is appended)
# Arguments: the script's positional parameters
# Exits:     0 after printing help; 2 on an unknown option or a file
#            argument that does not exist
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --fast)
        FAST=1
        ;;
      --verbose | -v)
        VERBOSE=1
        ;;
      --help | -h)
        # The usage text is the header comment on lines 2-18 of this file;
        # strip the leading "# " so it prints as plain text.
        sed -n '2,18p' "$0" | sed 's/^# \{0,1\}//'
        exit 0
        ;;
      -*)
        echo "Unknown option: $arg" >&2
        exit 2
        ;;
      *)
        # A missing file would otherwise yield zero URLs and a false "OK".
        if [[ ! -f "$arg" ]]; then
          echo "No such file: $arg" >&2
          exit 2
        fi
        FILES+=("$arg")
        ;;
    esac
  done
}

parse_args "$@"

# Default: all .md files under content/ (sorted so runs are reproducible)
if [[ "${#FILES[@]}" -eq 0 ]]; then
  mapfile -t FILES < <(
    find "$PROJECT_ROOT/content" -type f -name '*.md' | LC_ALL=C sort
  )
fi

# Domains known to return 403 for automated requests but are valid sources.
# Keep this list short — anything that doesn't load in a browser should fail.
# The regex is matched against the bare host name (optionally prefixed "www.").
ALLOW_403='^(www\.)?(hhs\.gov|jamanetwork\.com|commission\.europa\.eu|digital-strategy\.ec\.europa\.eu|pubmed\.ncbi\.nlm\.nih\.gov|who\.int|oecd\.org|unesco\.org|unicef\.org|amnesty\.org|linkedin\.com|sciencedirect\.com|nature\.com|tandfonline\.com|springer\.com|wiley\.com|bmj\.com|nytimes\.com|ft\.com|wsj\.com|spiegel\.de|zeit\.de|lemonde\.fr|lefigaro\.fr|cnn\.com|worldhappiness\.report|danish-presidency\.consilium\.europa\.eu|consilium\.europa\.eu|europarl\.europa\.eu|ec\.europa\.eu)$'

# Extract URLs from markdown. Strip trailing punctuation/brackets.
extract_urls() {
  # Arguments: $1 - path of the file to scan
  # Outputs:   the unique http(s) URLs found, one per line, sorted
  # Returns:   0 when the file was scanned (even with no URLs); non-zero
  #            if grep itself failed (e.g. unreadable file)
  local file="$1"
  # grep exits 1 on "no match", which is not an error here. In the sed
  # bracket expression "]" must come first to be literal; a backslash does
  # not escape inside POSIX brackets.
  { grep -Eo 'https?://[A-Za-z0-9._~:/?#@!$&+,;=%-]+' -- "$file" \
    || [[ $? -eq 1 ]]; } \
    | sed -E 's/[],.;:!?)]+$//' \
    | sort -u
}

# Fetch a URL and print its final HTTP status code.
# Arguments: $1 - URL to probe
# Outputs:   three-digit status code, or "000" if the fetch failed
check_url() {
  local url="$1"
  local status
  local -a curl_opts=(
    -sS -o /dev/null -w '%{http_code}'
    --max-time 10
    --connect-timeout 5
    --retry 1
    --user-agent "Mozilla/5.0 (compatible; IFK-LinkChecker/1.0; +https://internetforkids.org)"
    -L
  )

  # curl already prints "000" via -w when it fails, so assign the fallback
  # instead of echoing a second "000" into the captured output.
  status=$(curl "${curl_opts[@]}" -I "$url" 2>/dev/null) || status="000"
  [[ "$status" =~ ^[0-9]{3}$ ]] || status="000"

  # Many servers reject HEAD → retry with GET
  if [[ "$status" == "000" || "$status" == "405" || "$status" == "403" ]]; then
    status=$(curl "${curl_opts[@]}" "$url" 2>/dev/null) || status="000"
    [[ "$status" =~ ^[0-9]{3}$ ]] || status="000"
  fi

  echo "$status"
}

# Classify status code as OK / bot-blocked / broken
# Globals:   ALLOW_403 (read) - regex of hosts whose 403 is tolerated
# Arguments: $1 - HTTP status code, $2 - URL that produced it
# Outputs:   "ok", "ok-blocked" or "broken"
classify() {
  local status="$1" url="$2"
  local host
  case "$status" in
    2[0-9][0-9] | 3[0-9][0-9])
      echo "ok"
      ;;
    403)
      # Reduce the URL to a bare lower-case host name: drop the scheme, then
      # path/query/fragment, then userinfo, then port.
      host="${url#*://}"
      host="${host%%[/?#]*}"
      host="${host##*@}"
      host="${host%%:*}"
      host="${host,,}"
      # An empty regex would match every host, so require a non-empty list.
      if [[ -n "${ALLOW_403:-}" && "$host" =~ $ALLOW_403 ]]; then
        echo "ok-blocked"
      else
        echo "broken"
      fi
      ;;
    *)
      # 404, 410, 5xx, 000 and anything unexpected
      echo "broken"
      ;;
  esac
}

# Check every unique URL found in FILES and print a report.
# Globals:   FILES, PROJECT_ROOT, FAST, VERBOSE (read)
# Outputs:   progress, failures and a summary on stdout
# Returns:   0 if no link is broken, 1 otherwise
main() {
  local -A url_files=()
  local -a urls=()
  local file url rel status result
  local total broken=0 ok=0 blocked=0

  # Collect URL → files mapping. The ${arr[@]+...} form keeps an empty
  # array from tripping `set -u` on older bash releases.
  for file in ${FILES[@]+"${FILES[@]}"}; do
    rel="$file"
    if [[ -n "${PROJECT_ROOT:-}" ]]; then
      # Quote the prefix so it is matched literally, not as a glob.
      rel="${file#"$PROJECT_ROOT"/}"
    fi
    while IFS= read -r url; do
      [[ -n "$url" ]] || continue
      if [[ -z "${url_files[$url]+x}" ]]; then
        url_files[$url]="$rel"
      else
        url_files[$url]+=", $rel"
      fi
    done < <(extract_urls "$file")
  done

  total=${#url_files[@]}
  echo "Checking $total unique external URLs across ${#FILES[@]} files..."

  if [[ "$total" -eq 0 ]]; then
    echo "No external URLs found."
    return 0
  fi

  # Associative-array key order is unspecified; sort for stable output.
  mapfile -t urls < <(printf '%s\n' "${!url_files[@]}" | LC_ALL=C sort)

  for url in "${urls[@]}"; do
    status=$(check_url "$url")
    result=$(classify "$status" "$url")
    case "$result" in
      ok)
        ok=$((ok + 1))
        if [[ "${VERBOSE:-0}" -eq 1 ]]; then
          printf " OK %s %s\n" "$status" "$url"
        fi
        ;;
      ok-blocked)
        blocked=$((blocked + 1))
        if [[ "${VERBOSE:-0}" -eq 1 ]]; then
          printf " 403* %s (bot-blocked, allowlisted)\n" "$url"
        fi
        ;;
      broken)
        broken=$((broken + 1))
        printf " FAIL %s %s\n in: %s\n" "$status" "$url" "${url_files[$url]}"
        if [[ "${FAST:-0}" -eq 1 ]]; then
          echo ""
          echo "Aborting early (--fast). $broken broken, $ok ok, $blocked allowlisted."
          return 1
        fi
        ;;
    esac
  done

  echo ""
  echo "Summary: $ok OK, $blocked allowlisted (403), $broken broken"

  if [[ "$broken" -gt 0 ]]; then
    return 1
  fi
  return 0
}

main "$@"