From 03fcececfc61d454ce4ecf4173734c3f20f24f91 Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Thu, 16 Apr 2026 20:45:14 +0300
Subject: [PATCH] feat: add outgoing link checker script (IFK-14)

Verify external source URLs in content/*.md are reachable. HEAD request
with GET fallback, treats known bot-blocker 403s (hhs.gov, jamanetwork,
consilium.europa.eu etc.) as OK.

Usage:
  scripts/check-links.sh            # all content files
  scripts/check-links.sh --fast     # stop at first failure
  scripts/check-links.sh --verbose

Co-Authored-By: Claude Opus 4.6
---
 scripts/check-links.sh | 166 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100755 scripts/check-links.sh

diff --git a/scripts/check-links.sh b/scripts/check-links.sh
new file mode 100755
index 0000000..c8fe158
--- /dev/null
+++ b/scripts/check-links.sh
@@ -0,0 +1,166 @@
+#!/usr/bin/env bash
+# check-links.sh — Verify all external URLs in content/*.md are reachable.
+#
+# Extracts https?:// URLs from content/*.md, sends a HEAD request (fallback to GET),
+# and reports 404s / connection errors. Known bot-blockers (403 responses from
+# hhs.gov, jamanetwork.com etc.) are treated as OK.
+#
+# Exit codes:
+#   0 — all links OK or only bot-blocked
+#   1 — one or more broken links found
+#   2 — usage error
+#
+# Usage:
+#   scripts/check-links.sh             # check all content files
+#   scripts/check-links.sh --fast      # stop at first failure
+#   scripts/check-links.sh --verbose   # show every URL being checked
+#   scripts/check-links.sh path/to.md  # check a specific file
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+FAST=0
+VERBOSE=0
+FILES=()
+
+for arg in "$@"; do
+  case "$arg" in
+    --fast) FAST=1 ;;
+    --verbose|-v) VERBOSE=1 ;;
+    --help|-h)
+      sed -n '2,18p' "$0" | sed 's/^# \{0,1\}//'
+      exit 0 ;;
+    -*) echo "Unknown option: $arg" >&2; exit 2 ;;
+    *) FILES+=("$arg") ;;
+  esac
+done
+
+# Default: all .md files under content/
+if [ "${#FILES[@]}" -eq 0 ]; then
+  mapfile -t FILES < <(find "$PROJECT_ROOT/content" -type f -name '*.md')
+fi
+
+if [ "${#FILES[@]}" -eq 0 ]; then
+  echo "No markdown files found under content/ — nothing to check." >&2
+  exit 2
+fi
+
+# Domains known to return 403 for automated requests but are valid sources.
+# Keep this list short — anything that doesn't load in a browser should fail.
+ALLOW_403='^(www\.)?(hhs\.gov|jamanetwork\.com|commission\.europa\.eu|digital-strategy\.ec\.europa\.eu|pubmed\.ncbi\.nlm\.nih\.gov|who\.int|oecd\.org|unesco\.org|unicef\.org|amnesty\.org|linkedin\.com|sciencedirect\.com|nature\.com|tandfonline\.com|springer\.com|wiley\.com|bmj\.com|nytimes\.com|ft\.com|wsj\.com|spiegel\.de|zeit\.de|lemonde\.fr|lefigaro\.fr|cnn\.com|worldhappiness\.report|danish-presidency\.consilium\.europa\.eu|consilium\.europa\.eu|europarl\.europa\.eu|ec\.europa\.eu)$'
+
+# Extract unique URLs from a markdown file. Strip trailing punctuation/brackets.
+extract_urls() {
+  local file="$1"
+  # grep exits 1 when a file contains no URLs; under pipefail that would fail
+  # the whole function, so treat "no match" as an empty, successful result.
+  { grep -Eo 'https?://[A-Za-z0-9._~:/?#@!$&+,;=%-]+' "$file" || [ "$?" -eq 1 ]; } \
+    | sed -E 's/[,.;:!?)\]]+$//' \
+    | sort -u
+}
+
+# One curl probe. Extra curl args (e.g. -I for HEAD) are passed through.
+# Prints the HTTP status code, or "000" if the request itself failed.
+fetch_status() {
+  local url="$1"; shift
+  curl -s -o /dev/null -w "%{http_code}" \
+    --max-time 10 \
+    --connect-timeout 5 \
+    --retry 1 \
+    --user-agent "Mozilla/5.0 (compatible; IFK-LinkChecker/1.0; +https://internetforkids.org)" \
+    "$@" -L "$url" 2>/dev/null || echo "000"
+}
+
+# Returns status code (HTTP number or "000" if fetch failed)
+check_url() {
+  local url="$1"
+  local status
+  status=$(fetch_status "$url" -I)
+  # Many servers reject HEAD → retry with GET
+  if [ "$status" = "000" ] || [ "$status" = "405" ] || [ "$status" = "403" ]; then
+    status=$(fetch_status "$url")
+  fi
+  echo "$status"
+}
+
+# Classify status code as OK / bot-blocked / broken
+classify() {
+  local status="$1" url="$2"
+  case "$status" in
+    2[0-9][0-9]|3[0-9][0-9]) echo "ok" ;;
+    403)
+      local host
+      host=$(echo "$url" | awk -F/ '{print $3}')
+      if [[ "$host" =~ $ALLOW_403 ]]; then
+        echo "ok-blocked"
+      else
+        echo "broken"
+      fi ;;
+    404|410|5[0-9][0-9]|000) echo "broken" ;;
+    *) echo "broken" ;;
+  esac
+}
+
+# Collect URL → files mapping
+declare -A URL_FILES=()
+for file in "${FILES[@]}"; do
+  while IFS= read -r url; do
+    [ -z "$url" ] && continue
+    rel="${file#"$PROJECT_ROOT"/}"
+    if [ -z "${URL_FILES[$url]+x}" ]; then
+      URL_FILES[$url]="$rel"
+    else
+      URL_FILES[$url]="${URL_FILES[$url]}, $rel"
+    fi
+  done < <(extract_urls "$file")
+done
+
+TOTAL=${#URL_FILES[@]}
+echo "Checking $TOTAL unique external URLs across ${#FILES[@]} files..."
+
+if [ "$TOTAL" -eq 0 ]; then
+  echo "No external URLs found."
+  exit 0
+fi
+
+BROKEN=0
+OK=0
+BLOCKED=0
+
+for url in "${!URL_FILES[@]}"; do
+  status=$(check_url "$url")
+  result=$(classify "$status" "$url")
+
+  case "$result" in
+    ok)
+      OK=$((OK + 1))
+      if [ "$VERBOSE" -eq 1 ]; then
+        printf "  OK   %s %s\n" "$status" "$url"
+      fi
+      ;;
+    ok-blocked)
+      BLOCKED=$((BLOCKED + 1))
+      if [ "$VERBOSE" -eq 1 ]; then
+        printf "  403* %s (bot-blocked, allowlisted)\n" "$url"
+      fi
+      ;;
+    broken)
+      BROKEN=$((BROKEN + 1))
+      printf "  FAIL %s %s\n       in: %s\n" "$status" "$url" "${URL_FILES[$url]}"
+      if [ "$FAST" -eq 1 ]; then
+        echo ""
+        echo "Aborting early (--fast). $BROKEN broken, $OK ok, $BLOCKED allowlisted."
+        exit 1
+      fi
+      ;;
+  esac
+done
+
+echo ""
+echo "Summary: $OK OK, $BLOCKED allowlisted (403), $BROKEN broken"
+
+if [ "$BROKEN" -gt 0 ]; then
+  exit 1
+fi