Compare commits
2 Commits
session/CF
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c35ff84fc | ||
|
|
03fcececfc |
@@ -53,6 +53,5 @@
|
||||
</footer>
|
||||
|
||||
{{ partial "rybbit.html" . }}
|
||||
{{ partial "structured-data.html" . }}
|
||||
</body>
|
||||
</html>
|
||||
|
||||
153
scripts/check-links.sh
Executable file
153
scripts/check-links.sh
Executable file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env bash
# check-links.sh — Verify all external URLs in content/*.md are reachable.
#
# Extracts https?:// URLs from content/*.md, sends a HEAD request (fallback to GET),
# and reports 404s / connection errors. Known bot-blockers (403 responses from
# hhs.gov, jamanetwork.com etc.) are treated as OK.
#
# Exit codes:
#   0 — all links OK or only bot-blocked
#   1 — one or more broken links found
#   2 — usage error
#
# Usage:
#   scripts/check-links.sh            # check all content files
#   scripts/check-links.sh --fast     # stop at first failure
#   scripts/check-links.sh --verbose  # show every URL being checked
#   scripts/check-links.sh path/to.md # check a specific file

set -euo pipefail

# Resolve paths relative to this script so it works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
readonly SCRIPT_DIR PROJECT_ROOT

FAST=0      # --fast: abort on the first broken link
VERBOSE=0   # --verbose/-v: print every URL as it is checked
FILES=()    # explicit files given on the command line

for arg in "$@"; do
  case "$arg" in
    --fast) FAST=1 ;;
    --verbose|-v) VERBOSE=1 ;;
    --help|-h)
      # Print the comment header above (everything after the shebang, up to
      # the first non-comment line) with the leading '# ' stripped.  Unlike
      # the previous hard-coded `sed -n '2,18p'`, this stays correct when
      # the header grows or shrinks.
      awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
      exit 0 ;;
    -*) echo "Unknown option: $arg" >&2; exit 2 ;;
    *) FILES+=("$arg") ;;
  esac
done

# Default: all .md files under content/
if [ "${#FILES[@]}" -eq 0 ]; then
  mapfile -t FILES < <(find "$PROJECT_ROOT/content" -type f -name '*.md')
fi
|
||||
|
||||
# Domains known to return 403 for automated requests but are valid sources.
# Keep this list short — anything that doesn't load in a browser should fail.
# Extended regex matched against the bare hostname of a URL that answered
# 403 (an optional leading "www." is tolerated); see classify() below.
ALLOW_403='^(www\.)?(hhs\.gov|jamanetwork\.com|commission\.europa\.eu|digital-strategy\.ec\.europa\.eu|pubmed\.ncbi\.nlm\.nih\.gov|who\.int|oecd\.org|unesco\.org|unicef\.org|amnesty\.org|linkedin\.com|sciencedirect\.com|nature\.com|tandfonline\.com|springer\.com|wiley\.com|bmj\.com|nytimes\.com|ft\.com|wsj\.com|spiegel\.de|zeit\.de|lemonde\.fr|lefigaro\.fr|cnn\.com|worldhappiness\.report|danish-presidency\.consilium\.europa\.eu|consilium\.europa\.eu|europarl\.europa\.eu|ec\.europa\.eu)$'
|
||||
|
||||
# Extract URLs from markdown. Strip trailing punctuation/brackets.
|
||||
extract_urls() {
|
||||
local file="$1"
|
||||
grep -Eo 'https?://[A-Za-z0-9._~:/?#@!$&+,;=%-]+' "$file" \
|
||||
| sed -E 's/[,.;:!?)\]]+$//' \
|
||||
| sort -u
|
||||
}
|
||||
|
||||
# Returns status code (HTTP number or "000" if fetch failed).
#
# Tries a cheap HEAD request first; many servers reject HEAD (405/403) or
# drop the connection, so those cases are retried with a full GET.
check_url() {
  local url="$1"
  local status
  # Options shared by both attempts.
  local -a curl_opts=(
    -sS -o /dev/null -w "%{http_code}"
    --max-time 10
    --connect-timeout 5
    --retry 1
    --user-agent "Mozilla/5.0 (compatible; IFK-LinkChecker/1.0; +https://internetforkids.org)"
    -L
  )
  # NB: curl prints the -w output ("000") even when the transfer fails, so
  # the fallback must OVERWRITE status rather than append inside the command
  # substitution — the previous `$(curl … || echo "000")` could produce
  # "000\n000", which then failed every string comparison below.
  status=$(curl "${curl_opts[@]}" -I "$url" 2>/dev/null) || status="000"
  # Many servers reject HEAD → retry with GET
  if [ "$status" = "000" ] || [ "$status" = "405" ] || [ "$status" = "403" ]; then
    status=$(curl "${curl_opts[@]}" "$url" 2>/dev/null) || status="000"
  fi
  echo "$status"
}
|
||||
|
||||
# Classify status code as OK / bot-blocked / broken.
# Prints exactly one of: "ok", "ok-blocked" (403 from an allowlisted host),
# or "broken".
classify() {
  local http_status=$1
  local url=$2

  # 2xx and 3xx count as reachable (curl already followed redirects).
  if [[ "$http_status" == 2[0-9][0-9] || "$http_status" == 3[0-9][0-9] ]]; then
    echo "ok"
    return
  fi

  if [[ "$http_status" == 403 ]]; then
    # Isolate the hostname: drop the scheme, then everything from the
    # first slash on (no awk subprocess needed).
    local host=${url#*://}
    host=${host%%/*}
    if [[ "$host" =~ $ALLOW_403 ]]; then
      echo "ok-blocked"
    else
      echo "broken"
    fi
    return
  fi

  # 404/410/5xx/000 and anything unexpected all count as broken.
  echo "broken"
}
|
||||
|
||||
# Collect URL → files mapping: each unique URL maps to a comma-separated
# list of the repo-relative paths of every markdown file referencing it.
declare -A URL_FILES=()
for file in "${FILES[@]}"; do
  # Repo-relative path, for readable failure reports.  The prefix is quoted
  # so glob characters in $PROJECT_ROOT are taken literally (SC2295).
  rel="${file#"$PROJECT_ROOT/"}"
  while IFS= read -r url; do
    [ -z "$url" ] && continue
    # Append to the existing list or start a new one.
    URL_FILES[$url]="${URL_FILES[$url]:+${URL_FILES[$url]}, }$rel"
  done < <(extract_urls "$file")
done
|
||||
|
||||
# Announce the workload and bail out early when there is nothing to do.
TOTAL=${#URL_FILES[@]}
echo "Checking $TOTAL unique external URLs across ${#FILES[@]} files..."

if [ "$TOTAL" -eq 0 ]; then
  echo "No external URLs found."
  exit 0
fi

# Running tallies for the final summary.
OK=0
BLOCKED=0
BROKEN=0
|
||||
|
||||
# Check every collected URL and tally the outcome.  Failures are always
# printed; successes and allowlisted 403s only with --verbose.
for url in "${!URL_FILES[@]}"; do
  status=$(check_url "$url")
  verdict=$(classify "$status" "$url")

  if [ "$verdict" = "ok" ]; then
    OK=$((OK + 1))
    if [ "$VERBOSE" -eq 1 ]; then
      printf " OK %s %s\n" "$status" "$url"
    fi
  elif [ "$verdict" = "ok-blocked" ]; then
    BLOCKED=$((BLOCKED + 1))
    if [ "$VERBOSE" -eq 1 ]; then
      printf " 403* %s (bot-blocked, allowlisted)\n" "$url"
    fi
  else
    # "broken" — report, and under --fast abort immediately with a summary.
    BROKEN=$((BROKEN + 1))
    printf " FAIL %s %s\n in: %s\n" "$status" "$url" "${URL_FILES[$url]}"
    if [ "$FAST" -eq 1 ]; then
      echo ""
      echo "Aborting early (--fast). $BROKEN broken, $OK ok, $BLOCKED allowlisted."
      exit 1
    fi
  fi
done
|
||||
|
||||
# Final report.  Exit non-zero only when something is actually broken —
# allowlisted 403s do not fail the run.
echo ""
echo "Summary: $OK OK, $BLOCKED allowlisted (403), $BROKEN broken"

if (( BROKEN > 0 )); then
  exit 1
fi
|
||||
Reference in New Issue
Block a user