#!/bin/bash
# fix-broken-links - link-fix.sh
#
# Take a set of web files, extract every http(s) URL, and check each with curl.
#   • With file paths passed on the command line, any URL that is not 200 gets
#     spelling variations (http/https, www, trailing slash) then a Copilot CLI
#     agent hand-off for more alternatives, followed by an interactive menu to
#     replace / remove / skip.
#   • With NO file arguments it discovers changed files via git (falling back to a
#     repo scan) and only lists the broken links - no alternative lookups and no
#     prompts.
# Generic anchor text is flagged as an SEO note either way.
#
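# Bash (4+) plus grep/sed/sort/curl, with an optional Copilot CLI hand-off for
# suggestions.
# Covers: HTML · Markdown · JS/TS · JSON · CSS · SQL · templates (all via URL scan)
# Requires: bash 4+, curl, grep, sed, sort
# Optional: copilot (its output is filtered with awk and head), timeout, jq, git
#           (find is used for the repo scan when git reports nothing)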
set -uo pipefail

# The rest of the script relies on bash 4+ features (mapfile, associative arrays,
# ${var,,}). An older shell - e.g. the bash 3.2 shipped as /bin/bash on macOS -
# would fail part-way through with cryptic errors, so bow out up front with a
# clear message. Exit 0, like the curl check below, so a hook caller never fails
# because of us.
if [ "${BASH_VERSINFO[0]:-0}" -lt 4 ]; then
  printf 'fix-broken-links: bash 4 or newer required\n' >&2
  exit 0
fi

# The agent hand-off below invokes `copilot`, which may itself re-fire this hook.
# The child run is marked with this env var; exit immediately if it is present so
# we never recurse.
[ -n "${FIX_BROKEN_LINKS_AGENT:-}" ] && exit 0

LIMIT=50                   # per-file link count above which we ask before checking
TIMEOUT=10                 # curl --max-time, in seconds, for each probe
UA='Mozilla/5.0 (compatible; fix-broken-links/1.0)'
AGENT_MODEL='gpt-5-mini'   # small, low-token model for the suggestion hand-off
AGENT_TIMEOUT=60           # seconds before giving up on the agent
# Cap the agent call with `timeout` when it is available (coreutils; absent on
# some minimal / Git-Bash setups), otherwise run copilot unbounded. AGENT_RUN is
# a plain string that is deliberately word-split where it is used.
if command -v timeout >/dev/null 2>&1; then AGENT_RUN="timeout ${AGENT_TIMEOUT}"; else AGENT_RUN=""; fi
# File extensions (matched case-insensitively) that are worth scanning for URLs.
WEB_RE='\.(html?|xhtml|md|markdown|mdx|js|jsx|ts|tsx|vue|svelte|json|jsonl|css|sql|erb|jinja|j2|twig|ejs|pug|hbs)$'

# curl does all the probing; without it there is nothing useful to do.
command -v curl >/dev/null 2>&1 || { printf 'fix-broken-links: curl not found\n' >&2; exit 0; }

# ── Hook stdin ────────────────────────────────────────────────────────────────
# When called as a postToolUse hook, extract edited files from the JSON payload
# and inject them as positional args so collect_input picks them up.
_HOOK=""
if [ "$#" -eq 0 ] && [ ! -t 0 ]; then
  _INPUT=$(cat)
  # A hook always delivers a JSON payload. Stdin that is empty or whitespace-only
  # (cron, CI, `</dev/null`) is just a non-interactive manual run, so leave _HOOK
  # unset and let the normal git / repo discovery proceed instead of silently
  # doing nothing.
  if [[ "$_INPUT" =~ [^[:space:]] ]]; then
    _HOOK=1                    # invoked as a hook: stdin carries the tool payload
    if command -v jq >/dev/null 2>&1; then
      _TOOL=$(printf '%s' "$_INPUT" | jq -r '.toolName // .tool_name // empty' 2>/dev/null)
      case "$_TOOL" in
        editFiles|edit|write|str_replace_editor|create_file|multiEdit|applyPatch)
          # Only the files this edit tool just changed - never a wider repo scan.
          mapfile -t _FILES < <(
            printf '%s' "$_INPUT" \
              | jq -r '.tool_input.files[]? // .toolInput.files[]? // .tool_input.path // .toolInput.path // empty' 2>/dev/null
          )
          # Inject the edited files as positional args for the code below.
          [ "${#_FILES[@]}" -gt 0 ] && set -- "${_FILES[@]}"
          ;;
        "")
          # No recognisable tool name in the payload: nothing to extract. _HOOK
          # stays set, so file discovery later on does not widen to a repo scan.
          ;;
        *)
          # Different tool (bash, read, etc.) - nothing to check
          exit 0
          ;;
      esac
    fi
  fi
fi

# HAVE_PARAMS=1 when there is at least one positional argument - either paths
# typed on the command line or the edited files injected from the hook payload.
# That enables the full repair flow (alternative lookups plus prompts); with
# HAVE_PARAMS=0 the script only lists broken links.
if [ "$#" -gt 0 ]; then
  HAVE_PARAMS=1
else
  HAVE_PARAMS=0
fi

# Prompts are read from the terminal because stdin may be carrying hook JSON.
# TTY ends up as /dev/tty only when the device can really be opened for both
# writing and reading (a bare -r/-w test can succeed where open fails); otherwise
# it is empty, which callers treat as "no terminal".
TTY=""
if { true >/dev/tty; } 2>/dev/null && { true </dev/tty; } 2>/dev/null; then
  TTY=/dev/tty
fi
# Prompt on the terminal and echo the reply (without newline) to stdout.
#   $1 - prompt text, written to $TTY as-is
# Prints an empty string when there is no terminal ($TTY empty) or the read
# fails / hits end of input.
ask() {
  local prompt="$1" reply=""
  if [ -n "$TTY" ]; then
    printf '%s' "$prompt" > "$TTY"
    if ! IFS= read -r reply < "$TTY"; then
      reply=""
    fi
  fi
  printf '%s' "$reply"
}

# ── Helpers ───────────────────────────────────────────────────────────────────

# Probe a URL with curl and print the final HTTP status code (redirects are
# followed). The response body is discarded and curl's own stderr is silenced.
#   $1 - URL to probe
# Uses the globals TIMEOUT (curl --max-time) and UA (User-Agent header).
http_status() {
  local -a probe=(
    --silent
    --output /dev/null
    --write-out '%{http_code}'
    --max-time "$TIMEOUT"
    --location
    --user-agent "$UA"
  )
  curl "${probe[@]}" "$1" 2>/dev/null
}

# Backslash-escape the ERE metacharacters . ^ $ * + ? ( ) [ ] { } | \ in $1 and
# print the result, so the string can be embedded literally in a [[ =~ ]]
# pattern. Every other character is copied through untouched: putting a
# backslash in front of an ordinary character (e.g. '\:') is undefined in ERE
# and could stop the pattern from matching.
re_escape() {
  local text="$1" escaped="" ch pos=0
  while [ "$pos" -lt "${#text}" ]; do
    ch="${text:pos:1}"
    case "$ch" in
      '.'|'^'|'$'|'*'|'+'|'?'|'('|')'|'['|']'|'{'|'}'|'|'|'\')
        escaped+="\\${ch}"
        ;;
      *)
        escaped+="${ch}"
        ;;
    esac
    pos=$((pos + 1))
  done
  printf '%s' "$escaped"
}

# Slurp a whole file into a variable, newlines (including trailing ones) intact.
#   $1 - NAME of the variable to fill (resolved in the caller's scope)
#   $2 - path of the file to read
# `read -d ''` reports failure when it reaches end of file without finding a NUL
# delimiter, which is the normal case here, so the status is always forced to 0.
# No locals are declared on purpose: a local could shadow the caller's variable.
read_file() {
  IFS= read -r -d '' "$1" < "$2"
  return 0
}

# Print $1 with the glob metacharacters \ * ? [ backslash-escaped, so the result
# matches literally when used as the pattern of ${var//pattern/repl} (which
# otherwise treats the pattern as a glob). URLs and Markdown link spans are full
# of ? and [ ], hence the need for a fixed-string form.
glob_escape() {
  local text="$1" escaped="" ch pos
  for ((pos = 0; pos < ${#text}; pos++)); do
    ch="${text:pos:1}"
    if [[ "$ch" == '\' || "$ch" == '*' || "$ch" == '?' || "$ch" == '[' ]]; then
      escaped="${escaped}\\${ch}"
    else
      escaped="${escaped}${ch}"
    fi
  done
  printf '%s' "$escaped"
}

# Print every http(s) URL in a file, trailing punctuation trimmed, de-duplicated.
#   $1 - file to scan
# A URL ends at the first whitespace character, quote, angle bracket, closing
# parenthesis or backtick. Matching on the whole [:space:] class (not just a
# literal space) keeps the CR of CRLF line endings and tabs out of the URL, and
# the backtick stops Markdown inline code (`https://...`) from leaking its
# closing tick into the URL - either would make a good link look broken.
extract_urls() {
  grep -oiE 'https?://[^[:space:]"'\''<>)`]+' "$1" 2>/dev/null \
    | sed -E 's/[.,;:]+$//' \
    | sort -u
}

# Print link fragments whose visible text is generic ("click here", "read more",
# ...) and therefore weak for SEO. HTML anchors are reported first, then the
# "[text](" opening of Markdown links. Matching is case-insensitive.
#   $1 - file to scan
seo_scan() {
  local target="$1" pattern
  local html_anchor='<a[^>]*>[[:space:]]*(click here|click|here|read more|more|this page|this|learn more|see more|view|visit|details|info)[[:space:]]*</a>'
  local md_anchor='\[(click here|click|here|read more|more|this page|learn more|see more|details|info)\]\('
  for pattern in "$html_anchor" "$md_anchor"; do
    grep -oiE "$pattern" "$target" 2>/dev/null
  done
}

# Try common URL variations; echo the first that returns 200, else nothing.
#   $1 - the broken URL
# Candidates, in the order they are probed:
#   1. the other scheme (http <-> https; the scheme is compared case-insensitively
#      because URL extraction is case-insensitive too)
#   2. the host with "www." added or removed
#   3. a trailing slash on the path, when the last path segment has no dot
# Returns 0 and prints the working candidate, or returns 1 and prints nothing.
find_variation() {
  local url="$1" scheme rest host path cand base tail base_rest
  scheme="${url%%://*}"
  rest="${url#*://}"
  host="${rest%%/*}"
  if [ "$rest" = "$host" ]; then path=""; else path="/${rest#*/}"; fi

  local cands=()
  case "${scheme,,}" in
    http)  cands+=("https://${host}${path}") ;;
    https) cands+=("http://${host}${path}") ;;
  esac
  if [[ "$host" == www.* ]]; then
    cands+=("${scheme}://${host#www.}${path}")
  else
    cands+=("${scheme}://www.${host}${path}")
  fi

  # The slash belongs at the end of the PATH, so split off any ?query / #fragment
  # first and re-attach it afterwards: /docs?q=1 becomes /docs/?q=1, never
  # /docs?q=1/ (which only changes the query value).
  base="${url%%[?#]*}"
  tail="${url:${#base}}"
  base_rest="${base#*://}"
  if [[ "$base_rest" == */* ]] && [[ "$base" != */ ]] && [[ "${base##*/}" != *.* ]]; then
    cands+=("${base}/${tail}")
  fi

  for cand in "${cands[@]}"; do
    [ "$cand" = "$url" ] && continue
    [ "$(http_status "$cand")" = "200" ] && { printf '%s' "$cand"; return 0; }
  done
  return 1
}

# Ask the Copilot CLI agent for replacement candidates for one broken link.
#   $1 - the broken URL
#   $2 - maximum number of URLs to print
# The hand-off is a single non-interactive prompt to the small AGENT_MODEL,
# passed --available-tools with an empty list so the agent answers without
# tools. Its reply may contain prose, so http(s) tokens are pulled from anywhere
# in the text, trailing punctuation is trimmed, the broken URL itself is dropped
# and duplicates are removed (both case-insensitively). At most $2 URLs are
# printed, one per line. Prints nothing - and returns 0 - when copilot is not
# installed, fails, times out or stays silent.
agent_alts() {
  local broken="$1" limit="$2" budget question reply rc
  if ! command -v copilot >/dev/null 2>&1; then
    return 0
  fi

  budget=$((AGENT_TIMEOUT - 5))
  question="In under ${budget} seconds, find up to ${limit} working alternative URLs for the broken link ${broken}. Hierarchically consider 1. Path and/or page spelling; 2. web.archive.org/wayback; 3. Redirects using redirect destination; 4. The context of the link's text; in order to resolve. Output only the URLs. One per line, and no: prose, numbering, markdown, backticks, special characters, post formatting."

  # FIX_BROKEN_LINKS_AGENT marks the child run so a re-entrant hook exits early.
  # $AGENT_RUN is left unquoted on purpose: it is either empty or "timeout N".
  reply="$(FIX_BROKEN_LINKS_AGENT=1 $AGENT_RUN copilot -p "$question" \
            -s --no-color --model "$AGENT_MODEL" --available-tools 2>/dev/null)"
  rc=$?
  if [ "$rc" -ne 0 ] || [ -z "$reply" ]; then
    return 0
  fi

  grep -oiE 'https?://[^][:space:]"'\''<>)]+' <<< "$reply" \
    | sed -E 's/[.,;:]+$//' \
    | awk -v bad="$broken" 'tolower($0) != tolower(bad) && !seen[tolower($0)]++' \
    | head -n "$limit"
}

# Print up to MAX replacement candidates for a broken link, best first:
#   1. a scheme / www / trailing-slash variation that find_variation saw return 200
#   2. whatever agent_alts proposes (the Copilot CLI agent)
#   $1 - the broken URL
#   $2 - MAX, defaults to 6
# One URL per line, de-duplicated case-insensitively. The caller offers the
# first line under `r` and the rest as numbered alternatives. Prints nothing when
# there are no candidates; always returns 0.
suggest_alts() {
  local url="$1" max="${2:-6}" candidate lowered
  local -A taken=()
  local picks=()

  if candidate="$(find_variation "$url")" && [ -n "$candidate" ]; then
    picks+=("$candidate")
    taken["${candidate,,}"]=1
  fi

  while IFS= read -r candidate; do
    if [ "${#picks[@]}" -ge "$max" ]; then
      break
    fi
    if [ -z "$candidate" ]; then
      continue
    fi
    lowered="${candidate,,}"
    if [ -z "${taken[$lowered]:-}" ]; then
      picks+=("$candidate")
      taken[$lowered]=1
    fi
  done < <(agent_alts "$url" "$max")

  if [ "${#picks[@]}" -gt 0 ]; then
    printf '%s\n' "${picks[@]}"
  fi
}

# Replace a literal URL everywhere in a file (pure bash, no regex).
#   $1 - file to rewrite in place
#   $2 - URL to look for (matched as a fixed string)
#   $3 - replacement URL (inserted verbatim)
# Returns 1, leaving the file untouched, when the file cannot be read.
replace_url() {
  local file="$1" old="$2" new="$3" content="" pat
  [ -r "$file" ] || return 1
  read_file content "$file"
  pat="$(glob_escape "$old")"
  # The replacement must be quoted: since bash 5.2 (patsub_replacement, on by
  # default) an unquoted '&' in the replacement expands to the matched text,
  # which would corrupt any URL that carries a query string such as ?a=1&b=2.
  content=${content//$pat/"$new"}
  printf '%s' "$content" > "$file"
}

# Remove the link wrapper but keep the visible text:
#   <a href="URL">text</a>  ->  text
#   [text](URL)             ->  text
#   $1 - file to rewrite in place
#   $2 - URL whose links should be unwrapped
# Each matched wrapper is swapped for its inner text via literal replacement.
remove_link() {
  local file="$1" url="$2" content esc re pat next
  read_file content "$file"
  esc="$(re_escape "$url")"
  # Three wrapper shapes: double-quoted href, single-quoted href, Markdown link.
  for re in \
    '<a[^>]*href="'"$esc"'"[^>]*>([^<]*)</a>' \
    "<a[^>]*href='${esc}'[^>]*>([^<]*)</a>" \
    '\[([^]]*)\]\('"$esc"'[^)]*\)'; do
    while [[ $content =~ $re ]]; do
      # The matched span often contains [ and ] (Markdown), which are glob
      # metacharacters, so escape it before the literal substitution.
      pat="$(glob_escape "${BASH_REMATCH[0]}")"
      # Quote the kept text: since bash 5.2 (patsub_replacement) an unquoted '&'
      # in the replacement expands to the matched span, which would mangle link
      # text such as "R&D" or the HTML entity "&amp;".
      next=${content//$pat/"${BASH_REMATCH[1]}"}
      # If the substitution changed nothing the regex would match forever.
      [ "$next" = "$content" ] && break
      content=$next
    done
  done
  printf '%s' "$content" > "$file"
}

# ── File discovery ────────────────────────────────────────────────────────────

# Print the candidate files to check, one path per line.
#   "$@" - explicit paths; when present they are echoed back and nothing else
# Without arguments:
#   - in hook mode (_HOOK set) print nothing, so a hook never scans beyond the
#     files it was handed;
#   - inside a git work tree, print the files changed against HEAD plus the
#     staged ones;
#   - otherwise (or when git reports no changes) fall back to every file under
#     the current directory, pruning VCS / dependency / build directories.
# Filtering by extension and de-duplication are left to the caller.
collect_input() {
  if [ "$#" -gt 0 ]; then printf '%s\n' "$@"; return; fi
  # Fired as a hook but the payload carried no (web) files: do nothing rather than
  # fall back to scanning unrelated files - the hook only ever checks edited files.
  [ -n "$_HOOK" ] && return
  local out="" cdup="" changed
  if command -v git >/dev/null 2>&1 && git rev-parse --git-dir >/dev/null 2>&1; then
    # git prints paths relative to the top of the work tree, not to the current
    # directory. --show-cdup is the "../" prefix leading back to the top (empty
    # when already there); prepending it keeps the paths valid from a subdirectory.
    cdup="$(git rev-parse --show-cdup 2>/dev/null)"
    # core.quotePath=false: print non-ASCII file names verbatim instead of as
    # quoted octal escapes, which would never pass the caller's -f test.
    out="$({ git -c core.quotePath=false diff --name-only HEAD
             git -c core.quotePath=false diff --name-only --cached; } 2>/dev/null)"
  fi
  if [ -n "$out" ]; then
    while IFS= read -r changed; do
      if [ -n "$changed" ]; then printf '%s%s\n' "$cdup" "$changed"; fi
    done <<< "$out"
    return 0
  fi
  find . -type d \( -name .git -o -name node_modules -o -name dist -o -name build \
    -o -name .next -o -name .venv -o -name __pycache__ \) -prune \
    -o -type f -print 2>/dev/null
}

# Build FILES: the existing, de-duplicated web files among the candidates that
# collect_input prints. SEEN remembers the paths already accepted.
declare -A SEEN
FILES=()
while IFS= read -r f; do
  [ -z "$f" ] && continue
  [ -f "$f" ] || continue
  # Skip dependency / VCS / build output. git reports paths relative to the repo
  # root ("dist/app.js"), so the directory may be the FIRST path component as
  # well as a nested one ("./dist/app.js", "pkg/dist/app.js").
  case "$f" in
    node_modules/*|.git/*|dist/*|build/*) continue ;;
    */node_modules/*|*/.git/*|*/dist/*|*/build/*) continue ;;
  esac
  # Case-insensitive extension test done in-shell: WEB_RE is all lower case, so
  # lower-casing the path gives the same result as `grep -i` without forking two
  # processes per file (which adds up during a full repo scan).
  [[ "${f,,}" =~ $WEB_RE ]] || continue
  [ -n "${SEEN[$f]:-}" ] && continue
  SEEN[$f]=1
  FILES+=("$f")
done < <(collect_input "$@")

# Nothing to check: leave quietly.
[ "${#FILES[@]}" -eq 0 ] && exit 0

# ── Scan ──────────────────────────────────────────────────────────────────────

# Results of the scan. B_FILE / B_URL / B_STATUS / B_ALT are parallel arrays:
# entry i of each describes one broken link (B_ALT holds the newline-separated
# suggestions, or an empty string). SEO_LINES collects generic-anchor findings.
B_FILE=()
B_URL=()
B_STATUS=()
B_ALT=()
SEO_LINES=()

for file in "${FILES[@]}"; do
  # SEO findings are recorded for every file, prefixed with the file name.
  while IFS= read -r line; do
    if [ -n "$line" ]; then
      SEO_LINES+=("$file: $line")
    fi
  done < <(seo_scan "$file")

  mapfile -t urls < <(extract_urls "$file")
  if [ "${#urls[@]}" -eq 0 ]; then
    continue
  fi

  # In the repair flow, ask before probing a file with more than LIMIT links.
  # Only an explicit "no" skips the file; an empty reply (or no terminal) goes on.
  if [ "$HAVE_PARAMS" = "1" ] && [ "${#urls[@]}" -gt "$LIMIT" ]; then
    ans="$(ask "  ${file} has ${#urls[@]} links (limit ${LIMIT}). Continue? [Y/n] ")"
    case "$ans" in
      n|N|no|NO)
        continue
        ;;
    esac
  fi

  printf '\n  Checking %d link(s) in %s ...\n' "${#urls[@]}" "$file"
  for url in "${urls[@]}"; do
    status="$(http_status "$url")"
    if [ "$status" != "200" ]; then
      printf '    BROKEN (%s) %s\n' "$status" "$url"
      # Only look up replacements when files were passed; otherwise just list.
      if [ "$HAVE_PARAMS" = "1" ]; then
        alts="$(suggest_alts "$url" 6)"
      else
        alts=""
      fi
      B_FILE+=("$file")
      B_URL+=("$url")
      B_STATUS+=("$status")
      B_ALT+=("$alts")
    fi
  done
done

# ── SEO report ────────────────────────────────────────────────────────────────

# Print the generic-anchor findings, if any, under a ruled heading.
if ((${#SEO_LINES[@]} > 0)); then
  printf '\n%s\n  SEO anchor issues (consider descriptive link text)\n' \
    "------------------------------------------------------------"
  # printf re-applies the format once per array element: one indented line each.
  printf '    %s\n' "${SEO_LINES[@]}"
fi

# With no broken links there is nothing left to report or repair.
if ((${#B_URL[@]} == 0)); then
  printf '\n  No broken links found.\n\n'
  exit 0
fi

# ── Interactive fix ───────────────────────────────────────────────────────────

# Report banner.
printf '\n%s\n  fix-broken-links report\n%s\n' \
  "============================================================" \
  "============================================================"

# CHANGED is used as a set: its keys are the files rewritten during this run.
declare -A CHANGED
n="${#B_URL[@]}"
for ((i = 0; i < n; i++)); do
  file="${B_FILE[i]}"
  url="${B_URL[i]}"
  status="${B_STATUS[i]}"

  printf '\n  [%d] %s\n' "$((i + 1))" "$file"
  printf '    URL : %s\n' "$url"
  note=""
  case "$status" in
    ERR|000|TIMEOUT) note="  (unreachable)" ;;
  esac
  printf '    HTTP: %s%s\n' "$status" "$note"

  # No file parameters → report-only: list the broken link and move on.
  if [ "$HAVE_PARAMS" != "1" ]; then
    continue
  fi

  # Split the stored newline-separated suggestions back into an array.
  alts=()
  if [ -n "${B_ALT[i]}" ]; then
    mapfile -t alts <<< "${B_ALT[i]}"
  fi

  # Menu: `r` takes the best suggestion, digits 1.. the remaining ones.
  printf '\n'
  if [ "${#alts[@]}" -gt 0 ]; then
    printf '    r  Replace -> %s\n' "${alts[0]}"
    for ((k = 1; k < ${#alts[@]}; k++)); do
      printf '    %d  Replace -> %s\n' "$k" "${alts[k]}"
    done
  fi
  printf '    d  Remove link, keep text\n'
  printf '    c  Custom replacement URL\n'
  printf '    s  Skip\n'

  if [ -z "$TTY" ]; then
    printf '    (no terminal - reporting only)\n'
    continue
  fi

  # Keep prompting until a choice is carried out or the entry is skipped.
  while :; do
    ch="$(ask '  > ')"
    case "$ch" in
      s|"")
        break
        ;;
      d)
        remove_link "$file" "$url"
        CHANGED[$file]=1
        printf '    removed\n'
        break
        ;;
      r)
        if [ "${#alts[@]}" -gt 0 ]; then
          replace_url "$file" "$url" "${alts[0]}"
          CHANGED[$file]=1
          printf '    replaced -> %s\n' "${alts[0]}"
          break
        else
          printf '    no suggestion available\n'
        fi
        ;;
      [1-9])
        if [ "$ch" -lt "${#alts[@]}" ]; then
          replace_url "$file" "$url" "${alts[$ch]}"
          CHANGED[$file]=1
          printf '    replaced -> %s\n' "${alts[$ch]}"
          break
        else
          printf '    invalid choice\n'
        fi
        ;;
      c)
        u="$(ask '  URL: ')"
        # An empty custom URL simply re-shows the prompt.
        if [ -n "$u" ]; then
          replace_url "$file" "$url" "$u"
          CHANGED[$file]=1
          printf '    replaced\n'
          break
        fi
        ;;
      *)
        printf '    invalid choice\n'
        ;;
    esac
  done
done

# Summary of rewritten files. The ${CHANGED[*]+x} probe keeps `set -u` from
# tripping over an associative array that never received a key.
if [ "${CHANGED[*]+x}" = x ] && [ "${#CHANGED[@]}" -gt 0 ]; then
  printf '\n  %d file(s) updated:\n' "${#CHANGED[@]}"
  for f in "${!CHANGED[@]}"; do
    printf '    %s\n' "$f"
  done
  printf '\n'
fi
exit 0
