#!/usr/bin/env python3
"""
OSINT Investigator — PDF Report Generator
Converts a structured OSINT report written in markdown into a clean PDF.

Usage (from agent):
    python3 generate_pdf.py --target "Jack Gooding" --input report.md --output ~/Desktop/

Usage (direct):
    python3 generate_pdf.py --target "Target Name" --markdown "..." --output /path/to/dir
"""

import argparse
import json
import os
import re
import sys
from datetime import datetime

try:
    from fpdf import FPDF
except ImportError:
    print("ERROR: fpdf2 not installed. Run: pip3 install fpdf2 --break-system-packages")
    sys.exit(1)

# Confidence level styling.
# Maps a lowercase confidence key to its display attributes:
#   "emoji" - the emoji associated with the level (not read by the visible
#             rendering code in this file),
#   "label" - the short text drawn in badges by OSINTReport,
#   "r"/"g"/"b" - the RGB components passed to set_fill_color/set_text_color.
CONFIDENCE = {
    "high":   {"emoji": "🟢", "label": "HIGH",   "r": 34,  "g": 139, "b": 34},
    "medium": {"emoji": "🟠", "label": "MED",    "r": 255, "g": 140, "b": 0},
    "low":    {"emoji": "🔴", "label": "LOW",    "r": 204, "g": 0,   "b": 0},
    "unverified": {"emoji": "⚪", "label": "UNVERIFIED", "r": 128, "g": 128, "b": 128},
}

# Path of the optional JSON config, relative to this script's directory
# (../config/osint_config.json). load_config() normalises it before use.
CONFIG_PATH = os.path.join(os.path.dirname(__file__), '..', 'config', 'osint_config.json')


def load_config():
    """Load the optional JSON configuration file.

    Returns:
        The parsed contents of the file at ``CONFIG_PATH`` when it exists,
        otherwise a default configuration that enables PDF output and the
        sources section.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain
            valid JSON.
    """
    path = os.path.normpath(CONFIG_PATH)
    if not os.path.exists(path):
        return {"output": {"pdf_enabled": True, "pdf_include_sources": True}}

    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding (which is not UTF-8 on every platform).
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# Single-pass translation table used by sanitize(). Every key is exactly one
# code point so the table can be applied with str.translate.
_SANITIZE_TABLE = str.maketrans({
    # Typographic punctuation -> ASCII look-alikes.
    "\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"',
    "\u2013": "-", "\u2014": "--", "\u2022": "*", "\u2026": "...",
    "\u00a0": " ",
    # Check / cross marks (U+2713 previously mapped to itself and ended up
    # as "?" after the latin-1 fallback).
    "\u2713": "[OK]", "\u2715": "x",
    # Confidence emojis -> text.
    "\U0001F7E2": "[HIGH]",   # green circle
    "\U0001F7E0": "[MED]",    # orange circle
    "\U0001F534": "[LOW]",    # red circle
    "\u26AA": "[?]",          # white circle
    "\u2705": "[OK]",         # check mark button
    "\u274C": "[NO]",         # cross mark
    "\u26A0": "[WARN]",       # warning sign (with or without U+FE0F)
    # Variation selectors carry no text of their own; dropping them lets the
    # base symbol match whether or not it was written in emoji presentation.
    "\uFE0E": "", "\uFE0F": "",
    # Decorative emojis that are simply removed.
    "\U0001F50D": "",  # magnifying glass
    "\U0001F3CF": "",  # cricket
    "\U0001F4BB": "",  # laptop
    "\U0001F4F1": "",  # mobile phone
    "\U0001F426": "",  # bird
    "\U0001F3E2": "",  # office building
    "\U0001F4C4": "",  # page
    "\U0001F517": "",  # link
    "\U0001F310": "",  # globe
    "\U0001F4DE": "",  # telephone receiver
    "\U0001F4E7": "",  # e-mail
    "\U0001F5FA": "",  # world map
    "\U0001F5BC": "",  # framed picture
    "\U0001F4CB": "",  # clipboard
    "\U0001F3AF": "",  # direct hit
})


def sanitize(text):
    """Return *text* reduced to characters representable in latin-1.

    Known typographic characters and emojis are replaced by ASCII
    equivalents (or removed); any other character outside latin-1 becomes
    ``?``.

    Args:
        text: The string to clean.

    Returns:
        A string that can be encoded as latin-1 without errors.
    """
    translated = text.translate(_SANITIZE_TABLE)
    # Anything still outside latin-1 is replaced with "?" by the codec.
    return translated.encode('latin-1', errors='replace').decode('latin-1')


class OSINTReport(FPDF):
    """FPDF document providing the layout building blocks of an OSINT report.

    All coordinates are hard-coded for a 210-unit-wide page, i.e. they assume
    the FPDF constructor defaults are A4 portrait in millimetres (see the
    fpdf2 documentation). Every string drawn from report data is passed
    through ``sanitize`` because the built-in Helvetica font is limited to
    latin-1.
    """

    def __init__(self, title, target, date_str):
        """Create the document.

        Args:
            title: Report title, stored as ``report_title``.
            target: Name of the investigation target, shown in the page
                header and on the cover page.
            date_str: Preformatted date string shown in the page header and
                on the cover page.
        """
        super().__init__()
        self.report_title = title
        self.target = target
        self.date_str = date_str
        self.set_auto_page_break(auto=True, margin=15)

    def header(self):
        """Draw the dark banner with target and date at the top of a page."""
        self.set_fill_color(20, 20, 40)
        self.rect(0, 0, 210, 18, 'F')
        self.set_font("Helvetica", "B", 11)
        self.set_text_color(255, 255, 255)
        self.set_xy(10, 4)
        self.cell(0, 10, sanitize(f"OSINT REPORT  |  {self.target}  |  {self.date_str}"), ln=False)
        self.set_text_color(0, 0, 0)
        # Move the cursor below the 18-unit banner so body text starts clear of it.
        self.ln(18)

    def footer(self):
        """Draw the centred page-number / confidentiality line."""
        self.set_y(-12)
        self.set_font("Helvetica", "I", 8)
        self.set_text_color(150, 150, 150)
        self.cell(0, 10, f"Page {self.page_no()} | Generated by OSINT Investigator | CONFIDENTIAL", align="C")
        self.set_text_color(0, 0, 0)

    def _cover_meta_row(self, label, value):
        """Draw one "label: value" row of the cover page's meta box."""
        self.set_font("Helvetica", "B", 10)
        self.set_fill_color(240, 242, 248)
        self.set_x(15)
        self.cell(55, 8, label, fill=True)
        self.set_font("Helvetica", "", 10)
        self.cell(0, 8, sanitize(value), ln=True)

    def cover_page(self, query, target_type, summary=None):
        """Add the cover page.

        The page shows a title block, the target / type / query meta rows,
        the confidence colour legend and, when given, an executive summary.

        Args:
            query: The investigation query; only its first 80 characters
                are shown.
            target_type: Description of the kind of target.
            summary: Optional summary text rendered below the legend.
        """
        self.add_page()
        # Dark header block
        self.set_fill_color(20, 20, 40)
        self.rect(0, 0, 210, 70, 'F')

        self.set_font("Helvetica", "B", 28)
        self.set_text_color(255, 255, 255)
        self.set_xy(15, 15)
        self.cell(0, 12, "OSINT REPORT", ln=True)

        self.set_font("Helvetica", "B", 18)
        self.set_text_color(100, 200, 255)
        self.set_x(15)
        self.cell(0, 10, sanitize(self.target), ln=True)

        self.set_font("Helvetica", "", 10)
        self.set_text_color(180, 180, 220)
        self.set_x(15)
        self.cell(0, 8, f"Generated: {self.date_str}", ln=True)

        self.set_text_color(0, 0, 0)
        self.set_y(80)

        # Meta boxes; "or ''" keeps a missing value from crashing the slice
        # or sanitize().
        self._cover_meta_row("Target:", self.target)
        self._cover_meta_row("Target Type:", target_type or "")
        self._cover_meta_row("Query:", (query or "")[:80])

        # Confidence legend
        self.ln(6)
        self.set_font("Helvetica", "B", 10)
        self.set_x(15)
        self.cell(0, 7, "Confidence Legend:", ln=True)

        legend = [
            ("high",   "Green  [HIGH]",       "Verified from multiple reliable sources"),
            ("medium", "Orange [MED]",        "Likely correct, single source or unverified"),
            ("low",    "Red    [LOW]",        "Possible match, little corroborating evidence"),
            ("unverified", "Grey  [UNVERIFIED]", "User-provided context, not independently confirmed"),
        ]
        for conf_key, label, desc in legend:
            conf = CONFIDENCE[conf_key]
            self.set_font("Helvetica", "B", 9)
            self.set_text_color(conf['r'], conf['g'], conf['b'])
            self.set_x(20)
            self.cell(38, 6, label)
            self.set_font("Helvetica", "", 9)
            self.set_text_color(80, 80, 80)
            self.cell(0, 6, sanitize(desc), ln=True)
        self.set_text_color(0, 0, 0)

        if summary:
            self.ln(4)
            self.set_x(15)
            self.set_font("Helvetica", "B", 10)
            self.set_fill_color(230, 240, 255)
            self.cell(180, 7, "Executive Summary", fill=True, ln=True)
            self.set_font("Helvetica", "", 9)
            self.set_x(15)
            self.multi_cell(180, 5, sanitize(summary))

    def section_header(self, title):
        """Draw a full-width dark bar containing *title* in white."""
        self.ln(4)
        self.set_fill_color(20, 20, 40)
        self.set_text_color(255, 255, 255)
        self.set_font("Helvetica", "B", 11)
        self.set_x(10)
        self.cell(190, 8, sanitize(f"  {title}"), fill=True, ln=True)
        self.set_text_color(0, 0, 0)
        self.ln(1)

    def finding_row(self, finding, source, confidence_key):
        """Render a single finding row with confidence badge.

        Args:
            finding: Finding text; truncated to 90 characters.
            source: Source description; truncated to 45 characters.
            confidence_key: Key into ``CONFIDENCE`` (case-insensitive).
                Unknown or missing keys are rendered as "low".
        """
        conf = CONFIDENCE.get((confidence_key or "").lower(), CONFIDENCE["low"])

        # Confidence badge
        self.set_fill_color(conf['r'], conf['g'], conf['b'])
        self.set_text_color(255, 255, 255)
        self.set_font("Helvetica", "B", 7)
        self.set_x(10)
        self.cell(22, 6, conf['label'], fill=True, align="C")

        # Finding text
        self.set_text_color(0, 0, 0)
        self.set_font("Helvetica", "", 9)
        self.set_x(35)
        self.cell(105, 6, sanitize(finding[:90]))

        # Source
        self.set_font("Helvetica", "I", 7)
        self.set_text_color(100, 100, 100)
        self.cell(0, 6, sanitize(source[:45]), ln=True)
        self.set_text_color(0, 0, 0)

        # Light separator line
        self.set_draw_color(220, 220, 220)
        self.line(10, self.get_y(), 200, self.get_y())

    def add_text_block(self, heading, body, confidence_key=None):
        """Add a named text block, optionally with a confidence badge.

        Args:
            heading: Optional bold heading line. The confidence badge is
                drawn only when a heading is given.
            body: Optional body text, wrapped to a 180-unit-wide column.
            confidence_key: Optional key into ``CONFIDENCE``
                (case-insensitive); unknown keys are rendered as "low".
        """
        if heading:
            self.set_font("Helvetica", "B", 9)
            self.set_x(12)
            if confidence_key:
                conf = CONFIDENCE.get(confidence_key.lower(), CONFIDENCE["low"])
                self.set_text_color(conf['r'], conf['g'], conf['b'])
                self.cell(25, 5, f"[{conf['label']}]")
            self.set_text_color(30, 30, 30)
            self.cell(0, 5, sanitize(heading), ln=True)

        if body:
            self.set_font("Helvetica", "", 9)
            self.set_text_color(60, 60, 60)
            self.set_x(15)
            self.multi_cell(180, 4.5, sanitize(body))
        self.set_text_color(0, 0, 0)
        self.ln(1)

    def sources_section(self, sources):
        """Render a "Sources" section listing *sources* as a numbered list.

        Args:
            sources: Iterable of source strings (typically URLs).
        """
        self.section_header("Sources")
        self.set_font("Helvetica", "", 8)
        self.set_text_color(40, 40, 120)
        for i, src in enumerate(sources, 1):
            self.set_x(12)
            self.cell(8, 5, f"{i}.")
            self.multi_cell(178, 5, sanitize(src))
        self.set_text_color(0, 0, 0)


# "**Label:** value" header lines and the meta key each one fills.
_META_PREFIXES = (
    ("**Date:**", "date"),
    ("**Target Type:**", "target_type"),
    ("**Query:**", "query"),
)
_TITLE_PREFIX = "# OSINT Report:"
# A list item ("- ", "* ", "+ ", "1. ", "1) ") whose text starts with "http".
_SOURCE_ITEM_RE = re.compile(r'^(?:[-*+]|\d+[.)])\s+(http.*)$')


def parse_markdown_report(md_text):
    """Parse a markdown OSINT report into structured sections.

    Args:
        md_text: The report as markdown text. Any line-ending convention is
            accepted.

    Returns:
        A ``(meta, sections, sources)`` tuple where

        * ``meta`` is a dict with the keys ``target``, ``date``, ``query``,
          ``target_type`` and ``summary``; values not found in the text stay
          ``""`` (``summary`` is never filled by this function),
        * ``sections`` is a list of ``(title, body)`` tuples, one per ``## ``
          heading, in document order; text before the first ``## `` heading
          is discarded,
        * ``sources`` is the list of URLs found in any section whose title
          contains "sources" (case-insensitive).
    """
    sections = []
    sources = []
    meta = {"target": "", "date": "", "query": "", "target_type": "", "summary": ""}
    current_section = None
    current_body = []
    in_sources = False

    # splitlines() (rather than split('\n')) so CRLF input does not leave
    # "\r" on every line, which would defeat later blank-line splitting.
    for line in md_text.splitlines():
        # Extract meta from header lines
        for prefix, key in _META_PREFIXES:
            if line.startswith(prefix):
                meta[key] = line[len(prefix):].strip()
                break
        else:
            if line.startswith(_TITLE_PREFIX):
                # Top level H1 = report title / target
                meta["target"] = line[len(_TITLE_PREFIX):].strip()
            elif line.startswith("## "):
                # H2 starts a new section; flush the previous one first.
                if current_section:
                    sections.append((current_section, '\n'.join(current_body).strip()))
                current_section = line[3:].strip()
                current_body = []
                in_sources = "sources" in current_section.lower()
            elif line.startswith("### "):
                # H3 subsections become a "Title:" line inside the body.
                current_body.append(f"\n{line[4:].strip()}:")
            else:
                stripped = line.strip()
                item = _SOURCE_ITEM_RE.match(stripped) if in_sources else None
                if item:
                    sources.append(item.group(1))
                elif in_sources and re.match(r'https?://', stripped):
                    sources.append(stripped)
                else:
                    current_body.append(line)

    if current_section:
        sections.append((current_section, '\n'.join(current_body).strip()))

    return meta, sections, sources


# Checked in order, so "high" wins when several markers are present.
# Word boundaries keep words such as "confirmed", "media", "following" or
# "highlight" from being mistaken for a confidence marker.
_CONFIDENCE_MARKERS = (
    ("high", re.compile(r"\bHIGH\b|\U0001F7E2", re.IGNORECASE)),
    ("medium", re.compile(r"\b(?:MEDIUM|MED)\b|\U0001F7E0", re.IGNORECASE)),
    ("low", re.compile(r"\bLOW\b|\U0001F534", re.IGNORECASE)),
    ("unverified", re.compile(r"\bUNVERIFIED\b|\u26AA", re.IGNORECASE)),
)


def confidence_from_text(text):
    """Detect a confidence level mentioned in *text*.

    A level is recognised by its whole-word marker (HIGH, MEDIUM/MED, LOW,
    UNVERIFIED; case-insensitive) or by its coloured-circle emoji. When
    several levels are mentioned the first of high, medium, low, unverified
    is returned.

    Args:
        text: Text to inspect; may be empty or ``None``.

    Returns:
        ``"high"``, ``"medium"``, ``"low"``, ``"unverified"``, or ``None``
        when no marker is found.
    """
    if not text:
        return None
    for level, pattern in _CONFIDENCE_MARKERS:
        if pattern.search(text):
            return level
    return None


def _strip_markdown(text):
    """Remove bold/italic/code markers, link targets and list bullets from *text*."""
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'`(.*?)`', r'\1', text)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    return re.sub(r'^[-*]\s+', '', text, flags=re.MULTILINE)


def _is_table_delimiter(line):
    """Return True if *line* is a markdown table delimiter row such as ``| --- | --- |``."""
    return '|' in line and '---' in line and re.fullmatch(r'[\s|:\-]+', line) is not None


def _split_table_row(row):
    """Split a markdown table row into stripped cells, keeping empty cells in place."""
    inner = row.strip()
    if inner.startswith('|'):
        inner = inner[1:]
    if inner.endswith('|'):
        inner = inner[:-1]
    return [cell.strip() for cell in inner.split('|')]


def _table_data_rows(body):
    """Return the data rows (as cell lists) of the table in *body*, or None if it has no table."""
    lines = body.split('\n')
    if not any(_is_table_delimiter(line) for line in lines):
        return None
    rows = [line for line in lines if '|' in line and not _is_table_delimiter(line)]
    # The first remaining row is the column header.
    return [_split_table_row(row) for row in rows[1:]]


def _render_section_body(pdf, body):
    """Render *body* as finding rows when it holds a table, else as paragraphs."""
    rows = _table_data_rows(body)
    if rows is not None:
        # NOTE: lines of the section that are not table rows are not rendered.
        for cells in rows:
            # Expected columns: finding | source | confidence.
            if len(cells) < 3 or not cells[0]:
                continue
            conf_key = confidence_from_text(cells[2]) or "low"
            pdf.finding_row(cells[0], cells[1], conf_key)
        return

    for para in re.split(r'\n{2,}', body):
        para = para.strip()
        if not para:
            continue
        # NOTE: add_text_block draws the badge only when a heading is given,
        # so the detected level currently has no visible effect here.
        conf_key = confidence_from_text(para)
        pdf.add_text_block(None, _strip_markdown(para), confidence_key=conf_key)


def generate_pdf(markdown_text, output_dir="~/Desktop", target_override=None):
    """Main entry point: convert markdown report to PDF.

    Args:
        markdown_text: The report as markdown, in the layout understood by
            ``parse_markdown_report``.
        output_dir: Directory to write the PDF into (``~`` is expanded; the
            directory is created if missing). When falsy, the config's
            ``output.pdf_output_dir`` is used, falling back to ``~/Desktop``.
        target_override: Target name to use instead of the one parsed from
            the report.

    Returns:
        The path of the written PDF file.
    """
    # Local import: the file's top-level imports only bring in `datetime`.
    from datetime import timezone

    cfg = load_config()
    output_cfg = cfg.get("output", {})
    output_dir = os.path.expanduser(output_dir or output_cfg.get("pdf_output_dir", "~/Desktop"))
    include_sources = output_cfg.get("pdf_include_sources", True)
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    now = datetime.now(timezone.utc)

    meta, sections, sources = parse_markdown_report(markdown_text)
    # The parser always returns every meta key (possibly as ""), so the
    # fallbacks must be chained with `or` rather than passed to dict.get().
    target = target_override or meta.get("target") or "Unknown Target"
    date_str = meta.get("date") or now.strftime("%Y-%m-%d %H:%M UTC")

    pdf = OSINTReport(
        title=f"OSINT Report - {target}",
        target=target,
        date_str=date_str,
    )

    # Cover page: the first "summary"/"identity" section provides the summary.
    summary_text = ""
    for sec_name, sec_body in sections:
        lowered = sec_name.lower()
        if "summary" in lowered or "identity" in lowered:
            summary_text = _strip_markdown(sec_body)[:400]
            break

    pdf.cover_page(
        query=meta.get("query") or f"OSINT investigation: {target}",
        target_type=meta.get("target_type") or "Person",
        summary=summary_text or None,
    )

    # Content pages
    pdf.add_page()

    # The first section mentioning "confidence" is rendered once, after all
    # other content, under a fixed "Confidence Summary" title.
    confidence_idx = next(
        (i for i, (name, _) in enumerate(sections) if "confidence" in name.lower()),
        None,
    )

    for idx, (sec_name, sec_body) in enumerate(sections):
        if not sec_body.strip():
            continue
        lowered = sec_name.lower()
        # Sources and the confidence section are handled separately below.
        if idx == confidence_idx or "sources" in lowered or "confidence & gaps" in lowered:
            continue
        pdf.section_header(sec_name)
        _render_section_body(pdf, sec_body)

    # Confidence summary section (if present and non-empty)
    if confidence_idx is not None:
        confidence_body = sections[confidence_idx][1]
        if confidence_body.strip():
            pdf.section_header("Confidence Summary")
            _render_section_body(pdf, confidence_body)

    # Sources
    if include_sources and sources:
        pdf.add_page()
        pdf.sources_section(sources)

    # Save
    os.makedirs(output_dir, exist_ok=True)
    # Keep only filename-safe ASCII; fall back to a fixed word if nothing is left
    # (e.g. a target written entirely in a non-Latin script).
    safe_target = re.sub(r'[^a-zA-Z0-9_\- ]', '', target).strip().replace(' ', '_') or "report"
    timestamp = now.strftime("%Y%m%d_%H%M")
    filename = f"OSINT_{safe_target}_{timestamp}.pdf"
    filepath = os.path.join(output_dir, filename)
    pdf.output(filepath)

    return filepath


def main(argv=None):
    """Command-line entry point.

    Reads the markdown report from ``--input``, ``--markdown`` or stdin (in
    that order of precedence), generates the PDF and prints its path.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Generate OSINT PDF report")
    parser.add_argument("--input", help="Path to markdown report file")
    parser.add_argument("--markdown", help="Markdown text directly")
    # "--title" is accepted as an alias because the module's usage examples
    # invoke the script with it.
    parser.add_argument("--target", "--title", dest="target", help="Override target name")
    parser.add_argument("--output", default="~/Desktop", help="Output directory")
    args = parser.parse_args(argv)

    if args.input:
        # Reports contain emojis and typographic punctuation; read them as
        # UTF-8 regardless of the locale's default encoding.
        with open(args.input, encoding="utf-8") as f:
            md = f.read()
    elif args.markdown:
        md = args.markdown
    else:
        print("Reading markdown from stdin...")
        md = sys.stdin.read()

    filepath = generate_pdf(md, output_dir=args.output, target_override=args.target)
    print(f"PDF saved: {filepath}")


if __name__ == "__main__":
    main()
