#!/usr/bin/env python3
"""
Extract encyclopedia entries from PDFs as formatted HTML/CSS.
Preserves: fonts, sizes, bold, italic, structural hierarchy.
Colors come from the fixed page stylesheet; per-span PDF colors are not
carried into the output.
"""
import fitz
import os
import re
import html as htmlmod
import sys

# Directory whose *.pdf files main() processes (listed non-recursively).
PDF_DIR = '/workspace/pdf'
# Directory that receives the generated .html files.
OUTPUT_DIR = '/workspace/html_entries'
# Created at import time; main() lists this directory before writing to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# PDF font-name fragment -> CSS font-family.
# Entries are grouped by generic family; insertion order is kept because
# get_font_family() scans them in order and returns the first hit.
FONT_MAP = {
    'Georgia': 'Georgia, serif',
    'PalatinoLinotype': 'Palatino, serif',
    **dict.fromkeys(('Times-Roman', 'TimesNewRoman', 'Times'), 'serif'),
    **dict.fromkeys(
        ('BerlinSansFB', 'SegoeUI', 'Helvetica', 'Verdana', 'Arial'),
        'sans-serif',
    ),
    'Courier': 'monospace',
}

def get_font_family(font_name):
    """Return the CSS font-family for a PDF font name.

    The first FONT_MAP key that occurs anywhere inside ``font_name`` decides
    the result; a name matching no key gets the Georgia serif stack.
    """
    candidates = (css for key, css in FONT_MAP.items() if key in font_name)
    return next(candidates, 'Georgia, serif')  # default

def sanitize_filename(name):
    """Turn an entry title into an upper-case, filesystem-safe file stem.

    The result contains only A-Z, 0-9, '_' and '-', and may be empty when
    the title has no such characters.
    """
    # Trim outer whitespace, then any quote characters wrapping the title.
    cleaned = name.strip().strip('"\'"\'')
    # Em dashes become hyphens; square brackets are dropped outright.
    for old, new in (('—', '-'), ('[', ''), (']', '')):
        cleaned = cleaned.replace(old, new)
    # Whitespace runs collapse to one underscore, then everything outside
    # the allowed character set is removed.
    cleaned = re.sub(r'\s+', '_', cleaned)
    cleaned = re.sub(r'[^A-Za-z0-9_\-]', '', cleaned)
    return cleaned.upper().strip('_-')

def is_entry_title(span):
    """Tell whether a span looks like an entry title.

    A span qualifies when its text is non-blank, its font name contains
    'Bold', and its size is at least 14.
    """
    label = span['text'].strip()
    font_name = span['font']
    point_size = span['size']
    has_text = bool(label)
    return has_text and 'Bold' in font_name and point_size >= 14

def is_index_line(span):
    """Tell whether a span is an index line: BoldItalic font below size 9."""
    if 'BoldItalic' not in span['font']:
        return False
    return span['size'] < 9

def is_skip(span):
    """Tell whether a span should be left out of the output entirely.

    A span is skipped when any of the following holds:
      * its text is blank, or its size is below 6;
      * it is an index line (see is_index_line);
      * it starts with the running header text or a copyright sign;
      * it mentions 'Library', 'win2pdf', 'BizLike', or (in any case)
        'purchase' or 'comments';
      * it is one of a few stray single-character fragments;
      * it is a small bold line belonging to the copyright note.

    Args:
        span: dict with at least 'text', 'font' and 'size' keys.

    Returns:
        True if the span should be dropped, False otherwise.
    """
    text = span['text'].strip()
    font = span['font']
    size = span['size']

    # Everything below size 6 is dropped here, which already covers the
    # dated page footers; no separate footer check is needed.
    if not text or size < 6 or is_index_line(span):
        return True

    # Running header and copyright lines.
    if text.startswith(("OGDEN'S ENCYCLOPEDIA", '©')):
        return True

    lowered = text.lower()
    if 'Library' in text or 'win2pdf' in text or 'purchase' in lowered:
        return True

    # Stray punctuation left behind by the PDF layout.
    if text in ('`', '(', '—'):
        return True

    # Copyright note, which is set in small bold type across several lines.
    if 'Bold' in font and size < 9:
        if text.startswith('NOTE:') and 'intended o' in text:
            return True
        if text.startswith(('purposes, as supplemental', 'intended nor assertion')):
            return True

    # PDF generation artifacts.
    return 'BizLike' in text or 'comments' in lowered

def is_section_marker(text):
    """Tell whether text opens like a section marker such as [RADIO-SERIES].

    An opening '[' followed by at least one more character is enough; a
    closing bracket is not required.
    """
    found = re.match(r'^\[.+', text)
    return found is not None

# Known sub-header openings, compiled once into a single case-insensitive
# alternation. Prefixes already covered by a shorter entry (for example
# 'EXTANT RECORDINGS' by 'EXTANT RECORDING') are not listed separately.
_SUB_HEADER_RE = re.compile(
    '|'.join((
        r'^ORIGINATION[:\s]', r'^DURATION[:\s]', r'^PERSONNEL[:\s]',
        r'^CAST[:\s]', r'^CASTS[:\s]', r'^EXTANT RECORDING',
        r'^SCRIPT[:\s]', r'^BBC TITLE CARD[:\s]', r'^RADIOGOLDINDEX[:\s]',
        r'^RADIO TIMES[:\s]', r'^LONDON TIMES[:\s]', r'^KINGSTON GLEANER[:\s]',
        r'^MANCHESTER GUARDIAN[:\s]', r'^ST\. PETERSBURG', r'^THE TIMES[:\s]',
        r'^PERIODICALS[:\s]',
        r'^RADIO DRAMA REVIEWS', r'^HALLOWE', r'^SATURDAY NIGHT',
        r'^PICTURE PAGE',
    )),
    re.IGNORECASE,
)

def is_sub_header(text, font):
    """Tell whether text is a sub-header like ORIGINATION:, CAST:, etc.

    Only spans whose font name contains 'Bold' can be sub-headers. Such a
    span qualifies when it starts with one of the known labels (any case),
    when it is an all-caps label under 40 characters that does not start
    with '[', or when it is under 40 characters and ends with a colon.

    Args:
        text: the span text, already stripped by the caller.
        font: the PDF font name of the span.

    Returns:
        True if the span is a sub-header, False otherwise.
    """
    if 'Bold' not in font:
        return False
    if _SUB_HEADER_RE.match(text):
        return True
    is_short = len(text) < 40
    # All-caps short labels; bracketed text is left to the section markers.
    if text.isupper() and is_short and not text.startswith('['):
        return True
    # Ends with colon, relatively short.
    return text.endswith(':') and is_short

def is_og_note(text, font):
    """Tell whether a span is an OG-NOTE line set in a bold Berlin font."""
    font_ok = all(token in font for token in ('Berlin', 'Bold'))
    return font_ok and text.startswith('OG-NOTE')

def classify(text, font, size, flags):
    """Classify a span's structural role.

    Structural roles are checked first, in this order: 'entry-title',
    'og-note', 'section-marker', 'sub-header'. Anything else is labelled by
    its styling: 'bold-italic', 'bold', 'italic' or 'body'.

    Args:
        text: the span text.
        font: the PDF font name of the span.
        size: the font size of the span.
        flags: the PyMuPDF span flags bit field.

    Returns:
        One of the role strings listed above.
    """
    if is_entry_title({'text': text, 'font': font, 'size': size, 'flags': flags}):
        return 'entry-title'
    if is_og_note(text, font):
        return 'og-note'
    if is_section_marker(text):
        return 'section-marker'
    if is_sub_header(text, font):
        return 'sub-header'
    # PyMuPDF span flags: bit 0 (1) is superscript, bit 1 (2) is italic and
    # bit 4 (16) is bold. Testing bit 0 would label superscripts as bold.
    is_bold = 'Bold' in font or bool(flags & 16)
    is_italic = 'Italic' in font or bool(flags & 2)
    if is_bold and is_italic:
        return 'bold-italic'
    if is_bold:
        return 'bold'
    if is_italic:
        return 'italic'
    return 'body'

def span_to_html(text, font, size, flags):
    """Convert a single span to an HTML fragment.

    Section markers, sub-headers and OG-NOTE lines become a <div> carrying
    the matching CSS class. Every other span becomes an inline <b>, <i> or
    <span> element with an inline font-family / font-size style.

    Args:
        text: the span text; it is HTML-escaped before being embedded.
        font: the PDF font name of the span.
        size: the font size, rendered with one decimal place in px.
        flags: the PyMuPDF span flags bit field, forwarded to classify().

    Returns:
        The HTML fragment as a string.
    """
    cls = classify(text, font, size, flags)
    text_html = htmlmod.escape(text)

    # Block-level roles map straight to a stylesheet class.
    block_classes = {
        'section-marker': 'section',
        'sub-header': 'sub-header',
        'og-note': 'og-note',
    }
    if cls in block_classes:
        return f'<div class="{block_classes[cls]}">{text_html}</div>'

    style = f'font-family:{get_font_family(font)};font-size:{size:.1f}px'
    if cls == 'bold-italic':
        return f'<b style="{style}"><i>{text_html}</i></b>'
    if cls == 'bold':
        return f'<b style="{style}">{text_html}</b>'
    if cls == 'italic':
        return f'<i style="{style}">{text_html}</i>'
    return f'<span style="{style}">{text_html}</span>'

def extract_entry_html(title, spans):
    """Build the HTML body for one entry from its span dicts.

    The first span classified as 'entry-title' becomes the <h1>. Section
    markers, sub-headers and OG-NOTE lines become <div> elements. All other
    spans are styled inline and grouped into <p> elements; a new paragraph
    starts whenever the top edge of a span differs from the previous body
    span's by more than 5 units.

    Args:
        title: the entry title. Unused here; the heading text is taken from
            the title span itself. Kept for interface compatibility.
        spans: iterable of span dicts with 'text', 'font', 'size', 'flags'
            and 'bbox' keys.

    Returns:
        The HTML fragments joined with newlines.
    """
    lines = []
    para_parts = []
    prev_y = None
    title_done = False

    def flush_paragraph():
        """Emit the pending inline fragments as one <p> and reset them."""
        if para_parts:
            lines.append('<p>' + ''.join(para_parts).strip() + '</p>')
            para_parts.clear()

    for span in spans:
        raw = span['text']
        text = raw.strip()
        if not text:
            continue

        font = span['font']
        size = span['size']
        flags = span['flags']
        y = span['bbox'][1]
        cls = classify(text, font, size, flags)
        text_html = htmlmod.escape(text)

        # Entry title (only the first one becomes the heading).
        if cls == 'entry-title' and not title_done:
            title_done = True
            flush_paragraph()
            lines.append(f'<h1>{text_html}</h1>')
            continue

        # Structural elements (flush paragraph first).
        if cls in ('section-marker', 'sub-header', 'og-note'):
            flush_paragraph()
            css_class = 'section' if cls == 'section-marker' else cls
            lines.append(f'<div class="{css_class}">{text_html}</div>')
            continue

        # Body text. PyMuPDF span flags: bit 1 (2) is italic and bit 4 (16)
        # is bold; bit 0 is superscript and must not be read as bold.
        is_bold = 'Bold' in font or bool(flags & 16)
        is_italic = 'Italic' in font or bool(flags & 2)
        style = f'font-family:{get_font_family(font)};font-size:{size:.1f}px'

        if is_bold and is_italic:
            span_html = f'<b style="{style}"><i>{text_html}</i></b>'
        elif is_bold:
            span_html = f'<b style="{style}">{text_html}</b>'
        elif is_italic:
            span_html = f'<i style="{style}">{text_html}</i>'
        else:
            span_html = f'<span style="{style}">{text_html}</span>'

        # New paragraph detection: significant y-position gap.
        if prev_y is not None and abs(y - prev_y) > 5:
            flush_paragraph()

        # Keep one space wherever the span text began or ended with
        # whitespace, so words on either side of a style change do not run
        # together once the fragments are concatenated.
        lead = ' ' if raw[0].isspace() else ''
        trail = ' ' if raw[-1].isspace() else ''
        para_parts.append(f'{lead}{span_html}{trail}')
        prev_y = y

    # Flush remaining.
    flush_paragraph()

    return '\n'.join(lines)

def build_html_page(title, body_html):
    """Wrap an entry body in a complete, self-styled HTML document.

    ``title`` is HTML-escaped and placed in the <title> element;
    ``body_html`` is inserted between the <body> tags unchanged.
    """
    stylesheet = '''  body {
    max-width: 820px;
    margin: 40px auto;
    padding: 0 24px;
    font-family: Georgia, serif;
    font-size: 10px;
    line-height: 1.4;
    color: #111;
    background: #fafafa;
  }
  h1 {
    font-family: Georgia, serif;
    font-size: 16px;
    font-weight: bold;
    margin: 24px 0 10px 0;
    padding-bottom: 8px;
    border-bottom: 2px solid #222;
    color: #111;
  }
  p {
    margin: 2px 0;
    line-height: 1.35;
  }
  .section {
    font-family: Georgia, serif;
    font-size: 11px;
    font-weight: bold;
    margin: 14px 0 4px 0;
    color: #000;
  }
  .sub-header {
    font-family: Georgia, serif;
    font-size: 7.9px;
    font-weight: bold;
    margin: 4px 0 2px 16px;
    color: #222;
  }
  .og-note {
    font-family: "Segoe UI", sans-serif;
    font-size: 7.9px;
    font-weight: bold;
    color: #666;
    margin: 8px 0 4px 12px;
  }
  b, i {
    line-height: 1.35;
  }'''
    # One list item per output line (the stylesheet and body may span
    # several); joining with '\n' yields the final document.
    document = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
        '<title>' + htmlmod.escape(title) + '</title>',
        '<style>',
        stylesheet,
        '</style>',
        '</head>',
        '<body>',
        body_html,
        '</body>',
        '</html>',
    ]
    return '\n'.join(document)

def extract_entries_from_pdf(pdf_path, output_dir):
    """Extract all entries from a single PDF and write them as HTML files.

    Spans are read page by page. Each span accepted by is_entry_title()
    starts a new entry; the spans that follow belong to that entry until
    the next title. Spans seen before the first title are discarded. Every
    entry is written to ``<output_dir>/<sanitized title>.html``; entries
    whose title sanitizes to an empty name are skipped.

    Args:
        pdf_path: path of the PDF to read.
        output_dir: existing directory that receives the .html files.
    """
    basename = os.path.basename(pdf_path)

    entries = []
    current_title = None
    current_spans = []

    # The context manager closes the document even when a page fails to
    # parse; the collected span dicts stay usable after it is closed.
    with fitz.open(pdf_path) as doc:
        print(f"\nProcessing: {basename} ({len(doc)} pages)", file=sys.stderr)

        for page in doc:
            for block in page.get_text('dict', sort=True)['blocks']:
                # Blocks without a 'lines' key carry no text spans.
                if 'lines' not in block:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text'].strip()
                        if not text:
                            continue

                        # NOTE(review): is_skip() is not applied here, so
                        # the spans it describes remain part of the entry.
                        # Confirm whether that is intended.
                        if is_entry_title(span):
                            # Save previous entry
                            if current_title and current_spans:
                                entries.append((current_title, current_spans))
                            current_title = text
                            current_spans = [span]
                        elif current_title:
                            current_spans.append(span)

    # Save last entry
    if current_title and current_spans:
        entries.append((current_title, current_spans))

    print(f"  Found {len(entries)} entries", file=sys.stderr)

    for title, spans in entries:
        filename = sanitize_filename(title)
        if not filename:
            continue

        full_html = build_html_page(title, extract_entry_html(title, spans))

        # Entries whose titles sanitize to the same name overwrite each
        # other; the last one written wins.
        filepath = os.path.join(output_dir, f'{filename}.html')
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(full_html)

        print(f"  {filename}.html", file=sys.stderr)

def main():
    """Regenerate every entry HTML file from the PDFs in PDF_DIR.

    Steps:
      1. Remove the existing contents of OUTPUT_DIR (sub-directories are
         left alone).
      2. Count the entry-title spans in each PDF and report the totals.
      3. Run extract_entries_from_pdf() on each PDF.
      4. Report how many .html files OUTPUT_DIR now holds.

    A failure in one PDF is reported on stderr and does not stop the run.
    """
    import traceback

    # Clean output directory. os.remove() raises on a directory, so real
    # sub-directories are skipped instead of aborting the whole run.
    for name in os.listdir(OUTPUT_DIR):
        stale = os.path.join(OUTPUT_DIR, name)
        if os.path.isdir(stale) and not os.path.islink(stale):
            continue
        os.remove(stale)

    pdf_files = sorted(
        os.path.join(PDF_DIR, name)
        for name in os.listdir(PDF_DIR)
        if name.endswith('.pdf')
    )

    total = 0
    for pdf_path in pdf_files:
        try:
            # The context manager closes the document even if parsing fails.
            with fitz.open(pdf_path) as doc:
                count = sum(
                    1
                    for page in doc
                    for block in page.get_text('dict', sort=True)['blocks']
                    if 'lines' in block
                    for line in block['lines']
                    for span in line['spans']
                    if is_entry_title(span)
                )
                page_count = len(doc)
        except Exception as e:
            print(f"  Error: {e}", file=sys.stderr)
            continue
        total += count
        print(f"{os.path.basename(pdf_path)}: {count} entries ({page_count} pages)", file=sys.stderr)

    print(f"\nTotal entries: {total}", file=sys.stderr)

    for pdf_path in pdf_files:
        try:
            extract_entries_from_pdf(pdf_path, OUTPUT_DIR)
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)

    # Summary
    count = len([f for f in os.listdir(OUTPUT_DIR) if f.endswith('.html')])
    print(f"\nGenerated {count} HTML files in {OUTPUT_DIR}", file=sys.stderr)

# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()