#!/usr/bin/env python3
"""
Extract encyclopedia entries from PDFs as formatted HTML/CSS.

Preserves: fonts, sizes, bold, italic, structural hierarchy, colors.
"""

import html as htmlmod
import os
import re
import sys

import fitz

PDF_DIR = '/workspace/pdf'
OUTPUT_DIR = '/workspace/html_entries'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Map font names to CSS font families.  Keys are matched as substrings of the
# PDF font name, in insertion order; the first key that matches wins.
FONT_MAP = {
    'Georgia': 'Georgia, serif',
    'PalatinoLinotype': 'Palatino, serif',
    'Times-Roman': 'serif',
    'TimesNewRoman': 'serif',
    'Times': 'serif',
    'BerlinSansFB': 'sans-serif',
    'SegoeUI': 'sans-serif',
    'Helvetica': 'sans-serif',
    'Verdana': 'sans-serif',
    'Arial': 'sans-serif',
    'Courier': 'monospace',
}

# CSS family used when no FONT_MAP key occurs in the font name.
_DEFAULT_FONT_FAMILY = 'Georgia, serif'

# Bits of the per-span ``flags`` integer reported by PyMuPDF.  Bit 0 (value 1)
# marks *superscript* text, so it must not be used as a bold indicator.
_FLAG_ITALIC = 2   # bit 1
_FLAG_BOLD = 16    # bit 4

# Spans whose stripped text starts with one of these are page furniture.
_SKIP_PREFIXES = ("OGDEN'S ENCYCLOPEDIA", '©')

# Stray glyphs that show up as spans of their own.
_SKIP_EXACT = frozenset({'`', '(', '—'})

# Continuation lines of the small bold copyright note.
_COPYRIGHT_NOTE_PREFIXES = (
    'purposes, as supplemental',
    'intended nor assertion',
)

# A section marker starts with '[' followed by at least one more character.
_SECTION_MARKER_RE = re.compile(r'\[.+')

# Known sub-header labels, matched case-insensitively at the start of the
# text.  'EXTANT RECORDINGS' and 'SATURDAY NIGHT PAGE' need no entries of
# their own: 'EXTANT RECORDING' and 'SATURDAY NIGHT' already match them.
_SUB_HEADER_RE = re.compile(
    '|'.join((
        r'ORIGINATION[:\s]',
        r'DURATION[:\s]',
        r'PERSONNEL[:\s]',
        r'CAST[:\s]',
        r'CASTS[:\s]',
        r'EXTANT RECORDING',
        r'SCRIPT[:\s]',
        r'BBC TITLE CARD[:\s]',
        r'RADIOGOLDINDEX[:\s]',
        r'RADIO TIMES[:\s]',
        r'LONDON TIMES[:\s]',
        r'KINGSTON GLEANER[:\s]',
        r'MANCHESTER GUARDIAN[:\s]',
        r'ST\. PETERSBURG',
        r'THE TIMES[:\s]',
        r'PERIODICALS[:\s]',
        r'RADIO DRAMA REVIEWS',
        r'HALLOWE',
        r'SATURDAY NIGHT',
        r'PICTURE PAGE',
    )),
    re.IGNORECASE,
)

# Labels of this length or longer are never treated as generic sub-headers.
_MAX_LABEL_LEN = 40


def _is_bold(font: str, flags: int) -> bool:
    """Return True if the font name or the span flags mark the text bold."""
    return 'Bold' in font or bool(flags & _FLAG_BOLD)


def _is_italic(font: str, flags: int) -> bool:
    """Return True if the font name or the span flags mark the text italic."""
    return 'Italic' in font or bool(flags & _FLAG_ITALIC)


def get_font_family(font_name: str) -> str:
    """Map a PDF font name to a CSS font-family value.

    The first FONT_MAP key found anywhere inside ``font_name`` decides the
    result; unknown fonts fall back to 'Georgia, serif'.
    """
    for fragment, family in FONT_MAP.items():
        if fragment in font_name:
            return family
    return _DEFAULT_FONT_FAMILY


def sanitize_filename(name: str) -> str:
    """Convert an entry title to a safe, upper-case filename stem.

    Surrounding whitespace and straight quotes are trimmed, em dashes become
    hyphens, square brackets are dropped, whitespace runs become underscores
    and every remaining character outside ``[A-Za-z0-9_-]`` is removed.  The
    result may be an empty string if nothing survives.
    """
    name = name.strip().strip('"\'').replace('—', '-')
    name = name.replace('[', '').replace(']', '')
    name = re.sub(r'\s+', '_', name)
    name = re.sub(r'[^A-Za-z0-9_\-]', '', name)
    return name.upper().strip('_-')


def is_entry_title(span: dict) -> bool:
    """Check if this span is an entry title.

    A title is any non-blank span whose font name contains 'Bold' and whose
    size is at least 14.  The font family itself is not checked.
    """
    if not span['text'].strip():
        return False
    return 'Bold' in span['font'] and span['size'] >= 14


def is_index_line(span: dict) -> bool:
    """Check if this is an index line (bold-italic font smaller than 9)."""
    return 'BoldItalic' in span['font'] and span['size'] < 9


def is_skip(span: dict) -> bool:
    """Check if this span should be skipped entirely.

    Skipped spans are: blank text, anything smaller than size 6 (which also
    covers the tiny dated page footers), index lines, running headers and
    copyright lines, PDF-generator artifacts, stray single glyphs, and the
    small bold copyright note.
    """
    text = span['text'].strip()
    font = span['font']
    size = span['size']

    if not text or size < 6 or is_index_line(span):
        return True
    if text.startswith(_SKIP_PREFIXES) or text in _SKIP_EXACT:
        return True

    # Case-sensitive markers.
    if 'Library' in text or 'win2pdf' in text or 'BizLike' in text:
        return True
    # Case-insensitive markers.
    lowered = text.lower()
    if 'purchase' in lowered or 'comments' in lowered:
        return True

    # The copyright note is set in small bold type across several lines.
    if 'Bold' in font and size < 9:
        if text.startswith('NOTE:') and 'intended o' in text:
            return True
        if text.startswith(_COPYRIGHT_NOTE_PREFIXES):
            return True
    return False


def is_section_marker(text: str) -> bool:
    """Check if text is a section marker like [RADIO-SERIES].

    Only the opening bracket plus one following character is required; a
    closing bracket is not checked.
    """
    return _SECTION_MARKER_RE.match(text) is not None


def is_sub_header(text: str, font: str) -> bool:
    """Check if text is a sub-header like ORIGINATION:, CAST:, etc.

    Only bold fonts qualify.  The text is a sub-header when it starts with a
    known label, or when it is shorter than 40 characters and either ends
    with a colon or is all upper-case without a leading '['.
    """
    if 'Bold' not in font:
        return False
    if _SUB_HEADER_RE.match(text):
        return True
    if len(text) >= _MAX_LABEL_LEN:
        return False
    if text.endswith(':'):
        return True
    return text.isupper() and not text.startswith('[')


def is_og_note(text: str, font: str) -> bool:
    """Check for OG-NOTE: lines (bold Berlin font, text starts 'OG-NOTE')."""
    return 'Berlin' in font and 'Bold' in font and text.startswith('OG-NOTE')


def classify(text: str, font: str, size: float, flags: int) -> str:
    """Classify a span's structural role.

    Returns one of 'entry-title', 'og-note', 'section-marker', 'sub-header',
    'bold-italic', 'bold', 'italic' or 'body'.  Structural roles are tested
    first, in that order; plain styling is only reported when none applies.
    """
    span = {'text': text, 'font': font, 'size': size, 'flags': flags}
    if is_entry_title(span):
        return 'entry-title'
    if is_og_note(text, font):
        return 'og-note'
    if is_section_marker(text):
        return 'section-marker'
    if is_sub_header(text, font):
        return 'sub-header'

    bold = _is_bold(font, flags)
    italic = _is_italic(font, flags)
    if bold and italic:
        return 'bold-italic'
    if bold:
        return 'bold'
    if italic:
        return 'italic'
    return 'body'


# NOTE(review): from span_to_html() onward this copy of the file is damaged:
# the text inside its f-string templates is missing, so the function bodies
# that follow cannot be reconstructed from here and must be restored from an
# intact copy.  The surviving head of span_to_html() is preserved verbatim
# below for matching.  Two things to fix when restoring it: it reads ``span``,
# which is not defined in that function, and it tests ``flags & 1`` for bold
# (use the _is_bold / _is_italic helpers above instead).
#
# def span_to_html(text, font, size, flags):
#     """Convert a single span to HTML with inline styles."""
#     cls = classify(text, font, size, flags)
#     text_html = htmlmod.escape(text)
#     family = get_font_family(font)
#     is_bold = 'Bold' in font or bool(flags & 1)
#     is_italic = 'Italic' in font or bool(flags & 2)
#     color = span.get('color', 0) if isinstance(span, dict) else 0
#     if cls == 'section-marker':
#         return f'
' + ''.join(para_parts) + '
') para_parts = [] title_html = htmlmod.escape(text) lines.append(f'' + ''.join(para_parts) + '
') para_parts = [] text_html = htmlmod.escape(text) family = get_font_family(font) if cls == 'section-marker': lines.append(f'' + ''.join(para_parts) + '
') para_parts = [] para_parts.append(span_html) prev_y = y # Flush remaining if para_parts: lines.append('' + ''.join(para_parts) + '
') return '\n'.join(lines) def build_html_page(title, body_html): """Build complete HTML document.""" return f'''