#!/usr/bin/env python3
"""
Extract encyclopedia entries from PDFs as formatted HTML/CSS.

Preserves: fonts, sizes, bold, italic, structural hierarchy, colors.

Each span whose font name contains "Bold" and whose size is >= 14pt starts a
new entry; every following span belongs to that entry until the next title.
One HTML file per entry is written to OUTPUT_DIR.
"""

import html as htmlmod
import os
import re
import sys
import traceback

try:
    import fitz  # PyMuPDF
except ImportError:
    # Keep the pure text/HTML helpers importable without PyMuPDF; anything
    # that actually opens a PDF goes through _require_fitz() first.
    fitz = None

PDF_DIR = '/workspace/pdf'
OUTPUT_DIR = '/workspace/html_entries'

# Map font names to CSS font families
FONT_MAP = {
    'Georgia': 'Georgia, serif',
    'PalatinoLinotype': 'Palatino, serif',
    'Times-Roman': 'serif',
    'TimesNewRoman': 'serif',
    'Times': 'serif',
    'BerlinSansFB': 'sans-serif',
    'SegoeUI': 'sans-serif',
    'Helvetica': 'sans-serif',
    'Verdana': 'sans-serif',
    'Arial': 'sans-serif',
    'Courier': 'monospace',
}

# Bits of the PyMuPDF span "flags" integer. Bit 0 is *superscript*, so it
# must not be used as a bold indicator.
_FLAG_ITALIC = 1 << 1
_FLAG_BOLD = 1 << 4

# Vertical distance (in PDF points) between two body spans above which the
# second span starts a new paragraph.
_PARAGRAPH_GAP = 5

# Classes that are rendered as their own block element.
_BLOCK_CLASSES = ('section-marker', 'sub-header', 'og-note')

# Known sub-header patterns
_SUB_HEADER_PATTERNS = (
    r'^ORIGINATION[:\s]',
    r'^DURATION[:\s]',
    r'^PERSONNEL[:\s]',
    r'^CAST[:\s]',
    r'^CASTS[:\s]',
    r'^EXTANT RECORDING',
    r'^SCRIPT[:\s]',
    r'^BBC TITLE CARD[:\s]',
    r'^RADIOGOLDINDEX[:\s]',
    r'^RADIO TIMES[:\s]',
    r'^LONDON TIMES[:\s]',
    r'^KINGSTON GLEANER[:\s]',
    r'^MANCHESTER GUARDIAN[:\s]',
    r'^ST\. PETERSBURG',
    r'^THE TIMES[:\s]',
    r'^PERIODICALS[:\s]',
    r'^EXTANT RECORDINGS[:\s]',
    r'^RADIO DRAMA REVIEWS',
    r'^HALLOWE',
    r'^SATURDAY NIGHT',
    r'^PICTURE PAGE',
    r'^SATURDAY NIGHT PAGE',
)
# Compiled once instead of re-matching every pattern on every call.
_SUB_HEADER_RE = re.compile(
    '|'.join(f'(?:{pattern})' for pattern in _SUB_HEADER_PATTERNS),
    re.IGNORECASE,
)

# is_skip() lookup tables.
_SKIP_PREFIXES = ("OGDEN'S ENCYCLOPEDIA", '©')
_SKIP_EXACT = frozenset({'`', '(', '—'})
_SKIP_SUBSTRINGS = ('Library', 'win2pdf', 'BizLike')
_SKIP_SUBSTRINGS_ANY_CASE = ('purchase', 'comments')
_SKIP_NOTICE_PREFIXES = ('purposes, as supplemental', 'intended nor assertion')


def _require_fitz():
    """Raise ImportError with a clear message when PyMuPDF is missing."""
    if fitz is None:
        raise ImportError(
            'PyMuPDF (module "fitz") is required to read PDF files')


def get_font_family(font_name):
    """Map PDF font name to CSS font-family.

    The first FONT_MAP key that occurs anywhere in *font_name* wins;
    unknown fonts fall back to 'Georgia, serif'.
    """
    for prefix, family in FONT_MAP.items():
        if prefix in font_name:
            return family
    return 'Georgia, serif'  # default


def sanitize_filename(name):
    """Convert entry title to safe filename.

    Em dashes become '-', whitespace runs become '_', every character
    outside ``[A-Za-z0-9_-]`` is dropped, and the result is upper-cased
    with leading/trailing '_' and '-' removed. May return '' when nothing
    usable is left.
    """
    name = name.strip().strip('"\'').replace('—', '-')
    name = name.replace('[', '').replace(']', '')
    name = re.sub(r'\s+', '_', name)
    name = re.sub(r'[^A-Za-z0-9_\-]', '', name)
    return name.upper().strip('_-')


def is_entry_title(span):
    """Check if this span is an entry title (non-empty, bold font, >= 14pt)."""
    if not span['text'].strip():
        return False
    return 'Bold' in span['font'] and span['size'] >= 14


def is_index_line(span):
    """Check if this is an index line (small bold-italic)."""
    return 'BoldItalic' in span['font'] and span['size'] < 9


def is_skip(span):
    """Check if this span should be skipped entirely.

    Covers empty or tiny (< 6pt) text, index lines, running headers,
    copyright lines, stray punctuation, PDF-tool artifacts and the small
    bold copyright notice.

    NOTE(review): nothing in this file calls is_skip(); the extraction loop
    keeps every non-empty span. Confirm whether it was meant to be applied
    (its substring rules would also drop body text containing e.g.
    "comments" or "Library").
    """
    text = span['text'].strip()
    font = span['font']
    size = span['size']

    # The size rule also covers the tiny dated page footers.
    if not text or size < 6:
        return True
    if is_index_line(span):
        return True
    if text.startswith(_SKIP_PREFIXES) or text in _SKIP_EXACT:
        return True
    if any(marker in text for marker in _SKIP_SUBSTRINGS):
        return True
    lowered = text.lower()
    if any(marker in lowered for marker in _SKIP_SUBSTRINGS_ANY_CASE):
        return True
    # Skip copyright note (small bold text, split across several spans)
    if 'Bold' in font and size < 9:
        if text.startswith('NOTE:') and 'intended o' in text:
            return True
        if text.startswith(_SKIP_NOTICE_PREFIXES):
            return True
    return False


def is_section_marker(text):
    """Check if text is a section marker like [RADIO-SERIES].

    Only the opening '[' followed by at least one character is required.
    """
    return bool(re.match(r'^\[.+', text))


def is_sub_header(text, font):
    """Check if text is a sub-header like ORIGINATION:, CAST:, etc.

    Requires a font whose name contains 'Bold', then accepts any known
    label (case-insensitive), any all-caps text shorter than 40 characters
    that does not start with '[', or any text shorter than 40 characters
    ending in ':'.
    """
    if 'Bold' not in font:
        return False
    if _SUB_HEADER_RE.match(text):
        return True
    if len(text) >= 40:
        return False
    # All-caps short labels, or short text ending with a colon
    return (text.isupper() and not text.startswith('[')) or text.endswith(':')


def is_og_note(text, font):
    """Check for OG-NOTE: lines."""
    return 'Berlin' in font and 'Bold' in font and text.startswith('OG-NOTE')


def _style_flags(font, flags):
    """Return (is_bold, is_italic) from the font name and PyMuPDF flag bits."""
    is_bold = 'Bold' in font or bool(flags & _FLAG_BOLD)
    is_italic = 'Italic' in font or bool(flags & _FLAG_ITALIC)
    return is_bold, is_italic


def classify(text, font, size, flags):
    """Classify a span's structural role.

    Returns one of 'entry-title', 'og-note', 'section-marker',
    'sub-header', 'bold-italic', 'bold', 'italic' or 'body'; the
    structural roles take precedence over the plain styling roles.
    """
    span = {'text': text, 'font': font, 'size': size, 'flags': flags}
    if is_entry_title(span):
        return 'entry-title'
    if is_og_note(text, font):
        return 'og-note'
    if is_section_marker(text):
        return 'section-marker'
    if is_sub_header(text, font):
        return 'sub-header'

    is_bold, is_italic = _style_flags(font, flags)
    if is_bold and is_italic:
        return 'bold-italic'
    if is_bold:
        return 'bold'
    if is_italic:
        return 'italic'
    return 'body'


# NOTE(review): the HTML tags in the original string literals were lost
# before review (only the interpolated text survived). The element names,
# class attributes and inline styles produced by the three helpers below and
# by build_html_page() are a reconstruction from the surrounding code (the
# classify() role names and the font-family/font-size style string) --
# confirm against the intended markup.

def _inline_style(family, size, color=0):
    """Build the inline CSS for one span; non-black colours are included."""
    style = f'font-family:{family};font-size:{size:.1f}px'
    if color:
        # PyMuPDF reports span colour as an sRGB integer (0xRRGGBB).
        style += f';color:#{color & 0xFFFFFF:06x}'
    return style


def _inline_markup(text_html, style, is_bold, is_italic):
    """Wrap already-escaped text in a styled <span>, plus <b>/<i> as needed."""
    inner = text_html
    if is_italic:
        inner = f'<i>{inner}</i>'
    if is_bold:
        inner = f'<b>{inner}</b>'
    return f'<span style="{style}">{inner}</span>'


def _block_markup(cls, text_html, style):
    """Render a structural element (section marker, sub-header, OG note)."""
    return f'<div class="{cls}" style="{style}">{text_html}</div>'


def span_to_html(text, font, size, flags, color=0):
    """Convert a single span to HTML with inline styles.

    *color* is the span's sRGB integer colour (0 = black). It is a new,
    optional parameter: the previous body read it from an undefined name
    ``span`` and therefore raised NameError on every call.
    """
    cls = classify(text, font, size, flags)
    text_html = htmlmod.escape(text)
    style = _inline_style(get_font_family(font), size, color)

    if cls in _BLOCK_CLASSES:
        return _block_markup(cls, text_html, style)
    return _inline_markup(
        text_html,
        style,
        is_bold=cls in ('bold', 'bold-italic'),
        is_italic=cls in ('italic', 'bold-italic'),
    )


def extract_entry_html(title, spans):
    """Build HTML body for one entry.

    *spans* are PyMuPDF span dicts ('text', 'font', 'size', 'bbox' and
    optionally 'flags' / 'color'). The first entry-title span becomes the
    heading, structural spans become their own blocks, and the remaining
    spans are grouped into paragraphs, a new paragraph starting whenever
    the top y-coordinate moves by more than _PARAGRAPH_GAP points.
    *title* is accepted for interface compatibility and is not used.
    """
    blocks = []
    para_parts = []
    prev_y = None
    title_done = False

    def flush_paragraph():
        if para_parts:
            blocks.append('<p>' + ''.join(para_parts).strip() + '</p>')
            para_parts.clear()

    for span in spans:
        raw = span['text']
        text = raw.strip()
        if not text:
            continue

        font = span['font']
        size = span['size']
        flags = span.get('flags', 0)
        y = span['bbox'][1]
        cls = classify(text, font, size, flags)
        text_html = htmlmod.escape(text)
        style = _inline_style(get_font_family(font), size,
                              span.get('color', 0))

        # Entry title
        if cls == 'entry-title' and not title_done:
            title_done = True
            flush_paragraph()
            blocks.append(
                f'<h1 class="entry-title" style="{style}">{text_html}</h1>')
            continue

        # Structural elements (flush paragraph first)
        if cls in _BLOCK_CLASSES:
            flush_paragraph()
            blocks.append(_block_markup(cls, text_html, style))
            continue

        # New paragraph detection: significant y-position gap
        if prev_y is not None and abs(y - prev_y) > _PARAGRAPH_GAP:
            flush_paragraph()

        # Body text. Keep one space where the raw span had boundary
        # whitespace; stripping it and joining with '' glued the words of
        # adjacent spans together ("Hello " + "world" -> "Helloworld").
        is_bold, is_italic = _style_flags(font, flags)
        lead = ' ' if raw[:1].isspace() else ''
        trail = ' ' if raw[-1:].isspace() else ''
        para_parts.append(
            lead + _inline_markup(text_html, style, is_bold, is_italic) + trail)
        prev_y = y

    # Flush remaining
    flush_paragraph()
    return '\n'.join(blocks)


def build_html_page(title, body_html):
    """Build complete HTML document.

    *title* is escaped; *body_html* is inserted verbatim and must already
    be valid HTML.
    """
    return (
        '<!DOCTYPE html>\n'
        '<html>\n'
        '<head>\n'
        '<meta charset="utf-8">\n'
        f'<title>{htmlmod.escape(title)}</title>\n'
        '</head>\n'
        '<body>\n'
        f'{body_html}\n'
        '</body>\n'
        '</html>\n'
    )


def _iter_spans(doc):
    """Yield every text span of *doc* in page order (sorted reading order)."""
    for page in doc:
        for block in page.get_text('dict', sort=True)['blocks']:
            # Image blocks have no 'lines' key.
            for line in block.get('lines', ()):
                yield from line['spans']


def _split_entries(spans):
    """Group spans into (title, spans) entries; a title span opens an entry.

    Spans before the first title and whitespace-only spans are dropped.
    """
    entries = []
    current_spans = None
    for span in spans:
        text = span['text'].strip()
        if not text:
            continue
        if is_entry_title(span):
            current_spans = [span]
            entries.append((text, current_spans))
        elif current_spans is not None:
            current_spans.append(span)
    return entries


def extract_entries_from_pdf(pdf_path, output_dir):
    """Extract all entries from a single PDF as HTML files.

    Writes ``<SANITIZED_TITLE>.html`` into *output_dir* (created if
    missing) for every entry whose title yields a non-empty filename; an
    existing file with the same name is overwritten. Progress is reported
    on stderr.
    """
    _require_fitz()
    os.makedirs(output_dir, exist_ok=True)
    basename = os.path.basename(pdf_path)

    doc = fitz.open(pdf_path)
    try:
        print(f"\nProcessing: {basename} ({len(doc)} pages)", file=sys.stderr)
        entries = _split_entries(_iter_spans(doc))
    finally:
        # Spans are plain dicts, so the document can be released before
        # the HTML files are written.
        doc.close()

    print(f" Found {len(entries)} entries", file=sys.stderr)

    for title, spans in entries:
        filename = sanitize_filename(title)
        if not filename:
            continue
        full_html = build_html_page(title, extract_entry_html(title, spans))
        filepath = os.path.join(output_dir, f'{filename}.html')
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(full_html)
        print(f" {filename}.html", file=sys.stderr)


def main():
    """Regenerate OUTPUT_DIR from every PDF in PDF_DIR."""
    # Fail before the destructive clean-up if PDFs cannot be read at all.
    _require_fitz()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Clean output directory (regular files only; os.remove() raises on
    # directories, which previously aborted the whole run).
    for name in os.listdir(OUTPUT_DIR):
        path = os.path.join(OUTPUT_DIR, name)
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)

    pdf_files = sorted(
        os.path.join(PDF_DIR, name)
        for name in os.listdir(PDF_DIR)
        if name.lower().endswith('.pdf')
    )

    # First pass: report how many entry titles each PDF contains.
    total = 0
    for pdf_path in pdf_files:
        try:
            doc = fitz.open(pdf_path)
            try:
                count = sum(
                    1 for span in _iter_spans(doc) if is_entry_title(span))
                pages = len(doc)
            finally:
                doc.close()
        except Exception as e:  # keep going with the remaining PDFs
            print(f" Error: {e}", file=sys.stderr)
            continue
        total += count
        print(f"{os.path.basename(pdf_path)}: {count} entries ({pages} pages)",
              file=sys.stderr)

    print(f"\nTotal entries: {total}", file=sys.stderr)

    # Second pass: write the HTML files.
    for pdf_path in pdf_files:
        try:
            extract_entries_from_pdf(pdf_path, OUTPUT_DIR)
        except Exception as e:  # one bad PDF must not stop the batch
            print(f"Error processing {pdf_path}: {e}", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)

    # Summary
    count = len([f for f in os.listdir(OUTPUT_DIR) if f.endswith('.html')])
    print(f"\nGenerated {count} HTML files in {OUTPUT_DIR}", file=sys.stderr)


if __name__ == '__main__':
    main()