package pdfcontent import ( "fmt" "pdf-wizard/internal/pdfprocessor" ) // ExtractPDFContent extracts text from a PDF using pdftotext -bbox-layout and // injects form field markers at their respective positions. // // The function: // 1. Extracts form field positions from the PDF using pdfcpu // 2. Extracts text with coordinates using pdftotext -bbox // 3. Detects and flattens multi-column layouts to single-column order // 4. Injects form field markers at their physical positions // 5. Outputs formatted markdown func ExtractPDFContent(pdfPath string) (string, error) { // Step 1: Extract form fields rawFields, err := pdfprocessor.ExtractFormFields(pdfPath) if err != nil { return "", fmt.Errorf("extracting form fields: %w", err) } // Convert to form field positions with annotation labels var fields []FormFieldPos for i, rf := range rawFields { fields = append(fields, FormFieldPos{ Name: rf.Name, PosX: rf.PosX, PosY: rf.PosY, Width: rf.Width, Height: rf.Height, Page: rf.Page, Label: fmt.Sprintf(`[field idx="%d" type="%s"]`, i, rf.Type), }) } // Step 2: Extract text with coordinates using pdftotext pages, err := ExtractText(pdfPath) if err != nil { return "", fmt.Errorf("extracting text: %w", err) } if len(pages) == 0 { return "", fmt.Errorf("no text content found in PDF") } // Step 3: Convert field coordinates from PDF user space (Y from bottom) // to pdftotext screen space (Y from top). Use center Y for both fields // and text items so they align properly. for pi := range pages { pageHeight := pages[pi].Height for fi := range fields { if fields[fi].Page == pi+1 { // pdfcpu PosY is bottom-left in PDF coords; center = PosY + Height/2 // screen center Y = pageHeight - pdfCenterY fields[fi].PosY = pageHeight - fields[fi].PosY - fields[fi].Height/2 } } } // Step 4: Convert to markdown with field markers injected markdown := ToMarkdown(pages, fields) return markdown, nil }