package pdfcontent

import (
	"bytes"
	"encoding/xml"
	"fmt"
	"math"
	"os/exec"
	"strings"
)

// The types below mirror the element nesting of the document produced by
// `pdftotext -bbox-layout`: html > body > doc > page > flow > block > line > word.

// PDFDocument is the root (<html>) element of the pdftotext -bbox-layout output.
type PDFDocument struct {
	XMLName xml.Name `xml:"html"`
	Body    struct {
		XMLName xml.Name `xml:"body"`
		Doc     struct {
			Pages []Page `xml:"page"`
		} `xml:"doc"`
	} `xml:"body"`
}

// Page is one <page> element: its width/height attributes and the flows it contains.
type Page struct {
	Width  float64 `xml:"width,attr"`
	Height float64 `xml:"height,attr"`
	Flows  []Flow  `xml:"flow"`
}

// Flow is one <flow> element, a container of blocks.
type Flow struct {
	Blocks []Block `xml:"block"`
}

// Block is one <block> element with its bounding-box attributes and lines.
type Block struct {
	XMin  float64 `xml:"xMin,attr"`
	YMin  float64 `xml:"yMin,attr"`
	XMax  float64 `xml:"xMax,attr"`
	YMax  float64 `xml:"yMax,attr"`
	Lines []Line  `xml:"line"`
}

// Line is one <line> element with its bounding-box attributes and words.
type Line struct {
	XMin  float64 `xml:"xMin,attr"`
	YMin  float64 `xml:"yMin,attr"`
	XMax  float64 `xml:"xMax,attr"`
	YMax  float64 `xml:"yMax,attr"`
	Words []Word  `xml:"word"`
}

// Word is one <word> element: bounding-box attributes plus the element's
// character data as Text.
type Word struct {
	XMin float64 `xml:"xMin,attr"`
	YMin float64 `xml:"yMin,attr"`
	XMax float64 `xml:"xMax,attr"`
	YMax float64 `xml:"yMax,attr"`
	Text string  `xml:",chardata"`
}

// ExtractText runs `pdftotext -bbox-layout <pdfPath> -` and returns the parsed
// pages.
//
// The tool's stdout (the document to parse) and stderr (diagnostics) are
// captured separately: any warning written to stderr must not end up inside
// the XML handed to the decoder. If the command fails, the returned error
// wraps the exec error and includes whatever pdftotext wrote to stderr. If
// the output cannot be decoded, the returned error wraps the XML error.
func ExtractText(pdfPath string) ([]Page, error) {
	cmd := exec.Command("pdftotext", "-bbox-layout", pdfPath, "-")

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("pdftotext: %w: %s", err, strings.TrimSpace(stderr.String()))
	}

	var doc PDFDocument
	if err := xml.Unmarshal(stdout.Bytes(), &doc); err != nil {
		return nil, fmt.Errorf("parsing pdftotext output: %w", err)
	}
	return doc.Body.Doc.Pages, nil
}

// TextItem represents a piece of text with position.
type TextItem struct { Text string X float64 Y float64 XMax float64 YMax float64 OrigY float64 // original page Y before flattening, for form field matching FontSize float64 // estimated from yMax - yMin Page int PageHeight float64 // original page height, for page break detection OrigX float64 // original page X before flattening FormField bool // true for injected form field markers } // blocksToItems converts pdftotext blocks/lines/words to TextItems, // filtering out content in page margins. func blocksToItems(page Page, pageNum int, marginTop, marginBottom float64) []TextItem { var items []TextItem pageBottom := page.Height - marginBottom for _, flow := range page.Flows { for _, block := range flow.Blocks { // Filter blocks in margins if block.YMin < marginTop || block.YMax > pageBottom { continue } for _, line := range block.Lines { lineItems := wordsToItems(line.Words, pageNum, page.Height) items = append(items, lineItems...) } } } return items } // blocksToLines converts pdftotext blocks to pre-grouped lines ([]TextItem per line), // filtering out content in page margins. Returns both the flat items and the line groupings. func blocksToLines(page Page, pageNum int, marginTop, marginBottom float64) ([]TextItem, [][]TextItem) { pageBottom := page.Height - marginBottom var allItems []TextItem var lines [][]TextItem for _, flow := range page.Flows { for _, block := range flow.Blocks { // Filter blocks in margins if block.YMin < marginTop || block.YMax > pageBottom { continue } for _, line := range block.Lines { lineItems := wordsToItems(line.Words, pageNum, page.Height) if len(lineItems) > 0 { lines = append(lines, lineItems) allItems = append(allItems, lineItems...) } } } } return allItems, lines } // BlockLines holds lines from one pdftotext block, preserving the block's xMin. 
type BlockLines struct { XMin float64 Lines [][]TextItem } // blocksToGroupedLines converts pdftotext blocks to BlockLines, preserving block // boundaries and xMin so that blocks with the same column layout can be merged // before flattening. Returns all items and the grouped block lines. func blocksToGroupedLines(page Page, pageNum int, marginTop, marginBottom float64) ([]TextItem, []BlockLines) { pageBottom := page.Height - marginBottom var allItems []TextItem var blocks []BlockLines for _, flow := range page.Flows { for _, block := range flow.Blocks { // Filter blocks entirely within top margin, or blocks // that extend into the bottom margin (footers). Using YMax // to catch footer blocks that may start above the margin // but extend into it. if block.YMin < marginTop || block.YMax > pageBottom { continue } var lines [][]TextItem for _, line := range block.Lines { lineItems := wordsToItems(line.Words, pageNum, page.Height) if len(lineItems) > 0 { lines = append(lines, lineItems) allItems = append(allItems, lineItems...) } } if len(lines) > 0 { blocks = append(blocks, BlockLines{XMin: block.XMin, Lines: lines}) } } } return allItems, blocks } // wordsToItems converts pdftotext words to TextItems. func wordsToItems(words []Word, pageNum int, pageHeight float64) []TextItem { var items []TextItem for _, w := range words { originalText := strings.TrimSpace(w.Text) if originalText == "" { continue } // Skip standalone right-side PDF decoration (commas, periods at page edge) // that appear on separate elements from the main content. origX := math.Floor(w.XMin) if len(originalText) == 1 && (originalText == "," || originalText == ".") && origX > 500 { continue } // Strip leading/trailing underscores and punctuation from pdftotext output. // pdftotext merges form underline decorations with adjacent text, // producing "business}___," or "___{state}" lines. 
text, leadingUnderscores, wasWide := stripUnderlineDecoration(originalText) if text == "" && wasWide { // Keep underscore-only words with original text for gap bridging // in FlattenLines. findLineGaps uses isUnderscoreText to detect // these as bridges between text segments on the same logical line, // preventing false column break detection. removeFieldLinesFormatRule // will strip them during markdown rendering. text = originalText leadingUnderscores = 0 wasWide = false } if text == "" { continue } origY := math.Floor(w.YMin) origXAdj := origX // When leading underscores are stripped, shift OrigX rightward to // approximate where the actual text begins (after the underline). // Use the word's actual bounding box width divided by total character // count to estimate per-character width, then shift by stripped count. if leadingUnderscores > 0 { wordWidth := w.XMax - w.XMin totalChars := len([]rune(w.Text)) if totalChars > 0 { charW := wordWidth / float64(totalChars) origXAdj = origX + charW*float64(leadingUnderscores) } } xMax := math.Floor(w.XMax) // When the original text was wide due to underscores, // reduce XMax to approximate the actual text width. This prevents // false column break detection. if wasWide { xMax = origXAdj + float64(len([]rune(text)))*float64(w.YMax-w.YMin)*0.7 } items = append(items, TextItem{ Text: text, X: origXAdj, Y: origY, XMax: xMax, YMax: math.Floor(w.YMax), OrigX: origXAdj, OrigY: origY, FontSize: w.YMax - w.YMin, Page: pageNum, PageHeight: pageHeight, }) } return items } // stripUnderlineDecoration removes leading/trailing underscores and surrounding // punctuation from pdftotext text that merges form underline decorations. // Returns cleaned text, number of leading underscore characters removed, // and whether the original text was wide due to underscores. 
func stripUnderlineDecoration(text string) (string, int, bool) { uc := strings.Count(text, "_") if uc > 2 && isMostlyUnderscores(text) { text = strings.ReplaceAll(text, "_", "") text = strings.TrimRight(text, ",.") text = strings.TrimSpace(text) if text == "" { return "", 0, true } return text, 0, true } else if uc > 2 { leading := len(text) - len(strings.TrimLeft(text, "_")) text = strings.TrimLeft(text, "_") text = strings.TrimRight(text, "_") text = strings.TrimRight(text, ",.") return text, leading, true } return text, 0, false } // isMostlyUnderscores returns true if 80%+ of the text is underscore-like characters. func isMostlyUnderscores(s string) bool { if len(s) < 3 { return false } count := 0 for range s { count++ } underscoreCount := strings.Count(s, "_") return float64(underscoreCount)/float64(count) > 0.8 } // pageText holds items for one page. type pageText struct { PageNum int Items []TextItem Width float64 Height float64 } // filterMarginWords removes words that fall within marginUnits of any page edge. func filterMarginWords(words []Word, pageWidth, pageHeight, marginTop float64, marginBottom float64) []Word { var filtered []Word for _, w := range words { if w.YMin >= marginTop && w.YMax <= pageHeight-marginBottom { filtered = append(filtered, w) } } return filtered }