package pdfcontent

import (
	"math"
	"regexp"
	"slices"
	"sort"
	"strings"
)

var (
	// orderedListRe matches a single-character ordered-list marker followed
	// by a period, e.g. "1.", "a." or "B.".
	orderedListRe = regexp.MustCompile(`^[0-9a-zA-Z]\.$`)

	// parenNumberRe matches a parenthesized number marker, e.g. "(1)" or "(12)".
	parenNumberRe = regexp.MustCompile(`^\([0-9]+\)$`)
)

// isList reports whether text is, in its entirety, an ordered-list marker
// ("1.", "a.") or a parenthesized number ("(1)").
func isList(text string) bool {
	return orderedListRe.MatchString(text) || parenNumberRe.MatchString(text)
}

// isBulletChar reports whether s is exactly one bullet/list-marker glyph.
//
// Besides a fixed set of common bullet characters, any single code point in
// the BMP Private Use Area (U+E000–U+F8FF) or in Supplementary Private Use
// Area-A (U+F0000–U+FFFFD) counts as a bullet, since symbol fonts commonly
// map their bullet glyphs there.
func isBulletChar(s string) bool {
	switch s {
	case "•", "‣", "⁃", "◦", "●", "▪", "▫", "-", "–", "—",
		"✦", "✧", "✩", "✪", "★", "☆", "¤", "§", "¶":
		return true
	}

	// The private-use check is per code point, not per byte: these runes
	// always encode to three or four bytes of UTF-8.
	runes := []rune(s)
	if len(runes) != 1 {
		return false
	}
	r := runes[0]
	return (r >= 0xE000 && r <= 0xF8FF) || (r >= 0xF0000 && r <= 0xFFFFD)
}

// FormatRule renders the item at items[index] into sb and returns how many
// items it consumed. A return value of 0 means "not consumed": the next rule
// in FormatRules is tried for the same index. A rule may still have written
// a prefix (line break, indentation) to sb before returning 0.
type FormatRule func(items []TextItem, index int, sb *strings.Builder) int

// FormatRules is the ordered rule chain applied to every item; the first
// rule that returns a positive count wins.
var FormatRules = []FormatRule{
	lineBreakFormatRule,
	removeFieldLinesFormatRule,
	bulletListFormatRule,
	headingFormatRule,
	boldFormatRule,
	inlineTextFormatRule,
}

// lineBreakFormatRule emits the line/paragraph/page break (if any) that
// belongs in front of items[index].
//
// It consumes the item only when the item is a Y<0 column separator (which
// becomes a blank line). In every other case it returns 0 so that a later
// rule renders the item's text.
func lineBreakFormatRule(items []TextItem, index int, sb *strings.Builder) int {
	item := items[index]

	// Y=-1 separator between columns — produce blank line.
	if item.Y < 0 {
		sb.WriteString("\n\n")
		return 1
	}
	if index == 0 {
		return 0
	}

	lastItem := items[index-1]

	// Previous item was a column separator; it already wrote the gap.
	if lastItem.Y < 0 {
		return 0
	}

	if lastItem.Page != item.Page {
		// Walk back over the previous page to find the largest OrigY,
		// i.e. the bottom of the content on that page.
		prevPage := lastItem.Page
		lastOrigY := lastItem.OrigY
		for j := index - 1; j >= 0 && items[j].Page == prevPage; j-- {
			if items[j].OrigY > lastOrigY {
				lastOrigY = items[j].OrigY
			}
		}

		// More than 25% of trailing blank space on the previous page is
		// rendered as a horizontal rule; otherwise a plain paragraph break.
		pageHeight := lastItem.PageHeight
		if pageHeight > 0 && lastOrigY < pageHeight*0.75 {
			sb.WriteString("\n\n---\n\n")
		} else {
			sb.WriteString("\n\n")
		}
		return 0
	}

	origYDiff := item.OrigY - lastItem.OrigY // signed: positive when moving down the page
	flatYDiff := math.Abs(item.Y - lastItem.Y)
	threshold := item.FontSize
	formFieldInvolved := item.FormField || lastItem.FormField

	// Paragraph break: large flattened Y gap, or large OrigY gap (but not
	// when either item is a form field, since form fields can have a
	// different OrigY while sharing the same flattened Y).
	if flatYDiff > threshold*2 || (origYDiff > threshold*2 && !formFieldInvolved) {
		sb.WriteString("\n\n")
		return 0
	}

	// Line break: items on different flattened lines, but only when a form
	// field is involved or the items are far apart in OrigY. Adjacent text
	// items close in OrigY stay on the same line.
	if flatYDiff > threshold && (formFieldInvolved || origYDiff > 12) {
		sb.WriteString("\n")
		return 0
	}

	// Same flattened line but different original Y: items from different
	// PDF lines were merged onto one flattened line (e.g. after column
	// flattening or block merging). Separate them, except when:
	//   - a form field is involved (it shares the flattened Y of its
	//     matched text but may have a different OrigY);
	//   - the item is a bullet or list marker (it anchors the start of its
	//     own text and must not be pushed onto a line of its own here);
	//   - both items are short and close in original X (most likely one
	//     PDF line with slight Y drift).
	if flatYDiff < threshold && origYDiff > math.Max(threshold*0.4, 3) && !formFieldInvolved {
		if !isBulletChar(item.Text) && !isList(item.Text) {
			shortItem := len(item.Text) <= 10
			shortLast := len(lastItem.Text) <= 10
			xClose := math.Abs(item.OrigX-lastItem.OrigX) < 100
			if !(shortItem && shortLast && xClose) {
				sb.WriteString("\n")
				return 0
			}
		}
	}

	return 0
}

// inlineTextFormatRule is the fallback rule: it writes the item's text
// followed by a single space and consumes exactly one item.
func inlineTextFormatRule(items []TextItem, index int, sb *strings.Builder) int {
	sb.WriteString(items[index].Text)
	sb.WriteString(" ")
	return 1
}

// Parameters of the list indentation heuristic used by listIndentPrefix.
const (
	// listLeftMargin is the OrigX treated as indentation level zero
	// (presumably a 1-inch margin in PDF points — confirm against the
	// documents this is run on).
	listLeftMargin = 72.0
	// listIndentStep is the horizontal distance that maps to one
	// two-space markdown indent level.
	listIndentStep = 18.0
	// listMaxIndentSpaces caps the emitted indentation.
	listMaxIndentSpaces = 12
)

// listIndentPrefix converts a marker's original X position into a run of
// spaces: two spaces per listIndentStep to the right of listLeftMargin,
// capped at listMaxIndentSpaces. Positions at or left of the margin yield "".
func listIndentPrefix(origX float64) string {
	indentX := math.Floor(origX - listLeftMargin)
	if indentX <= 0 {
		return ""
	}
	spaces := int(math.Round(indentX/listIndentStep)) * 2
	if spaces > listMaxIndentSpaces {
		spaces = listMaxIndentSpaces
	}
	return strings.Repeat(" ", spaces)
}

// bulletListFormatRule renders list markers.
//
//   - Bullet glyphs are consumed and replaced by an indented "- ".
//   - Ordered markers ("a.", "1.") are consumed and written with indentation.
//   - Parenthesized numbers ("(1)") only get their indentation written; the
//     rule returns 0 so a later rule renders the marker text itself.
func bulletListFormatRule(items []TextItem, index int, sb *strings.Builder) int {
	item := items[index]
	text := strings.TrimSpace(item.Text)

	switch {
	case isBulletChar(text):
		sb.WriteString(listIndentPrefix(item.OrigX) + "- ")
		return 1
	case orderedListRe.MatchString(text):
		sb.WriteString(listIndentPrefix(item.OrigX) + text + " ")
		return 1
	case parenNumberRe.MatchString(text):
		sb.WriteString(listIndentPrefix(item.OrigX))
		return 0
	}
	return 0
}

// headingFormatRule renders items whose font size maps to a heading level.
// The first heading item on a line gets the "#" prefix; following items with
// the same level, Y and page continue that heading without a new prefix.
func headingFormatRule(items []TextItem, index int, sb *strings.Builder) int {
	item := items[index]
	level := getTextItemHeading(item)
	if level == 0 {
		return 0
	}

	if index > 0 {
		prev := items[index-1]
		if getTextItemHeading(prev) == level && prev.Y == item.Y && prev.Page == item.Page {
			sb.WriteString(item.Text + " ")
			return 1
		}
	}

	sb.WriteString(strings.Repeat("#", level) + " " + item.Text + " ")
	return 1
}

// getTextItemHeading maps an item's font size to a markdown heading level:
// above 18 → 1, above 16 → 2, otherwise 0 (not a heading).
func getTextItemHeading(item TextItem) int {
	switch {
	case item.FontSize > 18.0:
		return 1
	case item.FontSize > 16.0:
		return 2
	}
	return 0
}

// boldFontSize is the font size above which text is rendered bold.
const boldFontSize = 15.0

// boldFormatRule groups the item and the following items (at most 100) that
// are also larger than boldFontSize and share its page and Y into one
// "**...**" span. It returns the number of items in the span, or 0 when the
// item is not bold.
func boldFormatRule(items []TextItem, index int, sb *strings.Builder) int {
	item := items[index]
	if item.FontSize <= boldFontSize {
		return 0
	}

	// Look ahead to find all consecutive bold items on the same line/page.
	end := index + 1
	for end < len(items) && end <= index+100 {
		next := items[end]
		if next.FontSize <= boldFontSize || next.Page != item.Page || next.Y != item.Y {
			break
		}
		end++
	}

	parts := make([]string, 0, end-index)
	for _, it := range items[index:end] {
		parts = append(parts, it.Text)
	}
	sb.WriteString("**" + strings.Join(parts, " ") + "** ")
	return end - index
}

// removeFieldLinesFormatRule drops fill-in underline runs: an item is
// consumed without output when its trimmed text contains more than two
// underscores or the sequence "_.".
func removeFieldLinesFormatRule(items []TextItem, index int, sb *strings.Builder) int {
	text := strings.TrimSpace(items[index].Text)
	if strings.Count(text, "_") > 2 || strings.Contains(text, "_.") {
		return 1
	}
	return 0
}

// FormFieldPos represents a form field's position on a page for markdown injection.
type FormFieldPos struct {
	Name   string  // PDF field name
	PosX   float64 // X position (bottom-left of field rect)
	PosY   float64 // Y position (bottom-left of field rect)
	Width  float64
	Height float64
	Page   int
	Label  string // Annotation label like "+AA"
}

// ToMarkdown converts pdftotext page data and form fields into markdown.
//
// Pipeline, per page:
//  1. Convert blocks/lines/words to TextItems (with OrigY preserved).
//  2. Flatten columns using pre-grouped lines from blocks; multi-column
//     blocks are emitted column by column, separated by Y=-1 marker items.
//  3. Match form fields to the nearest text item by original position.
//  4. Inject form fields at their computed flattened Y positions.
//
// The items of all pages are then rendered in one pass by writeMarkdownItems.
// Fields are looked up by their 1-based Page number.
func ToMarkdown(pages []Page, fields []FormFieldPos) string {
	// Group fields by page (1-based).
	pageFields := make(map[int][]FormFieldPos)
	for _, f := range fields {
		pageFields[f.Page] = append(pageFields[f.Page], f)
	}

	// Document-wide body font size, approximated by the mean word-box
	// height. It sizes injected form fields and drives the form-field and
	// bullet placement thresholds; 12 is the fallback for empty input.
	sum, count := 0.0, 0
	for _, page := range pages {
		for _, flow := range page.Flows {
			for _, block := range flow.Blocks {
				for _, line := range block.Lines {
					for _, w := range line.Words {
						sum += w.YMax - w.YMin
						count++
					}
				}
			}
		}
	}
	bodyFontSize := 12.0
	if count > 0 {
		bodyFontSize = sum / float64(count)
	}

	var sb strings.Builder
	var allItems []TextItem

	for pi, page := range pages {
		pageNum := pi + 1

		// Convert blocks to grouped lines preserving block boundaries.
		pageItems, blockLines := blocksToGroupedLines(page, pageNum, 50, 90)
		avgFontSize := averageFontSize(pageItems)

		// Merge narrow blocks with the same column layout; multi-column
		// results are split below, wide blocks are flattened normally.
		merged := mergeSameColumnBlocks(blockLines, page.Width)

		// NOTE(review): this line height is based on the page average,
		// while matchFormFieldToText derives its own from bodyFontSize —
		// the two can differ on pages with atypical font sizes.
		lineHeight := math.Ceil(avgFontSize) + 4
		flatLine := 0
		separator := TextItem{Text: "", Y: -1, Page: pageNum, PageHeight: page.Height}

		// First and last OrigY of every block, for gap detection. Blocks
		// without items keep the zero value.
		firstOrigYs := make([]float64, len(merged))
		lastOrigYs := make([]float64, len(merged))
		for bi, m := range merged {
			if len(m.Lines) == 0 {
				continue
			}
			if first := m.Lines[0]; len(first) > 0 {
				firstOrigYs[bi] = first[0].OrigY
			}
			if last := m.Lines[len(m.Lines)-1]; len(last) > 0 {
				lastOrigYs[bi] = last[len(last)-1].OrigY
			}
		}

		// paragraphGapAfter reports whether block bi is followed by a
		// paragraph gap: it is the last block, or the next block starts
		// more than one line height below it (i.e. is not a continuation).
		paragraphGapAfter := func(bi int) bool {
			return bi == len(merged)-1 || firstOrigYs[bi+1]-lastOrigYs[bi] > lineHeight
		}

		var flattened []TextItem
		for bi, m := range merged {
			if isMultiColumnBlock(m) {
				// Separate multi-column block from preceding content.
				if len(flattened) > 0 {
					flattened = append(flattened, separator)
				}
				if columns := splitBlockColumns(m); len(columns) >= 2 {
					for ci, colLines := range columns {
						if ci > 0 {
							flattened = append(flattened, separator)
						}
						for _, line := range colLines {
							flattened = append(flattened, assignLinePositions(line, float64(flatLine)*lineHeight)...)
							flatLine++
						}
					}
					flatLine++ // paragraph gap after column block
					// Separate from following blocks.
					flattened = append(flattened, separator)
					continue
				}
				// Fewer than two columns came back: not truly
				// multi-column, fall through to normal flattening.
			}

			// Normal block: flatten, then shift onto the running line grid.
			for _, it := range FlattenLines(m.Lines, page.Width, avgFontSize) {
				it.Y += float64(flatLine) * lineHeight
				flattened = append(flattened, it)
			}
			flatLine += len(m.Lines)
			if paragraphGapAfter(bi) {
				flatLine++
			}
		}

		// Build index of text items for form field matching.
		posIndex := buildPositionIndex(flattened)

		// Inject form fields matched to nearest text item by original X+Y.
		for _, f := range pageFields[pageNum] {
			flattenedY := matchFormFieldToText(f.PosX, f.PosY, posIndex, bodyFontSize)

			// Checkbox form fields (small, roughly square) have their X at
			// the left edge of the box, which often aligns with the "(" of
			// a "( ) Label" pattern. Because of coordinate differences
			// between pdfcpu and pdftotext the checkbox X may be slightly
			// less than that of "(", sorting it before "(" in mergeByY.
			// Shifting OrigX right by 2pt places it inside the parentheses.
			origX := f.PosX
			if f.Width <= 25 && f.Height <= 25 {
				origX += 2
			}

			flattened = append(flattened, TextItem{
				Text:       f.Label,
				X:          math.Floor(f.PosX),
				Y:          flattenedY,
				XMax:       math.Floor(f.PosX + f.Width),
				YMax:       flattenedY + (f.Height / 2),
				OrigX:      origX,
				OrigY:      f.PosY,
				FontSize:   bodyFontSize,
				Page:       f.Page,
				PageHeight: page.Height,
				FormField:  true,
			})
		}

		// Merge form fields into their correct Y positions within each
		// column block, preserving Y=-1 separators between columns.
		flattened = mergeFormFields(flattened, bodyFontSize)

		// Keep Y=-1 column separators; remove other Page=0 items. Only the
		// new page needs filtering — earlier pages already were.
		flattened = slices.DeleteFunc(flattened, func(item TextItem) bool {
			return item.Page == 0 && item.Y >= 0
		})
		allItems = append(allItems, flattened...)
	}

	writeMarkdownItems(&sb, allItems, bodyFontSize)
	return sb.String()
}

// buildPositionIndex returns the items usable as anchors for form field
// placement: every item that is neither a form field nor empty text (the
// latter excludes column separators). The result is nil when nothing
// qualifies.
func buildPositionIndex(items []TextItem) []TextItem {
	var idx []TextItem
	for _, item := range items {
		if item.FormField || item.Text == "" {
			continue
		}
		idx = append(idx, item)
	}
	return idx
}

// matchFormFieldToText returns the flattened Y at which a form field with
// the given original position should be placed.
//
// Candidates are the items of idx whose OrigY lies within maxOrigYRange of
// the field, excluding words that mix text with more than two underscores.
// Matching is two-pass: first the smallest distance between the field's Y
// and a candidate's vertical center (bestDist) is found; then, among all
// candidates within yTolerance of that distance, the one closest in X wins
// and supplies the flattened Y.
//
// The result is that Y, moved down by one line height when
//   - bestDist exceeds inlineDist and the field lies below the Y-closest
//     candidate's OrigY, or
//   - the chosen candidate is more than maxInlineX away horizontally.
//
// When idx is empty or no candidate qualifies, floor(fieldOrigY) is returned.
func matchFormFieldToText(fieldOrigX, fieldOrigY float64, idx []TextItem, bodyFontSize float64) float64 {
	const (
		// maxMatchDist: beyond this Y distance the match is not treated as
		// inline at all; the chosen item's flattened Y is returned as is.
		maxMatchDist = 30.0
		// inlineDist: maximum Y distance for truly inline placement.
		inlineDist = 15.0
		// maxInlineX: maximum X distance for inline placement. Fields far
		// from their matched text in X (e.g. a checkbox at the right edge
		// matching a label at the left edge) go on the next line.
		maxInlineX = 100.0
		// yTolerance: candidates this close to the best Y distance compete
		// on X distance, so a perfect X match at nearly the same Y beats
		// a marginally closer Y match elsewhere on the line.
		yTolerance = 3.0
		// maxOrigYRange: candidates must lie within this OrigY range,
		// which prevents matching text from distant page sections.
		maxOrigYRange = 30.0
	)

	fallbackY := math.Floor(fieldOrigY)
	if len(idx) == 0 {
		return fallbackY
	}
	lineHeight := math.Ceil(bodyFontSize) + 4

	// eligible applies the candidate filters shared by every pass.
	eligible := func(item TextItem) bool {
		// Skip mixed underscore+text words (e.g. "name}___,"). Pure
		// underscore runs stay: they anchor fields on underline rows.
		uc := strings.Count(item.Text, "_")
		if uc > 2 && len(item.Text) > uc {
			return false
		}
		if math.Abs(fieldOrigY-item.OrigY) > maxOrigYRange {
			return false
		}
		return true
	}
	// centerDist is the distance from the field's Y to the item's center.
	centerDist := func(item TextItem) float64 {
		textCenterY := item.OrigY + item.FontSize/2
		return math.Abs(fieldOrigY - textCenterY)
	}

	// Pass 1: best Y distance, remembering the OrigY of the first candidate
	// that achieves it.
	bestDist := math.MaxFloat64
	bestBaseline := math.MaxFloat64
	for _, item := range idx {
		if !eligible(item) {
			continue
		}
		if d := centerDist(item); d < bestDist {
			bestDist = d
			bestBaseline = item.OrigY
		}
	}

	// Pass 2: among near-best candidates prefer the smallest X distance.
	closestY := fallbackY
	closestOrigX := math.MaxFloat64
	for _, item := range idx {
		if !eligible(item) || centerDist(item) > bestDist+yTolerance {
			continue
		}
		if math.Abs(fieldOrigX-item.OrigX) < math.Abs(fieldOrigX-closestOrigX) {
			closestY = item.Y
			closestOrigX = item.OrigX
		}
	}

	if bestDist > maxMatchDist {
		// No text within inline range — still use the closest item's
		// flattened Y so the field stays in flattened coordinate space.
		return closestY
	}

	// Too far for inline: bump to the next flattened line, but only when
	// the field is BELOW the matched text. Fields positioned above it
	// (e.g. a checkbox before a column header) stay inline.
	if bestDist > inlineDist && fieldOrigY > bestBaseline {
		return closestY + lineHeight
	}

	// Too far away horizontally for inline placement.
	if math.Abs(fieldOrigX-closestOrigX) > maxInlineX {
		return closestY + lineHeight
	}

	return closestY
}

// repositionBullets moves bullet/list-marker items that are stranded at the
// end of one flattened line to the flattened line of the text that follows
// them, so the marker renders in front of its own text instead of after the
// previous item's wrapped text.
//
// A marker is moved when it shares a flattened line with the preceding
// non-separator item, sits on a different flattened line than the next
// non-separator item, and that next item is within two thresholds of it in
// OrigY. The marker is then placed directly before that next item (behind
// any separators in between) and takes over its Y.
//
// The input slice is not modified; a modified copy is returned.
func repositionBullets(items []TextItem, bodyFontSize float64) []TextItem {
	items = slices.Clone(items)
	threshold := bodyFontSize

	for i := 0; i < len(items); i++ {
		cur := items[i]
		if !isBulletChar(cur.Text) && !isList(cur.Text) {
			continue
		}

		// Previous non-separator item.
		prevIdx := i - 1
		for prevIdx >= 0 && items[prevIdx].Y < 0 {
			prevIdx--
		}
		if prevIdx < 0 {
			continue // nothing precedes the marker
		}
		prev := items[prevIdx]

		// Next non-separator item (the marker's associated text).
		nextIdx := i + 1
		for nextIdx < len(items) && items[nextIdx].Y < 0 {
			nextIdx++
		}
		if nextIdx >= len(items) {
			continue // no text after the marker
		}
		next := items[nextIdx]

		sameLineAsPrev := math.Abs(cur.Y-prev.Y) < threshold
		differentLineFromNext := math.Abs(cur.Y-next.Y) >= threshold
		origYMatchesNext := math.Abs(next.OrigY-cur.OrigY) <= threshold*2
		if !(sameLineAsPrev && differentLineFromNext && origYMatchesNext) {
			continue
		}

		// Rotate items[i:nextIdx] left by one: the separators between the
		// marker and its text shift down a slot and the marker lands
		// directly in front of the text. The slice length is unchanged.
		cur.Y = next.Y
		copy(items[i:nextIdx-1], items[i+1:nextIdx])
		items[nextIdx-1] = cur
	}
	return items
}

// fixOrphanedBullets post-processes the rendered markdown to handle bullet
// markers that got separated from their associated text. Common causes:
// bullets at the end of a text block after column flattening, or bullets
// that appear inline at the end of a wrapped text line.
//
// Strategy:
//  1. Remove orphan bullets (standalone "-" lines and a trailing " -" at the
//     end of text lines), counting them.
//  2. Once the first "- " line has been seen, prepend "- " to following
//     text lines until as many bullets were re-attached as were removed. A
//     line shorter than 20 bytes directly after a bullet line is taken to
//     be a wrapped continuation and is left alone.
//
// The input is returned unchanged when no orphan bullet is found.
func fixOrphanedBullets(md string) string {
	lines := strings.Split(md, "\n")
	cleaned := make([]string, 0, len(lines))
	orphanCount := 0

	// Phase 1: collect orphan bullets and clean lines.
	for _, line := range lines {
		// Standalone orphan bullet line (just whitespace + "-").
		if strings.TrimSpace(line) == "-" {
			orphanCount++
			continue
		}

		// Trailing bullet at end of line: "text... -". The dash must be
		// preceded by a space, and what remains must be non-empty and not
		// itself end in a dash.
		stripped := strings.TrimRight(line, " \t\r")
		if len(stripped) > 4 && strings.HasSuffix(stripped, "-") {
			before := stripped[:len(stripped)-1]
			if strings.HasSuffix(before, " ") {
				text := strings.TrimRight(before, " \t")
				if text != "" && !strings.HasSuffix(text, "-") {
					orphanCount++
					cleaned = append(cleaned, text)
					continue
				}
			}
		}

		cleaned = append(cleaned, line)
	}

	if orphanCount == 0 {
		return md
	}

	// continuationMaxLen: after a bullet line, shorter lines are wrapped
	// continuations of that bullet; longer lines are likely new items.
	const continuationMaxLen = 20

	// Phase 2: prepend orphan bullets to paragraphs that need them.
	result := make([]string, 0, len(cleaned))
	reattached := 0
	lastNonEmptyWasBullet := false
	seenFirstBullet := false // only add orphans AFTER seeing the first "- " line

	for _, line := range cleaned {
		t := strings.TrimSpace(line)
		switch {
		case t == "":
			result = append(result, line)

		case strings.HasPrefix(t, "- ") || t == "-":
			lastNonEmptyWasBullet = true
			seenFirstBullet = true
			result = append(result, line)

		case !seenFirstBullet:
			// Haven't entered a bullet section yet — don't add bullets.
			result = append(result, line)

		case lastNonEmptyWasBullet && len(t) < continuationMaxLen:
			// Short line — likely a wrapped continuation of the bullet.
			lastNonEmptyWasBullet = false
			result = append(result, line)

		case reattached < orphanCount:
			// New bullet item: keep the line's leading spaces as indent.
			reattached++
			indent := line[:len(line)-len(strings.TrimLeft(line, " "))]
			result = append(result, indent+"- "+t)
			lastNonEmptyWasBullet = true

		default:
			result = append(result, line)
		}
	}

	return strings.Join(result, "\n")
}

// writeMarkdownItems converts text items to markdown.
// Items are expected to be in correct line order (from FlattenColumns).
// Within each line, items are sorted by X for proper left-to-right text.
// The Y=-1 separator between columns is treated as a blank line.
func writeMarkdownItems(sb *strings.Builder, items []TextItem, bodyFontSize float64) { if len(items) == 0 { return } items = repositionBullets(items, bodyFontSize) for i := 0; i < len(items); { consumed := false for _, rule := range FormatRules { n := rule(items, i, sb) if n > 0 { i += n consumed = true break } } if !consumed { i++ } } // Post-process: fix orphaned bullets that couldn't be repositioned md := sb.String() md = fixOrphanedBullets(md) sb.Reset() sb.WriteString(md) } // splitColumnBlocks splits items at Y=-1 separators into column blocks. func splitColumnBlocks(items []TextItem) [][]TextItem { var blocks [][]TextItem var current []TextItem for _, item := range items { if item.Y < 0 { if len(current) > 0 { blocks = append(blocks, current) current = nil } } else { current = append(current, item) } } if len(current) > 0 { blocks = append(blocks, current) } return blocks } // mergeFormFields inserts form field items (marked FormField=true, appended // at the end) into their correct Y positions within each column block, // preserving Y=-1 separators. 
func mergeFormFields(items []TextItem, fontSize float64) []TextItem { // Separate form fields from text items var formFields []TextItem var textItems []TextItem for _, item := range items { if item.FormField { formFields = append(formFields, item) } else { textItems = append(textItems, item) } } if len(formFields) == 0 { return items } // Split text items into column blocks at Y=-1 separators type blockInfo struct { items []TextItem isSep bool } var blocks []blockInfo var current []TextItem for _, item := range textItems { if item.Y < 0 { blocks = append(blocks, blockInfo{items: current, isSep: false}) blocks = append(blocks, blockInfo{isSep: true}) current = nil } else { current = append(current, item) } } blocks = append(blocks, blockInfo{items: current, isSep: false}) // Compute Y ranges, OrigX ranges, and column split boundaries for each block type blockMeta struct { yMin float64 yMax float64 origYMin float64 // original Y range, unchanged by column splitting origYMax float64 xMin float64 xMax float64 lineYs []float64 rightSplitX float64 // fields >= rightSplitX should go to the next column } var metas []blockMeta for bi, b := range blocks { if b.isSep { metas = append(metas, blockMeta{}) continue } if len(b.items) == 0 { metas = append(metas, blockMeta{yMin: math.MaxFloat64, yMax: -math.MaxFloat64, origYMin: math.MaxFloat64, origYMax: -math.MaxFloat64}) continue } yMin, yMax := b.items[0].Y, b.items[0].Y origYMin, origYMax := b.items[0].OrigY, b.items[0].OrigY xMin, xMax := b.items[0].OrigX, b.items[0].OrigX ySet := make(map[float64]bool) for _, it := range b.items { if it.Y < yMin { yMin = it.Y } if it.Y > yMax { yMax = it.Y } if it.OrigY < origYMin { origYMin = it.OrigY } if it.OrigY > origYMax { origYMax = it.OrigY } if it.OrigX < xMin { xMin = it.OrigX } if it.OrigX > xMax { xMax = it.OrigX } ySet[it.Y] = true } var lineYs []float64 for y := range ySet { lineYs = append(lineYs, y) } // Infer column split boundaries from adjacent separator blocks. 
// When two column blocks are separated by Y=-1, use the adjacent // block's edge as the split boundary. This prevents overlap where // left column text extends into right column territory. // Only apply to LEFT column blocks (xMax < next block's xMin). rightSplitX := 0.0 // fields >= rightSplitX should go to the next block if bi+1 < len(blocks) && blocks[bi+1].isSep && bi+2 < len(blocks) { next := blocks[bi+2] if !next.isSep && len(next.items) > 0 { nextXMin := next.items[0].OrigX for _, it := range next.items { if it.OrigX < nextXMin { nextXMin = it.OrigX } } // Only apply rightSplitX to left column block if nextXMin > xMax { rightSplitX = nextXMin } } } // Cap effective xMax at right split boundary effectiveXMax := xMax if rightSplitX > 0 && rightSplitX < xMax { effectiveXMax = rightSplitX } metas = append(metas, blockMeta{yMin: yMin, yMax: yMax, origYMin: origYMin, origYMax: origYMax, xMin: xMin, xMax: effectiveXMax, lineYs: lineYs, rightSplitX: rightSplitX}) } // Assign each form field to the best block, and remap its flatY // to match the nearest text item's flatY within that block. // This is necessary because form field matching happens before column // splitting, so the field's flatY may not match any text line in the // correct column block after splitting. for _, ff := range formFields { bestIdx := -1 bestScore := -math.MaxFloat64 for bi, m := range metas { if blocks[bi].isSep { continue } // Y distance: use OrigY (unchanged by column splitting) // instead of flatY, so fields match the correct column block. yDist := 0.0 if ff.OrigY < m.origYMin { yDist = m.origYMin - ff.OrigY } else if ff.OrigY > m.origYMax { yDist = ff.OrigY - m.origYMax } // Hard Y-distance cutoff: skip blocks that are too far away. // This prevents narrow column blocks from absorbing fields // that belong to adjacent full-width blocks. if yDist > 30 { continue } // Column split enforcement: if this block has a right split // boundary and the form field is on or past it, hard-reject. 
// This prevents left-column blocks from absorbing right-column // form fields when text extends across the column gap. if m.rightSplitX > 0 && ff.OrigX >= m.rightSplitX { continue } // X distance: penalize if outside block's X range (using OrigX) xDist := 0.0 if ff.OrigX < m.xMin { xDist = m.xMin - ff.OrigX } else if ff.OrigX > m.xMax { xDist = ff.OrigX - m.xMax } // X boundary distance: prefer block whose left edge (xMin) is // closest to the field's X. This correctly assigns form fields // to their column. xMinDist := math.Abs(ff.OrigX - m.xMin) // Prefer narrow blocks (column blocks) when X fits blockWidth := m.xMax - m.xMin narrowBonus := 0.0 if xDist == 0 && blockWidth < 300 { narrowBonus = 100 } score := -xMinDist + narrowBonus - yDist*10 - xDist if score > bestScore { bestScore = score bestIdx = bi } } if bestIdx >= 0 && !blocks[bestIdx].isSep { // Remap form field's flatY to the nearest text item's flatY // within this block. First, check if the field's matched flatY // already has text items at that Y — if so, keep it. Only remap // if the field came from a different column block (its flatY // doesn't match any text in this block). existingFlatY := ff.Y hasMatchAtY := false for _, item := range blocks[bestIdx].items { if math.Abs(item.Y-existingFlatY) < fontSize*0.5 { hasMatchAtY = true break } } if !hasMatchAtY { // Remap: use raw OrigY distance for matching. // Text center Y (OrigY + fontSize/2) can misplace fields that // sit between two text lines (e.g., email field between line 1 // and line 2 of a multi-line block) — center-based matching // pulls them toward the line with the closer center, but the // field often belongs to the lower line. Raw OrigY distance // correctly snaps inter-line fields to the physically nearer line. 
bestFlatY := existingFlatY bestRawDist := math.MaxFloat64 for _, item := range blocks[bestIdx].items { rawDist := math.Abs(ff.OrigY - item.OrigY) if rawDist < bestRawDist { bestRawDist = rawDist bestFlatY = item.Y } } ff.Y = bestFlatY } blocks[bestIdx].items = mergeByY(blocks[bestIdx].items, []TextItem{ff}, fontSize) } } // Reconstruct var result []TextItem for bi, b := range blocks { if b.isSep { // Infer page from adjacent block items pg := 0 if bi > 0 && len(blocks[bi-1].items) > 0 { pg = blocks[bi-1].items[0].Page } result = append(result, TextItem{Text: "", Y: -1, Page: pg}) } else { result = append(result, b.items...) } } return result } // mergeByY merges form fields into text items, preserving Y order. // Items with similar Y are kept adjacent so writeMarkdownItems groups them on the same line. // Within each Y-level, items (text + form fields) are sorted by OrigX for proper left-to-right order. // Wide text input fields (width > 50) are placed at the end of the line only when // the field's X position is to the right of ALL text items on the line. If there's // text further right (multi-label lines like "Case No.: [F] Division: [F]"), // fields are sorted by OrigX to interleave correctly between labels. func mergeByY(texts []TextItem, formFFs []TextItem, fontSize float64) []TextItem { threshold := fontSize * 0.5 sort.Slice(texts, func(i, j int) bool { return texts[i].Y < texts[j].Y }) sort.Slice(formFFs, func(i, j int) bool { return formFFs[i].Y < formFFs[j].Y }) var result []TextItem tIdx, fIdx := 0, 0 for tIdx < len(texts) || fIdx < len(formFFs) { if fIdx >= len(formFFs) { result = append(result, texts[tIdx:]...) break } if tIdx >= len(texts) { result = append(result, formFFs[fIdx:]...) break } if texts[tIdx].Y <= formFFs[fIdx].Y { // Text comes first or is near form field. // Collect all text at this Y level. 
currentY := texts[tIdx].Y var textAtY []TextItem for tIdx < len(texts) && math.Abs(texts[tIdx].Y-currentY) < threshold { textAtY = append(textAtY, texts[tIdx]) tIdx++ } // Compute max text X on this line for trailing decision maxTextX := 0.0 for _, t := range textAtY { if t.XMax > maxTextX { maxTextX = t.XMax } } // Collect form fields at this Y var ffAtY []TextItem var ffTrailing []TextItem for fIdx < len(formFFs) && math.Abs(formFFs[fIdx].Y-currentY) < threshold { fieldWidth := formFFs[fIdx].XMax - formFFs[fIdx].X if fieldWidth > 50 && formFFs[fIdx].X > maxTextX { ffTrailing = append(ffTrailing, formFFs[fIdx]) } else { ffAtY = append(ffAtY, formFFs[fIdx]) } fIdx++ } var lineItems []TextItem lineItems = append(lineItems, textAtY...) lineItems = append(lineItems, ffAtY...) sort.SliceStable(lineItems, func(i, j int) bool { return lineItems[i].OrigX < lineItems[j].OrigX }) result = append(result, lineItems...) result = append(result, ffTrailing...) } else { // Form field comes before text — collect both at this Y. currentY := formFFs[fIdx].Y // First, collect all form fields and text at this Y var ffAtY []TextItem var ffTrailing []TextItem var textAtY []TextItem for fIdx < len(formFFs) && math.Abs(formFFs[fIdx].Y-currentY) < threshold { ffAtY = append(ffAtY, formFFs[fIdx]) // temporarily add all fIdx++ } for tIdx < len(texts) && math.Abs(texts[tIdx].Y-currentY) < threshold { textAtY = append(textAtY, texts[tIdx]) tIdx++ } // Compute max text X for trailing decision maxTextX := 0.0 for _, t := range textAtY { if t.XMax > maxTextX { maxTextX = t.XMax } } // Reclassify: only trail if field is wide AND starts right of all text ffAtY, ffTrailing = nil, nil for _, ff := range ffAtY { fieldWidth := ff.XMax - ff.X if fieldWidth > 50 && ff.X > maxTextX { ffTrailing = append(ffTrailing, ff) } else { ffAtY = append(ffAtY, ff) } } var lineItems []TextItem lineItems = append(lineItems, textAtY...) lineItems = append(lineItems, ffAtY...) 
sort.SliceStable(lineItems, func(i, j int) bool { return lineItems[i].OrigX < lineItems[j].OrigX }) result = append(result, lineItems...) result = append(result, ffTrailing...) } } return result }