package pdfcontent import ( "cmp" "math" "slices" ) // FlattenColumns detects multi-column layouts and reorders text for single-column reading. // // Algorithm: // 1. Group items into lines by Y proximity // 2. For each line, find large horizontal gaps (>15% of page width) // 3. Detect "column break regions" — X ranges where gaps consistently appear // across 3+ consecutive lines // 4. Split multi-column lines at break regions into left/right parts // 5. Output all left parts top-to-bottom, then all right parts top-to-bottom // (preserving single-column lines in their original positions) // FlattenLines takes pre-grouped lines (e.g., from pdftotext -bbox-layout blocks) // and flattens multi-column layouts to single-column reading order. Returns items // with flattened Y/X positions and preserved OrigY for form field matching. func FlattenLines(lines [][]TextItem, pageWidth float64, avgFontSize float64) []TextItem { if len(lines) == 0 { return nil } if len(lines) < 3 { return assignFlattenedPositions(lines, avgFontSize) } // Step 2: Find gaps in each line minGap := pageWidth * 0.15 var allGaps []lineGap for li, line := range lines { gaps := findLineGaps(line, minGap) for _, g := range gaps { allGaps = append(allGaps, lineGap{ lineIndex: li, gapStart: g.start, gapEnd: g.end, gapWidth: g.width, }) } } // Step 3: Find column break regions breakRegions := findBreakRegions(allGaps, lines, pageWidth) if len(breakRegions) == 0 { return assignFlattenedPositions(lines, avgFontSize) } // Step 4: Split lines and reorder return reorderWithBreaks(lines, breakRegions, avgFontSize, pageWidth) } // FlattenColumns detects multi-column layouts and reorders text for single-column reading. // Groups items into lines by Y proximity, then delegates to FlattenLines. 
func FlattenColumns(items []TextItem, pageWidth float64) []TextItem { if len(items) <= 1 { return items } avgFontSize := averageFontSize(items) lineThreshold := avgFontSize * 0.5 lines := groupIntoLines(items, lineThreshold) return FlattenLines(lines, pageWidth, avgFontSize) } // gap represents a horizontal gap in a line type gap struct { start float64 end float64 width float64 } // lineGap tracks a gap found on a specific line. type lineGap struct { lineIndex int gapStart float64 gapEnd float64 gapWidth float64 } // breakRegion represents a consistent column break at a specific X position type breakRegion struct { xMin float64 xMax float64 // Line indices that have a gap in this region, in order lineIndices []int } func averageFontSize(items []TextItem) float64 { if len(items) == 0 { return 12 } sum := 0.0 for _, it := range items { sum += it.FontSize } return sum / float64(len(items)) } func groupIntoLines(items []TextItem, threshold float64) [][]TextItem { // Sort by Y, then X sorted := make([]TextItem, len(items)) copy(sorted, items) sortByYThenX(sorted) var lines [][]TextItem currentLine := []TextItem{sorted[0]} for i := 1; i < len(sorted); i++ { if math.Abs(sorted[i].Y-currentLine[0].Y) < threshold { currentLine = append(currentLine, sorted[i]) } else { // Sort line by X sortByX(currentLine) lines = append(lines, currentLine) currentLine = []TextItem{sorted[i]} } } sortByX(currentLine) lines = append(lines, currentLine) return lines } func sortByYThenX(items []TextItem) { for i := 1; i < len(items); i++ { for j := i; j > 0 && items[j].Y < items[j-1].Y; j-- { items[j], items[j-1] = items[j-1], items[j] } } } func sortByX(items []TextItem) { for i := 1; i < len(items); i++ { for j := i; j > 0 && items[j].X < items[j-1].X; j-- { items[j], items[j-1] = items[j-1], items[j] } } } // isUnderscoreText returns true if the text is mostly underscores // (form underline decorations like "________," or "________"). 
func isUnderscoreText(text string) bool { uc := 0 for _, r := range text { if r == '_' { uc++ } } return uc >= 3 && float64(uc)/float64(len(text)) > 0.5 } func findLineGaps(line []TextItem, minGap float64) []gap { if len(line) <= 1 { return nil } var rawGaps []gap for i := 1; i < len(line); i++ { // Use edge-to-edge gap: right edge of previous word to left edge of next word gw := line[i].X - line[i-1].XMax if gw >= minGap { // Skip gaps bridged by underscore decorations. These are form // underline fields (e.g., "_____files_____") that span the gap // between left and right text on the same logical line. They are // not genuine column breaks. gapStart := line[i-1].XMax gapEnd := line[i].X gapBridged := false for j := i - 1; j < len(line) && j >= 0; j++ { if line[j].XMax > gapStart && line[j].X < gapEnd { if isUnderscoreText(line[j].Text) { gapBridged = true break } } if line[j].XMax > gapEnd { break } } if !gapBridged { rawGaps = append(rawGaps, gap{ start: gapStart, end: gapEnd, width: gw, }) } } } // Merge adjacent/overlapping gaps: if two gaps are close enough // (gap between them < minGap), treat them as one column break. if len(rawGaps) <= 1 { return rawGaps } var merged []gap merged = append(merged, rawGaps[0]) for i := 1; i < len(rawGaps); i++ { prev := &merged[len(merged)-1] cur := rawGaps[i] if cur.start-prev.end < minGap { // Merge: extend previous gap prev.end = cur.end prev.width = prev.end - prev.start } else { merged = append(merged, cur) } } return merged } // findBreakRegions identifies X ranges where gaps consistently appear // across consecutive lines (indicating true column breaks). 
func findBreakRegions(allGaps []lineGap, lines [][]TextItem, pageWidth float64) []breakRegion { if len(allGaps) == 0 { return nil } // Collect unique gap center positions (rounded) type gapInfo struct { center float64 lineIndices []int } // Group gaps by approximate center X (within 15% of page width) var gapCenters []gapInfo for _, g := range allGaps { center := (g.gapStart + g.gapEnd) / 2 found := false for ci := range gapCenters { if math.Abs(center-gapCenters[ci].center) < pageWidth*0.15 { gapCenters[ci].lineIndices = append(gapCenters[ci].lineIndices, g.lineIndex) found = true break } } if !found { gapCenters = append(gapCenters, gapInfo{ center: center, lineIndices: []int{g.lineIndex}, }) } } // Keep only centers with 2+ consecutive lines having gaps var regions []breakRegion for _, gc := range gapCenters { if consecutiveCount(gc.lineIndices) >= 2 { // Find the actual gap boundaries from the lines xMin := pageWidth xMax := 0.0 for _, li := range gc.lineIndices { for _, g := range findLineGaps(lines[li], pageWidth*0.05) { center := (g.start + g.end) / 2 if math.Abs(center-gc.center) < pageWidth*0.1 { if g.start < xMin { xMin = g.start } if g.end > xMax { xMax = g.end } } } } regions = append(regions, breakRegion{ xMin: xMin, xMax: xMax, lineIndices: gc.lineIndices, }) } } return regions } // consecutiveCount finds the longest run of consecutive integers in a sorted list. 
func consecutiveCount(indices []int) int {
	if len(indices) == 0 {
		return 0
	}

	// Work on a sorted, de-duplicated copy so the caller's slice is untouched.
	sorted := slices.Clone(indices)
	slices.Sort(sorted)
	sorted = slices.Compact(sorted)

	longest, run := 1, 1
	for i := 1; i < len(sorted); i++ {
		if sorted[i] == sorted[i-1]+1 {
			run++
			if run > longest {
				longest = run
			}
		} else {
			run = 1
		}
	}
	return longest
}

// reorderWithBreaks splits lines at break regions and reorders for single-column output.
//
// Strategy: group consecutive lines that all have column breaks into "column blocks".
// Extend blocks to adjacent lines that also span the break. Within each block,
// output left column top-to-bottom, then right column. Lines without breaks
// (between blocks) are output as-is.
//
// As items are reordered, their Y values are updated to reflect their position
// in the flattened output. This allows downstream formatting logic (paragraph
// detection, heading rules, etc.) to work correctly on the rearranged text.
func reorderWithBreaks(lines [][]TextItem, regions []breakRegion, fontSize float64, pageWidth float64) []TextItem { if len(regions) == 0 { return assignFlattenedPositions(lines, fontSize) } // Build set of line indices that have a confirmed break breakLineSet := make(map[int]bool) for _, r := range regions { for _, li := range r.lineIndices { breakLineSet[li] = true } } // Extend break blocks: include adjacent lines that span the break region // (have items on both sides of the break) for _, r := range regions { start, end := startLineRange(r.lineIndices) // Extend upward for start > 0 && spansBreak(lines[start-1], r, pageWidth) { start-- breakLineSet[start] = true } // Extend downward for end < len(lines)-1 && spansBreak(lines[end+1], r, pageWidth) { end++ breakLineSet[end] = true } } // Group into segments: consecutive lines with breaks form "column blocks" type segment struct { startLine int endLine int // inclusive isBlock bool } var segments []segment i := 0 for i < len(lines) { if breakLineSet[i] { start := i for i < len(lines) && breakLineSet[i] { i++ } segments = append(segments, segment{startLine: start, endLine: i - 1, isBlock: true}) } else { start := i for i < len(lines) && !breakLineSet[i] { i++ } segments = append(segments, segment{startLine: start, endLine: i - 1, isBlock: false}) } } // lineHeight is the vertical spacing between consecutive lines in flattened output. // Must be > 16 so paragraph gaps (> 2*lineHeight) are detectable by format rules. 
lineHeight := math.Ceil(fontSize) + 4 flattenedLine := 0 var result []TextItem for _, seg := range segments { if seg.isBlock { // Blank line before column block result = append(result, TextItem{Text: "", Y: -1}) // Split each line in block at break region, collect left/right parts var leftLines [][]TextItem var rightLines [][]TextItem for li := seg.startLine; li <= seg.endLine; li++ { left, right := splitLine(lines[li], regions) if len(left) > 0 { leftLines = append(leftLines, left) } if len(right) > 0 { rightLines = append(rightLines, right) } } // Output left column top-to-bottom for _, line := range leftLines { flattenedY := float64(flattenedLine) * lineHeight flattened := assignLinePositions(line, flattenedY) result = append(result, flattened...) flattenedLine++ } // Add blank line between columns if both exist if len(leftLines) > 0 && len(rightLines) > 0 { result = append(result, TextItem{Text: "", Y: -1}) } // Output right column top-to-bottom for _, line := range rightLines { flattenedY := float64(flattenedLine) * lineHeight flattened := assignLinePositions(line, flattenedY) result = append(result, flattened...) flattenedLine++ } // Blank line after column block — skip a line to create paragraph gap flattenedLine++ result = append(result, TextItem{Text: "", Y: -1}) } else { // Normal lines, output with flattened Y positions for li := seg.startLine; li <= seg.endLine; li++ { flattenedY := float64(flattenedLine) * lineHeight flattened := assignLinePositions(lines[li], flattenedY) result = append(result, flattened...) flattenedLine++ } } } return result } // assignFlattenedPositions assigns sequential Y positions to items across all lines, // preserving line grouping. Used for the no-column-detected path. func assignFlattenedPositions(lines [][]TextItem, fontSize float64) []TextItem { lineHeight := math.Ceil(fontSize) + 4 // Merge lines at the same Y position into single lines. 
// pdftotext sometimes splits a single PDF text line into multiple // elements (e.g., around underscores or form fields), // causing text like "{name of business}" to fragment across // separate flattened Y positions. merged := mergeSameYLinesForFlat(lines, fontSize*0.5) var result []TextItem for li, line := range merged { flattenedY := float64(li) * lineHeight flattened := assignLinePositions(line, flattenedY) result = append(result, flattened...) } return result } // mergeSameYLinesForFlat merges lines that share the same Y position // (within tolerance) into single combined lines, sorted by X. func mergeSameYLinesForFlat(lines [][]TextItem, yTol float64) [][]TextItem { if len(lines) <= 1 { return lines } var result [][]TextItem var current []TextItem if len(lines[0]) == 0 { return lines } currentY := lines[0][0].Y for _, line := range lines { if len(line) == 0 { continue } if math.Abs(line[0].Y-currentY) <= yTol { current = append(current, line...) } else { if len(current) > 0 { sortByX(current) result = append(result, current) } current = line currentY = line[0].Y } } if len(current) > 0 { sortByX(current) result = append(result, current) } return result } // assignLinePositions creates a copy of the line items with Y set to the given // flattened Y value and X normalized to a single-column layout. OrigY is preserved. 
func assignLinePositions(line []TextItem, flattenedY float64) []TextItem { if len(line) == 0 { return nil } // Find the leftmost X to use as column baseline minX := line[0].X for _, item := range line { if item.X < minX { minX = item.X } } result := make([]TextItem, len(line)) for i, item := range line { result[i] = TextItem{ Text: item.Text, X: math.Floor(item.X - minX), Y: flattenedY, XMax: math.Floor(item.XMax - minX), YMax: flattenedY + (item.YMax - item.Y), OrigX: item.OrigX, OrigY: item.OrigY, FontSize: item.FontSize, Page: item.Page, PageHeight: item.PageHeight, FormField: item.FormField, } } return result } // startLineRange returns the min and max consecutive line indices from a list. func startLineRange(indices []int) (int, int) { if len(indices) == 0 { return 0, 0 } // Sort sorted := make([]int, len(indices)) copy(sorted, indices) for i := 1; i < len(sorted); i++ { for j := i; j > 0 && sorted[j] < sorted[j-1]; j-- { sorted[j], sorted[j-1] = sorted[j-1], sorted[j] } } // Find longest consecutive run minLine := sorted[0] maxLine := sorted[0] runStart := sorted[0] runEnd := sorted[0] bestLen := 1 for i := 1; i < len(sorted); i++ { if sorted[i] == sorted[i-1]+1 { runEnd = sorted[i] if runEnd-runStart+1 > bestLen { bestLen = runEnd - runStart + 1 minLine = runStart maxLine = runEnd } } else { runStart = sorted[i] runEnd = sorted[i] } } return minLine, maxLine } // spansBreak checks if a line belongs in a column block. // Returns true when: // 1. The line has items on both sides AND a significant gap at the break position // (standard two-column row like "Name: +AH ... Name: +AI") // 2. ALL content stays within one column AND is narrow (<40% page width) // (single-column row like "Florida Bar Number:" that only appears on one side) // // Full-width paragraph lines fail both checks — they span the entire page. 
func spansBreak(line []TextItem, r breakRegion, pageWidth float64) bool { mid := (r.xMin + r.xMax) / 2 hasLeft := false hasRight := false gapAtBreak := 0.0 prevXMax := 0.0 minX := math.MaxFloat64 maxX := 0.0 for _, item := range line { if item.XMax < mid { hasLeft = true } else if item.X > mid { hasRight = true } else { // Item spans the break midpoint — bridges both columns hasLeft = true hasRight = true } if prevXMax > 0 { gw := item.X - prevXMax if prevXMax < r.xMax && item.X > r.xMin { gapAtBreak = gw } } prevXMax = item.XMax if item.X < minX { minX = item.X } if item.XMax > maxX { maxX = item.XMax } } // Case 1: items on both sides with significant gap at break if hasLeft && hasRight && gapAtBreak >= (r.xMax-r.xMin)*0.3 { return true } // Case 2: all content on exactly one side, narrow enough to be a column entry contentWidth := maxX - minX if contentWidth < pageWidth*0.4 && (hasLeft != hasRight) { return true } return false } // splitLine splits a line at break regions into left and right parts. // Uses midpoint of break region as the split boundary. func splitLine(line []TextItem, regions []breakRegion) (left, right []TextItem) { for _, item := range line { // Use midpoint of the primary break region mid := (regions[0].xMin + regions[0].xMax) / 2 if item.X < mid { left = append(left, item) } else { right = append(right, item) } } return left, right } // mergeSameColumnBlocks groups narrow blocks (width < 40% page width) that // share the same column layout AND are vertically proximate into combined // blocks. Lines from different columns at the same Y position are kept // separate so FlattenLines can detect column breaks. Wide blocks // (full-width paragraphs) are returned unchanged. 
func mergeSameColumnBlocks(blocks []BlockLines, pageWidth float64) []BlockLines { if len(blocks) == 0 { return blocks } // Sort blocks by Y position, then xMin for stable ordering slices.SortFunc(blocks, func(a, b BlockLines) int { if cmp.Compare(blockMinY(a), blockMinY(b)) != 0 { return cmp.Compare(blockMinY(a), blockMinY(b)) } return cmp.Compare(a.XMin, b.XMin) }) // Separate narrow (single-column) blocks from wide (full-width) blocks var narrow []BlockLines var wide []BlockLines for _, bl := range blocks { if blockMaxWidth(bl) < pageWidth*0.4 { narrow = append(narrow, bl) } else { wide = append(wide, bl) } } if len(narrow) == 0 { return blocks } // Cluster narrow blocks by Y proximity: only blocks within yClusterGap // of each other form a merge group. This prevents distant blocks (e.g., // signature area at Y=621) from merging with column sections (Y=439-567). yClusterGap := 30.0 // 30pt — enough to bridge inter-block gaps but not paragraphs // Track cluster Y ranges to avoid recomputing on each iteration type clusterRange struct { blocks []BlockLines yMin float64 yMax float64 } var clusters []clusterRange for _, bl := range narrow { yMin := blockMinY(bl) yMax := blockMaxY(bl) placed := false for ci := range clusters { if yMin <= clusters[ci].yMax+yClusterGap && yMax >= clusters[ci].yMin-yClusterGap { clusters[ci].blocks = append(clusters[ci].blocks, bl) if yMin < clusters[ci].yMin { clusters[ci].yMin = yMin } if yMax > clusters[ci].yMax { clusters[ci].yMax = yMax } placed = true break } } if !placed { clusters = append(clusters, clusterRange{ blocks: []BlockLines{bl}, yMin: yMin, yMax: yMax, }) } } // Process each cluster independently var result []BlockLines for _, cr := range clusters { clusterResult := mergeOneColumnCluster(cr.blocks, pageWidth) result = append(result, clusterResult...) 
} // Merge wide blocks into the result at proper Y positions for _, wb := range wide { inserted := false var sorted []BlockLines for _, r := range result { if !inserted && blockMinY(r) > blockMinY(wb) { sorted = append(sorted, wb) inserted = true } sorted = append(sorted, r) } if !inserted { sorted = append(sorted, wb) } result = sorted } return result } // blockMaxY returns the maximum Y of all items in a block. func blockMaxY(bl BlockLines) float64 { maxY := 0.0 for _, line := range bl.Lines { for _, item := range line { if item.Y > maxY { maxY = item.Y } } } return maxY } // mergeOneColumnCluster merges blocks in one Y-proximate cluster. // Only merges when there are 2+ columns with significant horizontal gaps // (median gap > 60pt). Otherwise returns blocks unchanged. func mergeOneColumnCluster(cluster []BlockLines, pageWidth float64) []BlockLines { // Check for multi-column layout (2+ columns with significant gap) xMins := make([]float64, len(cluster)) for i, bl := range cluster { xMins[i] = bl.XMin } sorted := make([]float64, len(xMins)) copy(sorted, xMins) for i := 1; i < len(sorted); i++ { for j := i; j > 0 && sorted[j] < sorted[j-1]; j-- { sorted[j], sorted[j-1] = sorted[j-1], sorted[j] } } clusters := clusterFloats(sorted, 20) if len(clusters) < 2 { return cluster // not multi-column } if medianGap(clusters) < 60 { return cluster // gaps too small for true columns } // Collect all lines from the cluster and merge lines at the same Y allLines := collectAllLines(cluster) mergedLines := mergeSameYLines(allLines, clusters, 8) return []BlockLines{{XMin: clusters[0], Lines: mergedLines}} } // collectAllLines gathers all lines from narrow blocks and sorts by Y. func collectAllLines(narrow []BlockLines) [][]TextItem { var all [][]TextItem for _, bl := range narrow { all = append(all, bl.Lines...) 
} slices.SortFunc(all, func(a, b []TextItem) int { return cmp.Compare(a[0].Y, b[0].Y) }) return all } // mergeSameYLines combines lines that share the same Y position (within tolerance) // into single lines. Each original line is treated as an atomic group — all items // from the same original line stay together. Groups are sorted by the leftmost X // of each group, so that FlattenLines sees proper multi-column lines. // Lines from different columns at the same Y are kept separate so that // splitBlockColumns can assign them to the correct column. func mergeSameYLines(lines [][]TextItem, columnXMins []float64, yTol float64) [][]TextItem { if len(lines) == 0 { return nil } // Group lines by Y position type yGroup struct { yRef float64 lines [][]TextItem } var groups []yGroup used := make([]bool, len(lines)) for i, line := range lines { if used[i] { continue } g := yGroup{yRef: line[0].Y, lines: [][]TextItem{line}} used[i] = true for j := i + 1; j < len(lines); j++ { if used[j] { continue } if math.Abs(lines[j][0].Y-line[0].Y) <= yTol { g.lines = append(g.lines, lines[j]) used[j] = true } } groups = append(groups, g) } // For each group with 2+ lines (from different columns), keep lines separate // but sort by X. Don't concatenate — splitBlockColumns will assign each line // to the correct column based on its X position. var result [][]TextItem for _, g := range groups { if len(g.lines) <= 1 { result = append(result, g.lines[0]) continue } // Sort lines by their leftmost X to determine column order slices.SortFunc(g.lines, func(a, b []TextItem) int { return cmp.Compare(minX(a), minX(b)) }) // Keep each line separate (don't concatenate across columns) for _, line := range g.lines { result = append(result, line) } } return result } // minX returns the minimum X in a line. 
func minX(line []TextItem) float64 { if len(line) == 0 { return 0 } m := line[0].X for _, it := range line { if it.X < m { m = it.X } } return m } // blockMaxWidth returns the maximum line width in a block. func blockMaxWidth(bl BlockLines) float64 { maxW := 0.0 for _, line := range bl.Lines { minX, maxX := line[0].X, line[0].XMax for _, it := range line { if it.X < minX { minX = it.X } if it.XMax > maxX { maxX = it.XMax } } if maxX-minX > maxW { maxW = maxX - minX } } return maxW } // blockMinY returns the minimum Y of all items in a block. func blockMinY(bl BlockLines) float64 { if len(bl.Lines) == 0 { return math.MaxFloat64 } return bl.Lines[0][0].Y } // medianGap returns the median gap between adjacent sorted values. func medianGap(sorted []float64) float64 { if len(sorted) < 2 { return 0 } gaps := make([]float64, len(sorted)-1) for i := 1; i < len(sorted); i++ { gaps[i-1] = sorted[i] - sorted[i-1] } for i := 1; i < len(gaps); i++ { for j := i; j > 0 && gaps[j] < gaps[j-1]; j-- { gaps[j], gaps[j-1] = gaps[j-1], gaps[j] } } if len(gaps)%2 == 0 { return (gaps[len(gaps)/2-1] + gaps[len(gaps)/2]) / 2 } return gaps[len(gaps)/2] } // clusterFloats groups sorted values into natural clusters where adjacent // values are within tol of each other. Returns the cluster midpoints. func clusterFloats(sorted []float64, tol float64) []float64 { if len(sorted) == 0 { return nil } var clusters [][]float64 var current []float64 for _, v := range sorted { if len(current) == 0 || v-current[len(current)-1] <= tol { current = append(current, v) } else { clusters = append(clusters, current) current = []float64{v} } } clusters = append(clusters, current) // Return cluster midpoints mids := make([]float64, len(clusters)) for i, c := range clusters { mids[i] = (c[0] + c[len(c)-1]) / 2 } return mids } // isMultiColumnBlock checks if a merged block has lines at 2+ distinct X // positions separated by significant gaps (indicating multi-column layout). // Uses two methods: // 1. 
Line-start clustering: groups lines by their leftmost X position // 2. Intra-line gap detection: finds large horizontal gaps within combined lines // Either method can trigger multi-column detection. func isMultiColumnBlock(bl BlockLines) bool { // Method 1: Check line-start X positions for distinct column groups. // A true multi-column block has lines that start at exactly 2 distinct X // positions separated by a significant gap (>=100pt). Blocks with 3+ // scattered line-start positions are full-width blocks with varied // indentation, not multi-column. xMins := make([]float64, 0, len(bl.Lines)) for _, line := range bl.Lines { if len(line) > 0 { xMins = append(xMins, line[0].X) } } sorted := make([]float64, len(xMins)) copy(sorted, xMins) for i := 1; i < len(sorted); i++ { for j := i; j > 0 && sorted[j] < sorted[j-1]; j-- { sorted[j], sorted[j-1] = sorted[j-1], sorted[j] } } clusters := clusterFloats(sorted, 30) if len(clusters) == 2 && clusters[1]-clusters[0] >= 100 { // Count lines per cluster — reject if one cluster has only 1 line. // A true multi-column layout has meaningful content in both columns. // Single-line outliers (e.g., "Dated:" at left margin followed by // right-aligned signature fields, or one indented continuation line) // are not multi-column layouts. c0, c1 := 0, 0 for _, x := range xMins { if x <= (clusters[0]+clusters[1])/2 { c0++ } else { c1++ } } if c0 >= 2 && c1 >= 2 { return true } } return false } // splitBlockColumns splits lines in a multi-column merged block into // separate columns. Handles both separate-column lines (each line belongs // to one column) and combined-column lines (items from multiple columns // on the same line, split at the column boundary). // Returns 2 columns ordered left to right. Each column is [][]TextItem. 
func splitBlockColumns(bl BlockLines) [][][]TextItem { if !isMultiColumnBlock(bl) { return [][][]TextItem{bl.Lines} } // Use line-start X clustering to find column boundaries, // consistent with isMultiColumnBlock Method 1. xMins := make([]float64, 0, len(bl.Lines)) for _, line := range bl.Lines { if len(line) > 0 { xMins = append(xMins, line[0].X) } } sorted := make([]float64, len(xMins)) copy(sorted, xMins) for i := 1; i < len(sorted); i++ { for j := i; j > 0 && sorted[j] < sorted[j-1]; j-- { sorted[j], sorted[j-1] = sorted[j-1], sorted[j] } } clusters := clusterFloats(sorted, 30) if len(clusters) < 2 { return [][][]TextItem{bl.Lines} } // Find the largest gap between clusters — this is the column boundary maxGap := 0.0 splitIdx := 0 for i := 1; i < len(clusters); i++ { gap := clusters[i] - clusters[i-1] if gap > maxGap { maxGap = gap splitIdx = i } } splitX := (clusters[splitIdx-1] + clusters[splitIdx]) / 2 // Split items within each line at the column boundary. // However, only split individual items when the line genuinely has // content from both columns. If all items are continuous text from one // column (e.g., "Termination of Parental Rights" where "Rights" at X=223 // crosses splitX=201), assign the entire line to the column of its start. var leftCol, rightCol [][]TextItem for _, line := range bl.Lines { if len(line) == 0 { continue } // Classify items by their X (left edge), not XMax, so bridging items // (e.g., "Contempt" at X=114, XMax=282 crossing splitX=201) are still // considered left-side items. hasLeft := false hasRight := false lastLeftXMax := 0.0 firstRightX := math.MaxFloat64 for _, item := range line { if item.X < splitX { hasLeft = true if item.XMax > lastLeftXMax { lastLeftXMax = item.XMax } } else { hasRight = true if item.X < firstRightX { firstRightX = item.X } } } if hasLeft && hasRight { // Check the actual gap between left and right items. // A real column break has a significant gap (e.g., 50+ pt). 
// Continuous text crossing splitX has a small gap (e.g., 3 pt // between "Contempt" at XMax=282 and "Proceedings" at X=285). gapAtSplit := firstRightX - lastLeftXMax if gapAtSplit > 50 { // Real gap — split items at boundary var leftLine, rightLine []TextItem for _, item := range line { if item.X < splitX { leftLine = append(leftLine, item) } else { rightLine = append(rightLine, item) } } if len(leftLine) > 0 { leftCol = append(leftCol, leftLine) } if len(rightLine) > 0 { rightCol = append(rightCol, rightLine) } } else { // Small gap — continuous text, assign to column of line start if hasLeft { leftCol = append(leftCol, line) } else { rightCol = append(rightCol, line) } } } else { // All items on one side — assign to that column if hasLeft { leftCol = append(leftCol, line) } else { rightCol = append(rightCol, line) } } } sortLinesByY(leftCol) sortLinesByY(rightCol) return [][][]TextItem{leftCol, rightCol} } // sortLinesByY sorts a slice of lines by their first item's Y coordinate. func sortLinesByY(lines [][]TextItem) { for i := 1; i < len(lines); i++ { for j := i; j > 0 && lines[j][0].Y < lines[j-1][0].Y; j-- { lines[j], lines[j-1] = lines[j-1], lines[j] } } } // uniqueSortedFloats returns sorted unique floats within tolerance. func uniqueSortedFloats(vals []float64, tol float64) []float64 { if len(vals) == 0 { return nil } sorted := make([]float64, len(vals)) copy(sorted, vals) slices.Sort(sorted) unique := []float64{sorted[0]} for i := 1; i < len(sorted); i++ { if sorted[i]-unique[len(unique)-1] > tol { unique = append(unique, sorted[i]) } } return unique }