package pdfcontent

import (
	"bytes"
	"encoding/xml"
	"fmt"
	"math"
	"os/exec"
	"strings"
)

// pdftotext -bbox-layout HTML output types

// PDFDocument is the decoding target for the document that
// `pdftotext -bbox-layout` writes: an <html> root whose <body> contains a
// <doc> element, which in turn holds the <page> elements.
type PDFDocument struct {
	XMLName xml.Name `xml:"html"`
	Body    struct {
		XMLName xml.Name `xml:"body"`
		Doc     struct {
			// Pages has one entry per <page> child of <doc>.
			Pages []Page `xml:"page"`
		} `xml:"doc"`
	} `xml:"body"`
}

// Page maps one <page> element: its width and height attributes and the
// <flow> elements nested inside it.
type Page struct {
	Width  float64 `xml:"width,attr"`
	Height float64 `xml:"height,attr"`
	Flows  []Flow  `xml:"flow"`
}

// Flow maps one <flow> element, which groups the <block> elements nested
// inside it.
type Flow struct {
	Blocks []Block `xml:"block"`
}

// Block maps one <block> element: its bounding box (xMin/yMin/xMax/yMax
// attributes) and the <line> elements nested inside it.
type Block struct {
	XMin  float64 `xml:"xMin,attr"`
	YMin  float64 `xml:"yMin,attr"`
	XMax  float64 `xml:"xMax,attr"`
	YMax  float64 `xml:"yMax,attr"`
	Lines []Line  `xml:"line"`
}

// Line maps one <line> element: its bounding box (xMin/yMin/xMax/yMax
// attributes) and the <word> elements nested inside it.
type Line struct {
	XMin  float64 `xml:"xMin,attr"`
	YMin  float64 `xml:"yMin,attr"`
	XMax  float64 `xml:"xMax,attr"`
	YMax  float64 `xml:"yMax,attr"`
	Words []Word  `xml:"word"`
}

// Word maps one <word> element: its bounding box (xMin/yMin/xMax/yMax
// attributes) and the element's character data as Text.
type Word struct {
	XMin float64 `xml:"xMin,attr"`
	YMin float64 `xml:"yMin,attr"`
	XMax float64 `xml:"xMax,attr"`
	YMax float64 `xml:"yMax,attr"`
	Text string  `xml:",chardata"`
}

// ExtractText runs `pdftotext -bbox-layout` on the PDF at pdfPath, writing
// the result to stdout, and returns the pages decoded from that output.
//
// stdout and stderr are captured in separate buffers. Only stdout is handed
// to the XML decoder, so diagnostics that pdftotext prints on stderr during
// an otherwise successful run cannot corrupt the document being parsed.
//
// An error is returned when the command cannot be started or exits
// unsuccessfully (the message includes whatever pdftotext wrote to stderr),
// or when its output cannot be decoded into a PDFDocument. The returned
// slice is nil whenever the error is non-nil.
func ExtractText(pdfPath string) ([]Page, error) {
	cmd := exec.Command("pdftotext", "-bbox-layout", pdfPath, "-")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("pdftotext: %w: %s", err, strings.TrimSpace(stderr.String()))
	}

	var doc PDFDocument
	if err := xml.Unmarshal(stdout.Bytes(), &doc); err != nil {
		return nil, fmt.Errorf("parsing pdftotext output: %w", err)
	}

	return doc.Body.Doc.Pages, nil
}

// TextItem is one piece of extracted text together with its position on the
// page and the bookkeeping needed after pages are flattened.
type TextItem struct {
	Text       string  // text content of the item
	X          float64 // x position of the item's start
	Y          float64 // y position of the item's start
	XMax       float64 // x position of the item's end
	YMax       float64 // y position of the item's end
	OrigY      float64 // original page Y before flattening, for form field matching
	FontSize   float64 // estimated from yMax - yMin
	Page       int     // page number the item belongs to
	PageHeight float64 // original page height, for page break detection
	OrigX      float64 // original page X before flattening
	FormField  bool    // true for injected form field markers
}

// blocksToItems flattens the words of every retained block on the page into
// one slice of TextItems, visiting flows, blocks and lines in document order.
//
// A block is skipped when its YMin is less than marginTop or its YMax is
// greater than page.Height-marginBottom. pageNum and page.Height are passed
// through to wordsToItems for each line.
func blocksToItems(page Page, pageNum int, marginTop, marginBottom float64) []TextItem {
	bottomLimit := page.Height - marginBottom
	var out []TextItem
	for fi := range page.Flows {
		blocks := page.Flows[fi].Blocks
		for bi := range blocks {
			blk := &blocks[bi]
			startsInTopMargin := blk.YMin < marginTop
			reachesBottomMargin := blk.YMax > bottomLimit
			if startsInTopMargin || reachesBottomMargin {
				continue
			}
			for li := range blk.Lines {
				out = append(out, wordsToItems(blk.Lines[li].Words, pageNum, page.Height)...)
			}
		}
	}
	return out
}

// blocksToLines converts the page's retained blocks into pre-grouped lines.
//
// It returns two views of the same data: every TextItem in document order,
// and the same items grouped as one []TextItem per source line. Lines for
// which wordsToItems yields nothing are omitted from both results. A block
// is skipped when its YMin is less than marginTop or its YMax is greater
// than page.Height-marginBottom.
func blocksToLines(page Page, pageNum int, marginTop, marginBottom float64) ([]TextItem, [][]TextItem) {
	var (
		flat    []TextItem
		grouped [][]TextItem
	)
	lowerBound := page.Height - marginBottom
	for _, fl := range page.Flows {
		for _, blk := range fl.Blocks {
			if inMargin := blk.YMin < marginTop || blk.YMax > lowerBound; inMargin {
				continue
			}
			for _, ln := range blk.Lines {
				converted := wordsToItems(ln.Words, pageNum, page.Height)
				if len(converted) == 0 {
					continue
				}
				grouped = append(grouped, converted)
				flat = append(flat, converted...)
			}
		}
	}
	return flat, grouped
}

// BlockLines holds lines from one pdftotext block, preserving the block's xMin.
type BlockLines struct {
	// XMin is the xMin of the block the lines came from.
	XMin float64
	// Lines holds one []TextItem per line of the block.
	Lines [][]TextItem
}

// blocksToGroupedLines converts the page's retained blocks into BlockLines,
// keeping block boundaries and each block's xMin so that blocks sharing a
// column layout can be merged before flattening.
//
// It returns every TextItem in document order alongside the per-block
// grouping. Lines for which wordsToItems yields nothing are dropped, and a
// block left with no lines produces no BlockLines entry.
func blocksToGroupedLines(page Page, pageNum int, marginTop, marginBottom float64) ([]TextItem, []BlockLines) {
	var flat []TextItem
	var grouped []BlockLines
	maxY := page.Height - marginBottom
	for _, fl := range page.Flows {
		for _, blk := range fl.Blocks {
			// Skip blocks that start inside the top margin.
			if blk.YMin < marginTop {
				continue
			}
			// Skip blocks that reach into the bottom margin. YMax is
			// used so that footers starting above the margin but
			// extending into it are caught as well.
			if blk.YMax > maxY {
				continue
			}
			entry := BlockLines{XMin: blk.XMin}
			for _, ln := range blk.Lines {
				if converted := wordsToItems(ln.Words, pageNum, page.Height); len(converted) > 0 {
					entry.Lines = append(entry.Lines, converted)
					flat = append(flat, converted...)
				}
			}
			if len(entry.Lines) > 0 {
				grouped = append(grouped, entry)
			}
		}
	}
	return flat, grouped
}

// wordsToItems converts the pdftotext words of one line into TextItems.
//
// pageNum and pageHeight are stored unchanged in each item's Page and
// PageHeight. Coordinates are floored, except where an underline adjustment
// (described below) produces a fractional estimate. A word is skipped when
// it is empty after trimming whitespace, or when it is a lone "," or "."
// whose floored xMin is greater than 500.
//
// The result is nil when no word survives.
func wordsToItems(words []Word, pageNum int, pageHeight float64) []TextItem {
	const (
		// rightEdgeX is the xMin beyond which a lone "," or "." is treated
		// as page-edge decoration instead of content.
		rightEdgeX = 500
		// glyphWidthRatio estimates one character's width as a fraction of
		// the word's height when the real text width is not known.
		glyphWidthRatio = 0.7
	)

	var items []TextItem
	for _, w := range words {
		originalText := strings.TrimSpace(w.Text)
		if originalText == "" {
			continue
		}
		// Skip standalone right-side PDF decoration (commas, periods at page
		// edge) that appear on separate <line> elements from the main content.
		origX := math.Floor(w.XMin)
		if (originalText == "," || originalText == ".") && origX > rightEdgeX {
			continue
		}
		// Strip leading/trailing underscores and punctuation from pdftotext
		// output. pdftotext merges form underline decorations with adjacent
		// text, producing "business}___," or "___{state}" lines.
		text, leadingUnderscores, wasWide := stripUnderlineDecoration(originalText)
		if text == "" && wasWide {
			// Keep underscore-only words with original text for gap bridging
			// in FlattenLines. findLineGaps uses isUnderscoreText to detect
			// these as bridges between text segments on the same logical line,
			// preventing false column break detection. removeFieldLinesFormatRule
			// will strip them during markdown rendering.
			text = originalText
			leadingUnderscores = 0
			wasWide = false
		}
		if text == "" {
			continue
		}

		// When leading underscores are stripped, shift the start rightward to
		// approximate where the actual text begins (after the underline).
		// The per-character width is the bounding box width divided by the
		// rune count of the trimmed text, the same text the underscore count
		// was taken from.
		x := origX
		if leadingUnderscores > 0 {
			if totalChars := len([]rune(originalText)); totalChars > 0 {
				charW := (w.XMax - w.XMin) / float64(totalChars)
				x += charW * float64(leadingUnderscores)
			}
		}

		height := w.YMax - w.YMin
		xMax := math.Floor(w.XMax)
		// When the original text was wide due to underscores, reduce XMax to
		// approximate the actual text width. This prevents false column
		// break detection. The estimate is only applied when it is smaller
		// than the real right edge: the stripped text can never extend past
		// the box pdftotext reported for the whole word.
		if wasWide {
			estimated := x + float64(len([]rune(text)))*height*glyphWidthRatio
			if estimated < xMax {
				xMax = estimated
			}
		}

		y := math.Floor(w.YMin)
		items = append(items, TextItem{
			Text:       text,
			X:          x,
			Y:          y,
			XMax:       xMax,
			YMax:       math.Floor(w.YMax),
			OrigX:      x,
			OrigY:      y,
			FontSize:   height,
			Page:       pageNum,
			PageHeight: pageHeight,
		})
	}
	return items
}

// stripUnderlineDecoration removes form-underline decoration that pdftotext
// merges into adjacent text.
//
// It returns the cleaned text, the number of leading underscores removed,
// and whether the input was treated as widened by underscores:
//
//   - With two or fewer underscores the text is returned untouched as
//     (text, 0, false).
//   - When isMostlyUnderscores reports true, every underscore is removed,
//     then trailing "," / "." and surrounding whitespace; the leading count
//     is reported as 0. The cleaned text may be empty.
//   - Otherwise leading underscores are trimmed and counted, and any trailing
//     run of underscores, commas and periods is trimmed as a whole, so
//     "business}___," becomes "business}" instead of keeping the underscores
//     that sit in front of the punctuation.
func stripUnderlineDecoration(text string) (string, int, bool) {
	if strings.Count(text, "_") <= 2 {
		return text, 0, false
	}

	if isMostlyUnderscores(text) {
		cleaned := strings.ReplaceAll(text, "_", "")
		cleaned = strings.TrimRight(cleaned, ",.")
		return strings.TrimSpace(cleaned), 0, true
	}

	trimmed := strings.TrimLeft(text, "_")
	leading := len(text) - len(trimmed)
	// Trim underscores and punctuation together: trimming them one after the
	// other leaves underscores behind whenever punctuation follows them.
	trimmed = strings.TrimRight(trimmed, "_,.")
	return trimmed, leading, true
}

// isMostlyUnderscores reports whether strictly more than 80% of the runes in
// s are underscores. Strings shorter than three bytes never qualify.
func isMostlyUnderscores(s string) bool {
	const (
		minBytes  = 3
		threshold = 0.8
	)
	if len(s) < minBytes {
		return false
	}
	total := len([]rune(s))
	underscores := strings.Count(s, "_")
	return float64(underscores)/float64(total) > threshold
}

// pageText holds items for one page.
type pageText struct {
	// PageNum identifies the page the items belong to.
	PageNum int
	// Items are the text items collected for the page.
	Items []TextItem
	// Width and Height are the page dimensions — presumably copied from
	// Page.Width/Page.Height; confirm against the code that populates them.
	Width  float64
	Height float64
}

// filterMarginWords returns the words that lie fully between the top and
// bottom margins: a word is kept when its YMin is at least marginTop and its
// YMax is at most pageHeight-marginBottom. Input order is preserved and the
// result is nil when nothing is kept.
//
// Only the vertical margins are checked; pageWidth is accepted but not used.
func filterMarginWords(words []Word, pageWidth, pageHeight, marginTop float64, marginBottom float64) []Word {
	lowestAllowed := pageHeight - marginBottom
	var kept []Word
	for i := range words {
		insideMargins := words[i].YMin >= marginTop && words[i].YMax <= lowestAllowed
		if !insideMargins {
			continue
		}
		kept = append(kept, words[i])
	}
	return kept
}