package pdfprocessor

import (
	"fmt"
	"os"
	"strings"
	"sync/atomic"

	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
)

// FormFieldInfo represents a PDF form field extracted from the document.
type FormFieldInfo struct {
	Name   string  // PDF field name, or a synthetic __unnamed_N name for unnamed fields
	Type   string  // mapped field type: "text", "button", "choice" or "signature"
	PosX   float64 // X position, taken from the widget's /Rect entry
	PosY   float64 // Y position, taken from the widget's /Rect entry
	Width  float64 // horizontal extent of /Rect
	Height float64 // vertical extent of /Rect
	Page   int     // 1-based page number the widget annotation was found on
	Ident  string  // indirect reference string, unique per widget annotation
}

// unnamedCounter numbers the synthetic __unnamed_N field names.
//
// It is package-level state that ExtractFormFields resets on every call, so
// overlapping calls from different goroutines share (and reset) the same
// sequence. The atomic operations prevent a data race on the variable but do
// not make the numbering per-call.
var unnamedCounter int32

// ExtractFormFields extracts form fields from the PDF at pdfPath using pdfcpu.
// Fields without names are included and receive synthetic __unnamed_N names.
//
// It returns an error if the file cannot be opened or read, if the document
// has no AcroForm /Fields array, or if no widget annotation yields a field.
//
// Not safe for concurrent use: every call resets the shared unnamedCounter,
// so concurrent calls can produce clashing or unstable __unnamed_N names.
func ExtractFormFields(pdfPath string) ([]FormFieldInfo, error) {
	// Reset counter so __unnamed_N names are stable across calls
	atomic.StoreInt32(&unnamedCounter, 0)

	f, err := os.Open(pdfPath)
	if err != nil {
		return nil, fmt.Errorf("opening PDF: %w", err)
	}
	defer f.Close()

	conf := model.NewDefaultConfiguration()
	ctx, err := api.ReadValidateAndOptimize(f, conf)
	if err != nil {
		return nil, fmt.Errorf("reading PDF: %w", err)
	}
	xref := ctx.XRefTable

	fields, err := formFieldsArray(xref)
	if err != nil {
		return nil, fmt.Errorf("getting form fields array: %w", err)
	}

	var result []FormFieldInfo
	for page := 1; page <= xref.PageCount; page++ {
		// Indexing a missing page entry yields an empty annotation map, so a
		// single lookup covers both "no annotations" and "no widgets".
		wAnnots, ok := xref.PageAnnots[page][model.AnnWidget]
		if !ok {
			continue
		}

		pageFields, err := extractPageFields(xref, wAnnots, fields, page)
		if err != nil {
			return nil, fmt.Errorf("extracting fields for page %d: %w", page, err)
		}
		result = append(result, pageFields...)
	}

	if len(result) == 0 {
		return nil, fmt.Errorf("no form fields found in PDF")
	}
	return result, nil
}

// formFieldsArray extracts the /Fields array from the PDF's AcroForm.
//
// The catalog's /AcroForm entry may be either an indirect reference or a
// dictionary stored directly in the catalog; both are accepted.
func formFieldsArray(xref *model.XRefTable) (types.Array, error) {
	catalog, err := xref.Catalog()
	if err != nil {
		return nil, err
	}

	acroFormObj, ok := catalog.Find("AcroForm")
	if !ok {
		return nil, fmt.Errorf("no AcroForm in PDF catalog")
	}

	// DereferenceDict resolves indirect references and passes direct
	// dictionaries through, so no type assertion on the entry is needed.
	acroForm, err := xref.DereferenceDict(acroFormObj)
	if err != nil {
		return nil, fmt.Errorf("dereferencing AcroForm: %w", err)
	}

	fieldsObj, ok := acroForm.Find("Fields")
	if !ok {
		return nil, fmt.Errorf("no Fields array in AcroForm")
	}

	fieldsArray, err := xref.DereferenceArray(fieldsObj)
	if err != nil {
		return nil, fmt.Errorf("dereferencing Fields array: %w", err)
	}
	return fieldsArray, nil
}

// extractPageFields extracts form field info from widget annotations on a single page.
// Named fields are deduplicated by name; unnamed fields by indirect reference.
func extractPageFields(xref *model.XRefTable, wAnnots model.Annot, fields types.Array, page int) ([]FormFieldInfo, error) { var result []FormFieldInfo seenNamed := make(map[string]bool) seenUnnamed := make(map[string]bool) indRefs := wAnnots.IndRefs if indRefs == nil { return result, nil } for _, indRef := range *indRefs { d, err := xref.DereferenceDict(indRef) if err != nil || len(d) == 0 { continue } info, err := fieldInfoFromWidget(xref, d, indRef, fields) if err != nil { continue } // Dedup: named fields by name, unnamed fields by indirect ref if info.Name != "" && !strings.HasPrefix(info.Name, "__unnamed_") { if seenNamed[info.Name] { continue } seenNamed[info.Name] = true } else { if seenUnnamed[info.Ident] { continue } seenUnnamed[info.Ident] = true } info.Page = page result = append(result, info) } return result, nil } // fieldInfoFromWidget extracts field name, type, rect, and ident from a widget annotation dict func fieldInfoFromWidget(xref *model.XRefTable, d types.Dict, indRef types.IndirectRef, fields types.Array) (FormFieldInfo, error) { var info FormFieldInfo info.Ident = indRef.String() // Extract field name from /T entry info.Name = extractFieldName(d, xref, indRef, fields) // Generate synthetic name for unnamed fields if info.Name == "" { idx := atomic.AddInt32(&unnamedCounter, 1) info.Name = fmt.Sprintf("__unnamed_%d", idx) } // Extract field type from /FT entry info.Type = extractFieldType(d, xref, indRef) // Extract rect from /Rect entry rect := extractRect(d) if rect != nil { info.PosX = rect.LL.X info.PosY = rect.LL.Y info.Width = rect.UR.X - rect.LL.X info.Height = rect.UR.Y - rect.LL.Y } return info, nil } // extractFieldName extracts the field name, walking up parent chain if needed func extractFieldName(d types.Dict, xref *model.XRefTable, indRef types.IndirectRef, fields types.Array) string { // Try /T directly on this dict if name := tryExtractName(d); name != "" { return name } // Walk up parent chain parentRef := 
d.IndirectRefEntry("Parent") for parentRef != nil { parent, err := xref.DereferenceDict(*parentRef) if err != nil || len(parent) == 0 { break } if name := tryExtractName(parent); name != "" { // For Btn fields (radio groups, checkboxes), build fully qualified name if ft := parent.NameEntry("FT"); ft != nil && *ft == "Btn" { return fullyQualifiedName(xref, indRef, parent) } return name } parentRef = parent.IndirectRefEntry("Parent") } return "" } // tryExtractName attempts to read the /T (title) entry from a dict func tryExtractName(d types.Dict) string { // Try as string literal or hex string (most common: /T (FieldName)) if name, err := d.StringOrHexLiteralEntry("T"); err == nil && name != nil { return *name } // Try as PDF name (e.g., /T /FieldName) if name := d.NameEntry("T"); name != nil { return *name } return "" } // fullyQualifiedName builds the full dotted field name for button fields // by traversing the parent hierarchy func fullyQualifiedName(xref *model.XRefTable, indRef types.IndirectRef, topParent types.Dict) string { var parts []string // Collect names from child up to (but not including) topParent currentRef := &indRef for currentRef != nil { d, err := xref.DereferenceDict(*currentRef) if err != nil || len(d) == 0 { break } if name := tryExtractName(d); name != "" { parts = append([]string{name}, parts...) 
} parentRef := d.IndirectRefEntry("Parent") if parentRef == nil { break } // Check if parent is the top parent (has /FT) parentDict, err := xref.DereferenceDict(*parentRef) if err != nil || parentDict.NameEntry("FT") == nil { currentRef = parentRef continue } break } if len(parts) > 0 { return strings.Join(parts, ".") } return "" } // extractFieldType extracts and maps the field type func extractFieldType(d types.Dict, xref *model.XRefTable, indRef types.IndirectRef) string { // Try /FT directly if ft := d.NameEntry("FT"); ft != nil { return mapFieldType(*ft) } // Walk up parent chain parentRef := d.IndirectRefEntry("Parent") for parentRef != nil { parent, err := xref.DereferenceDict(*parentRef) if err != nil || len(parent) == 0 { break } if ft := parent.NameEntry("FT"); ft != nil { return mapFieldType(*ft) } parentRef = parent.IndirectRefEntry("Parent") } return "text" } // extractRect extracts the rectangle from a dict's /Rect entry func extractRect(d types.Dict) *types.Rectangle { rectObj, ok := d.Find("Rect") if !ok { return nil } arr, ok := rectObj.(types.Array) if !ok { return nil } rect := types.RectForArray(arr) if rect == nil { return nil } return rect } // mapFieldType maps PDF field type names to human-readable types func mapFieldType(ft string) string { switch ft { case "Tx": return "text" case "Btn": return "button" case "Ch": return "choice" case "Sig": return "signature" default: return "text" } }