package handlers

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"pdf-wizard/internal/db"
	"pdf-wizard/internal/htmlform"
	"pdf-wizard/internal/llm"
	"pdf-wizard/internal/models"
	"pdf-wizard/internal/pdfcontent"
	"pdf-wizard/internal/pdfprocessor"
	"pdf-wizard/internal/shortcode"
)

// ligatureReplacer maps the Unicode "Alphabetic Presentation Forms" ligatures
// U+FB00..U+FB06 (Latin) and U+FB13..U+FB17 (Armenian) to their constituent
// letters. It is built once; a strings.Replacer is safe for concurrent use.
var ligatureReplacer = strings.NewReplacer(
	"\ufb00", "ff", // LATIN SMALL LIGATURE FF
	"\ufb01", "fi", // LATIN SMALL LIGATURE FI
	"\ufb02", "fl", // LATIN SMALL LIGATURE FL
	"\ufb03", "ffi", // LATIN SMALL LIGATURE FFI
	"\ufb04", "ffl", // LATIN SMALL LIGATURE FFL
	"\ufb05", "st", // LATIN SMALL LIGATURE LONG S T (long s folded to plain s)
	"\ufb06", "st", // LATIN SMALL LIGATURE ST
	"\ufb13", "\u0574\u0576", // ARMENIAN SMALL LIGATURE MEN NOW
	"\ufb14", "\u0574\u0565", // ARMENIAN SMALL LIGATURE MEN ECH
	"\ufb15", "\u0574\u056b", // ARMENIAN SMALL LIGATURE MEN INI
	"\ufb16", "\u057e\u0576", // ARMENIAN SMALL LIGATURE VEW NOW
	"\ufb17", "\u0574\u056d", // ARMENIAN SMALL LIGATURE MEN XEH
)

// normalizeLigatures replaces common Unicode ligatures with their decomposed
// equivalents so browsers don't choke on fi/fl/ff/ffi/ffl characters from PDF
// extraction. Text without ligatures is returned unchanged.
func normalizeLigatures(s string) string {
	return ligatureReplacer.Replace(s)
}

// Handler holds the dependencies shared by the HTTP handlers in this file.
type Handler struct {
	db      *sql.DB     // database handle passed to the db package helpers
	dataDir string      // root data directory
	pdfDir  string      // dataDir/pdfs, where downloaded PDFs are stored
	llm     *llm.Client // optional; nil when no LLM client could be created
}

// NewHandler creates dataDir and its "pdfs" subdirectory (mode 0755) and
// returns a Handler bound to db. An LLM client is attached only when
// llm.HasOpenAIKey reports true and llm.NewClient succeeds; otherwise a
// warning is logged and the handler is still returned.
func NewHandler(db *sql.DB, dataDir string) (*Handler, error) {
	pdfDir := filepath.Join(dataDir, "pdfs")
	for _, dir := range []string{dataDir, pdfDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return nil, fmt.Errorf("creating directory %s: %w", dir, err)
		}
	}

	// LLM client is optional — processing works without it.
	var lclient *llm.Client
	if llm.HasOpenAIKey() {
		cl, err := llm.NewClient()
		if err == nil {
			lclient = cl
			log.Printf("LLM client initialized (model: %s)", os.Getenv("OPENAI_MODEL"))
		} else {
			log.Printf("Warning: LLM client not available: %v", err)
		}
	} else {
		log.Printf("Warning: OPENAI_API_KEY not set — LLM processing disabled")
	}

	return &Handler{
		db:      db,
		dataDir: dataDir,
		pdfDir:  pdfDir,
		llm:     lclient,
	}, nil
}

// pdfIDFromQuery reads the "id" query parameter as a base-10 integer.
// When the parameter is missing or malformed it writes a 400 response and
// returns ok == false, so callers only need to return.
//
// strconv.Atoi is used rather than fmt.Sscanf("%d") because Sscanf stops at
// the first non-digit and would accept inputs such as "12abc".
func pdfIDFromQuery(w http.ResponseWriter, r *http.Request) (int, bool) {
	idStr := r.URL.Query().Get("id")
	if idStr == "" {
		http.Error(w, "missing id parameter", http.StatusBadRequest)
		return 0, false
	}
	pdfID, err := strconv.Atoi(idStr)
	if err != nil {
		http.Error(w, "invalid id", http.StatusBadRequest)
		return 0, false
	}
	return pdfID, true
}

// writeAddPDFFailure sends an AddPDFResponse with Status "failed" and the
// given error message.
func writeAddPDFFailure(w http.ResponseWriter, status int, msg string) {
	writeJSON(w, status, models.AddPDFResponse{
		Status: "failed",
		Error:  msg,
	})
}

// PDF handles GET and POST requests. With an "id" query parameter it returns
// the stored PDF record; otherwise the "url" parameter is required and the PDF
// at that URL is added and processed. "id" takes precedence when both are set.
func (h *Handler) PDF(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet && r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	query := r.URL.Query()

	// By ID: view existing PDF.
	if idStr := query.Get("id"); idStr != "" {
		pdfID, err := strconv.Atoi(idStr)
		if err != nil {
			http.Error(w, "invalid id", http.StatusBadRequest)
			return
		}
		h.viewPDF(w, pdfID)
		return
	}

	// By URL: add/process PDF.
	rawURL := query.Get("url")
	if rawURL == "" {
		http.Error(w, "missing url or id parameter", http.StatusBadRequest)
		return
	}
	h.addPDF(w, rawURL)
}

// addPDF validates that the URL's host belongs to a known organization,
// downloads the PDF, and either returns the existing record with the same MD5
// or creates a new record and starts background processing. All failures are
// reported as a JSON AddPDFResponse with Status "failed".
func (h *Handler) addPDF(w http.ResponseWriter, urlStr string) {
	// Step 1: Validate domain.
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		writeAddPDFFailure(w, http.StatusBadRequest, fmt.Sprintf("invalid URL: %v", err))
		return
	}

	domain := parsedURL.Hostname()
	orgID, err := db.ResolveOrgByDomain(h.db, domain)
	if err != nil {
		log.Printf("Error resolving org for domain %s: %v", domain, err)
		writeAddPDFFailure(w, http.StatusInternalServerError, err.Error())
		return
	}
	if orgID == nil {
		writeAddPDFFailure(w, http.StatusBadRequest,
			fmt.Sprintf("domain %s is not associated with any organization", domain))
		return
	}

	// Step 2: Download PDF and compute MD5.
	pdfPath, md5Hash, err := pdfprocessor.DownloadPDF(urlStr, h.pdfDir)
	if err != nil {
		log.Printf("Error downloading PDF: %v", err)
		writeAddPDFFailure(w, http.StatusBadRequest, fmt.Sprintf("download failed: %v", err))
		return
	}

	// Step 3: Check if already processed.
	existing, err := db.GetPDFByMD5(h.db, md5Hash)
	if err != nil {
		log.Printf("Error checking existing PDF: %v", err)
		writeAddPDFFailure(w, http.StatusInternalServerError, err.Error())
		return
	}
	if existing != nil {
		h.viewPDF(w, existing.ID)
		return
	}

	// Step 4: Create DB record.
	pdf, err := db.CreatePDF(h.db, urlStr, md5Hash, orgID)
	if err != nil {
		log.Printf("Error creating PDF record: %v", err)
		writeAddPDFFailure(w, http.StatusInternalServerError, err.Error())
		return
	}

	// Step 5: Set status to processing and kick off background work.
	db.UpdatePDFStatus(h.db, pdf.ID, "processing")
	go h.processPDF(pdf.ID, pdfPath)

	writeJSON(w, http.StatusOK, models.AddPDFResponse{
		ID:     pdf.ID,
		Status: "processing",
	})
}

// Markdown handles GET requests and returns the content extracted from the
// stored PDF file identified by the "id" query parameter, as plain text with
// ligatures normalized. It always extracts from the file and does not consult
// the record's CustomMarkdown.
func (h *Handler) Markdown(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	pdfID, ok := pdfIDFromQuery(w, r)
	if !ok {
		return
	}

	pdf, err := db.GetPDF(h.db, pdfID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusNotFound)
		return
	}

	pdfPath := filepath.Join(h.pdfDir, fmt.Sprintf("%s.pdf", pdf.MD5Hash))
	if _, err := os.Stat(pdfPath); err != nil {
		http.Error(w, "PDF file not found", http.StatusNotFound)
		return
	}

	markdown, err := pdfcontent.ExtractPDFContent(pdfPath)
	if err != nil {
		log.Printf("PDF %d: content extraction failed: %v", pdfID, err)
		http.Error(w, fmt.Sprintf("extraction failed: %v", err), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Write([]byte(normalizeLigatures(markdown)))
}

// ViewPage handles GET requests and serves frontend/dist/index.html. The "url"
// query parameter must be present, but its value is not otherwise used here.
func (h *Handler) ViewPage(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	if r.URL.Query().Get("url") == "" {
		http.Error(w, "missing url parameter", http.StatusBadRequest)
		return
	}

	http.ServeFile(w, r, "frontend/dist/index.html")
}

// viewPDF writes the PDF record and its form fields as a JSON ViewPDFResponse.
// A failed record lookup yields a 404. A failed form-field lookup is logged
// and the response is still sent with whatever fields were returned.
func (h *Handler) viewPDF(w http.ResponseWriter, pdfID int) {
	pdf, err := db.GetPDF(h.db, pdfID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusNotFound)
		return
	}

	// Form fields are best-effort: report the record even if they can't be loaded.
	fields, err := db.GetFormFields(h.db, pdfID)
	if err != nil {
		log.Printf("PDF %d: loading form fields failed: %v", pdfID, err)
	}

	writeJSON(w, http.StatusOK, models.ViewPDFResponse{
		ID:         pdf.ID,
		URL:        pdf.URL,
		Status:     pdf.Status,
		LLMStatus:  pdf.LLMStatus,
		FormFields: fields,
		CreatedAt:  pdf.CreatedAt,
	})
}

// processPDF runs the background pipeline for a newly added PDF: it extracts
// the form fields, marks the record "completed" (or "failed" on an extraction
// error or panic), and then starts LLM processing in a separate goroutine.
func (h *Handler) processPDF(pdfID int, pdfPath string) {
	defer func() {
		if rec := recover(); rec != nil {
			log.Printf("Panic processing PDF %d: %v", pdfID, rec)
			db.UpdatePDFStatus(h.db, pdfID, "failed")
		}
	}()

	// Step 1: Extract form fields. The result is only counted here; FillPDF
	// re-extracts the positions when it needs them.
	log.Printf("Extracting form fields from PDF (id=%d)", pdfID)
	rawFields, err := pdfprocessor.ExtractFormFields(pdfPath)
	if err != nil {
		log.Printf("PDF %d: extract fields failed: %v", pdfID, err)
		db.UpdatePDFStatus(h.db, pdfID, "failed")
		return
	}
	log.Printf("Found %d form fields", len(rawFields))

	// Step 2: Mark as completed (basic extraction done).
	db.UpdatePDFStatus(h.db, pdfID, "completed")

	// Step 3: LLM processing (non-blocking, doesn't fail the PDF).
	go func() {
		// The deferred recover above does not cover this goroutine; without
		// its own recover a panic here would take down the whole process.
		defer func() {
			if rec := recover(); rec != nil {
				log.Printf("Panic during LLM processing of PDF %d: %v", pdfID, rec)
			}
		}()

		pdf, err := db.GetPDF(h.db, pdfID)
		if err != nil {
			log.Printf("PDF %d: failed to load for LLM processing: %v", pdfID, err)
			return
		}
		RunLLMProcessing(h.db, pdf, pdfPath)
	}()
}

// splitLinesEvenly splits text on "\n" and distributes the lines, in order,
// over parts chunks. Each chunk gets len(lines)/parts lines and the first
// len(lines)%parts chunks get one extra. Chunks with no lines are "".
// It returns nil when parts <= 0.
func splitLinesEvenly(text string, parts int) []string {
	if parts <= 0 {
		return nil
	}
	lines := strings.Split(text, "\n")
	base, extra := len(lines)/parts, len(lines)%parts

	chunks := make([]string, parts)
	start := 0
	for i := range chunks {
		end := start + base
		if i < extra {
			end++
		}
		chunks[i] = strings.Join(lines[start:end], "\n")
		start = end
	}
	return chunks
}

// FillPDF handles POST requests. The body is a JSON array of {"id", "value"}
// objects, where id is a field index. The handler resolves field types from
// the shortcodes in the PDF's markdown (custom markdown if set, otherwise
// freshly extracted), spreads the value of each "multiline" group across the
// group's fields, and responds with the filled PDF bytes.
//
// Responses: 400 for bad input or unknown/out-of-range field indices, 404 when
// the record can't be loaded, 409 when the PDF status is not "completed", and
// 500 when extraction or filling fails.
func (h *Handler) FillPDF(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	pdfID, ok := pdfIDFromQuery(w, r)
	if !ok {
		return
	}

	pdf, err := db.GetPDF(h.db, pdfID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusNotFound)
		return
	}

	if pdf.Status != "completed" {
		writeJSON(w, http.StatusConflict, map[string]string{
			"error": fmt.Sprintf("PDF is not ready (status: %s)", pdf.Status),
		})
		return
	}

	// Parse request body.
	var values []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	}
	if err := json.NewDecoder(r.Body).Decode(&values); err != nil {
		http.Error(w, "invalid JSON body", http.StatusBadRequest)
		return
	}

	// Find original PDF path.
	pdfPath := filepath.Join(h.pdfDir, fmt.Sprintf("%s.pdf", pdf.MD5Hash))

	// Get markdown (custom or extracted) to parse shortcodes.
	var markdown string
	if pdf.CustomMarkdown != nil && *pdf.CustomMarkdown != "" {
		markdown = *pdf.CustomMarkdown
	} else {
		markdown, err = pdfcontent.ExtractPDFContent(pdfPath)
		if err != nil {
			log.Printf("PDF %d: extraction for fill failed: %v", pdfID, err)
			writeJSON(w, http.StatusInternalServerError, map[string]string{
				"error": fmt.Sprintf("extraction failed: %v", err),
			})
			return
		}
	}

	// Re-extract raw field positions from PDF.
	rawFields, err := pdfprocessor.ExtractFormFields(pdfPath)
	if err != nil {
		log.Printf("PDF %d: re-extract fields for fill failed: %v", pdfID, err)
		writeJSON(w, http.StatusInternalServerError, map[string]string{
			"error": fmt.Sprintf("extracting field positions: %v", err),
		})
		return
	}

	// Build default field types from raw PDF extraction.
	rawDefaultTypes := make(map[int]string, len(rawFields))
	for i, rf := range rawFields {
		rawDefaultTypes[i] = rf.Type
	}

	// Parse field shortcodes from markdown, with raw PDF types as fallback.
	scFields := shortcode.GetFieldsWithDefaults(markdown, rawDefaultTypes)
	if len(scFields) == 0 {
		http.Error(w, "no form fields found in markdown", http.StatusBadRequest)
		return
	}

	// Index shortcode fields by idx and collect group membership in one pass.
	scFieldMap := make(map[int]*shortcode.Field, len(scFields))
	groupFields := make(map[string][]int) // group name -> field indices
	for _, f := range scFields {
		scFieldMap[f.Idx] = f
		if f.Group != "" {
			groupFields[f.Group] = append(groupFields[f.Group], f.Idx)
		}
	}

	// Build field values map (idx -> value).
	fieldValues := make(map[int]string, len(values))
	for _, v := range values {
		fieldValues[v.ID] = v.Value
	}

	// Convert multiline group values: the first field of the group carries the
	// combined text, which is split by line across all fields of the group.
	for _, indices := range groupFields {
		if len(indices) < 2 {
			continue
		}
		types := make([]string, 0, len(indices))
		for _, idx := range indices {
			if f, ok := scFieldMap[idx]; ok {
				types = append(types, f.Type)
			}
		}
		if inferGroupTypeFromTypes(types) != "multiline" {
			continue
		}

		combined := fieldValues[indices[0]]
		if combined == "" {
			// Nothing to distribute; blank out the remaining fields.
			for _, idx := range indices[1:] {
				fieldValues[idx] = ""
			}
			continue
		}
		for i, chunk := range splitLinesEvenly(combined, len(indices)) {
			fieldValues[indices[i]] = chunk
		}
	}

	// Visit indices in ascending order so the fill list, and the error
	// reported for a bad index, do not depend on map iteration order.
	indices := make([]int, 0, len(fieldValues))
	for idx := range fieldValues {
		indices = append(indices, idx)
	}
	sort.Ints(indices)

	// Build fill field list.
	fillFields := make([]pdfprocessor.FillField, 0, len(indices))
	for _, idx := range indices {
		sf, ok := scFieldMap[idx]
		if !ok {
			http.Error(w, fmt.Sprintf("field idx %d not found", idx), http.StatusBadRequest)
			return
		}
		// idx maps to raw field position; reject negatives too, which would
		// otherwise panic on the slice access below.
		if idx < 0 || idx >= len(rawFields) {
			http.Error(w, fmt.Sprintf("field idx %d out of range", idx), http.StatusBadRequest)
			return
		}
		rf := rawFields[idx]
		fillFields = append(fillFields, pdfprocessor.FillField{
			Name:   rf.Name,
			Type:   sf.Type,
			Value:  fieldValues[idx],
			PosX:   rf.PosX,
			PosY:   rf.PosY,
			Width:  rf.Width,
			Height: rf.Height,
			Page:   rf.Page,
		})
	}

	// Fill the PDF.
	filledBytes, err := pdfprocessor.FillPDF(pdfPath, fillFields)
	if err != nil {
		log.Printf("PDF %d: fill failed: %v", pdfID, err)
		writeJSON(w, http.StatusInternalServerError, map[string]string{
			"error": fmt.Sprintf("filling PDF failed: %v", err),
		})
		return
	}

	w.Header().Set("Content-Type", "application/pdf")
	w.Header().Set("Content-Disposition", fmt.Sprintf("inline; filename=filled_pdf_%d.pdf", pdfID))
	w.Write(filledBytes)
}

// buildDefaultFieldTypes extracts the form fields from the PDF at pdfPath and
// returns a map from each field's position in the extraction result to its
// type. It returns a nil map and the error when extraction fails.
func buildDefaultFieldTypes(pdfPath string) (htmlform.DefaultFieldTypes, error) {
	rawFields, err := pdfprocessor.ExtractFormFields(pdfPath)
	if err != nil {
		return nil, err
	}
	defaults := make(htmlform.DefaultFieldTypes)
	for i, rf := range rawFields {
		defaults[i] = rf.Type
	}
	return defaults, nil
}

// inferGroupTypeFromTypes derives a group type from its members' field types.
// It returns "" unless there are at least two members and all share one type.
// Text-like types map to "multiline", "button" to "checkbox_group", and
// "radio" to "radio_group"; any other shared type yields "".
func inferGroupTypeFromTypes(types []string) string {
	if len(types) < 2 {
		return ""
	}

	firstType := types[0]
	for _, t := range types[1:] {
		if t != firstType {
			return ""
		}
	}

	switch firstType {
	case "text", "email", "tel", "number", "url", "date", "textarea":
		return "multiline"
	case "button":
		return "checkbox_group"
	case "radio":
		return "radio_group"
	}
	return ""
}

// ExtractContent handles GET requests and returns the PDF's markdown as plain
// text with ligatures normalized. The record's CustomMarkdown is used when it
// is non-empty; otherwise the content is extracted from the stored PDF file.
func (h *Handler) ExtractContent(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	pdfID, ok := pdfIDFromQuery(w, r)
	if !ok {
		return
	}

	pdf, err := db.GetPDF(h.db, pdfID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusNotFound)
		return
	}

	// Use LLM-processed CustomMarkdown (has labels) when available.
	var markdown string
	if pdf.CustomMarkdown != nil && *pdf.CustomMarkdown != "" {
		markdown = *pdf.CustomMarkdown
	} else {
		pdfPath := filepath.Join(h.pdfDir, fmt.Sprintf("%s.pdf", pdf.MD5Hash))
		if _, statErr := os.Stat(pdfPath); statErr != nil {
			http.Error(w, "PDF file not found", http.StatusNotFound)
			return
		}
		markdown, err = pdfcontent.ExtractPDFContent(pdfPath)
		if err != nil {
			log.Printf("PDF %d: content extraction failed: %v", pdfID, err)
			http.Error(w, fmt.Sprintf("extraction failed: %v", err), http.StatusInternalServerError)
			return
		}
	}

	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Write([]byte(normalizeLigatures(markdown)))
}

// RenderForm handles GET requests and returns the HTML produced by
// htmlform.Render for the PDF's markdown (custom if set, otherwise extracted
// from the stored file). Field types read from the PDF are passed as
// defaults; if they can't be read, rendering proceeds without them.
func (h *Handler) RenderForm(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	pdfID, ok := pdfIDFromQuery(w, r)
	if !ok {
		return
	}

	pdf, err := db.GetPDF(h.db, pdfID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusNotFound)
		return
	}

	var markdown string
	pdfPath := filepath.Join(h.pdfDir, fmt.Sprintf("%s.pdf", pdf.MD5Hash))

	if pdf.CustomMarkdown != nil && *pdf.CustomMarkdown != "" {
		markdown = *pdf.CustomMarkdown
	} else {
		if _, statErr := os.Stat(pdfPath); statErr != nil {
			http.Error(w, "PDF file not found", http.StatusNotFound)
			return
		}
		markdown, err = pdfcontent.ExtractPDFContent(pdfPath)
		if err != nil {
			log.Printf("PDF %d: content extraction failed: %v", pdfID, err)
			http.Error(w, fmt.Sprintf("extraction failed: %v", err), http.StatusInternalServerError)
			return
		}
	}

	// Build default field types from raw PDF extraction. This is best-effort:
	// on failure defaultTypes is nil and the form is rendered without defaults.
	defaultTypes, err := buildDefaultFieldTypes(pdfPath)
	if err != nil {
		log.Printf("PDF %d: default field types unavailable: %v", pdfID, err)
	}

	html := htmlform.Render(markdown, defaultTypes)
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	w.Write([]byte(html))
}

// writeJSON sends v as a JSON response body with the given HTTP status code.
// An encoding or write failure is logged; the status line has already been
// sent by then, so it cannot be changed.
func writeJSON(w http.ResponseWriter, status int, v interface{}) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if err := json.NewEncoder(w).Encode(v); err != nil {
		log.Printf("writing JSON response failed: %v", err)
	}
}