#!/bin/bash
#
# doc2md - Convert documents (PDF, DOCX, etc.) to Markdown
#

# Abort as soon as any unguarded command fails
set -o errexit

# Put the user-local and Homebrew tool directories ahead of the inherited PATH
PATH="$HOME/.local/bin:/opt/homebrew/bin:$PATH"
export PATH

# Mode flags: a value from the environment wins, otherwise use the default
: "${LEGAL_MODE:=0}"
: "${CLEAN_MODE:=1}"  # Clean mode is on by default (it is only applied to PDFs)

# Print the usage text (formats, tools, examples, environment variables) to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally with no expansion.
show_help() {
    cat <<'USAGE_TEXT'
Usage: doc2md <input-file> [output-file]

Convert documents to Markdown format.

Supported formats:
  - PDF (.pdf) - extracts text and converts to markdown
  - Word (.docx, .doc) - converts with formatting
  - HTML (.html, .htm) - converts to markdown
  - PowerPoint (.pptx) - converts to markdown
  - Many more formats supported by pandoc

Tools used:
  - markitdown (Microsoft) - Best for complex PDFs with tables/formulas
  - marker (AI-powered) - Best for academic/scientific PDFs
  - pandoc - Universal converter for DOCX, HTML, etc.
  - pdftotext - Fast simple PDF text extraction

Examples:
  doc2md document.pdf                    # Basic conversion
  doc2md report.docx output.md           # Specify output
  doc2md paper.pdf                       # Uses markitdown by default
  USE_MARKER=1 doc2md research.pdf       # Use marker for complex PDFs
  LEGAL_MODE=1 doc2md contract.docx      # Legal mode - explicit strikethrough
  CLEAN_MODE=0 doc2md document.pdf       # Fast mode - skip line break cleanup

Environment variables:
  USE_MARKER=1    - Use marker (AI-powered) instead of markitdown for PDFs
  USE_PANDOC=1    - Use pandoc instead of markitdown for PDFs
  LEGAL_MODE=1    - Convert strikethrough ~~text~~ to [DELETED: text] for LLMs
  CLEAN_MODE=0    - Skip line break cleanup (old behavior, faster)

CLEAN MODE (DEFAULT for PDFs):
  Automatically fixes unnecessary line breaks in PDF conversions by:
  - Joining lines that are part of the same paragraph
  - Preserving paragraph breaks
  - Removing hyphenation at line breaks
  - Preserving markdown headers and lists
  
  Clean mode is now ON by default! Use CLEAN_MODE=0 to skip.

If output-file is not specified, output goes to stdout.
USAGE_TEXT
}

# Apply legal mode formatting: rewrite every ~~text~~ span in a file as
# [DELETED: text] so that deletions stay explicit in plain text.
#
# Arguments: $1 - path of the Markdown file to rewrite in place
# Outputs:   a warning on stderr when the rewrite could not be performed
# Returns:   always 0 (best effort; on failure the file is left untouched)
apply_legal_formatting() {
    local file="$1"
    local rewritten

    # Nothing to do when the path is not a regular file
    [ -f "$file" ] || return 0

    # Work through a private temp file rather than "sed -i.bak": the fixed
    # "<file>.bak" name would overwrite, and then delete, a backup the user
    # may already keep next to the output file.
    if ! rewritten=$(mktemp); then
        echo "Warning: legal mode skipped (could not create a temporary file)" >&2
        return 0
    fi

    # sed first, perl as a fallback; both use the same pattern so the result
    # does not depend on which tool ends up doing the work.
    if sed -E -e 's/~~([^~]+)~~/[DELETED: \1]/g' < "$file" > "$rewritten" 2>/dev/null ||
        perl -pe 's/~~([^~]+)~~/[DELETED: $1]/g' < "$file" > "$rewritten" 2>/dev/null; then
        # Copy the content back (instead of mv) so the target keeps its own
        # permissions and the temp file may live on another filesystem.
        cat "$rewritten" > "$file" ||
            echo "Warning: legal mode could not update $file" >&2
    else
        echo "Warning: legal mode failed (neither sed nor perl could process $file)" >&2
    fi

    rm -f "$rewritten"
    return 0
}

# Clean up line breaks in PDF text and remove headers/footers/page numbers
cleanup_line_breaks() {
    local file="$1"
    local temp_file="${file}.tmp"
    
    if [ ! -f "$file" ]; then
        return
    fi
    
    # Use Perl for sophisticated cleanup
    perl -0777 -pe '
        # First, handle hyphenation at line breaks (word- \n word -> wordword)
        s/(\w)-\s*\n\s*(\w)/$1$2/g;
        
        # Join lines within paragraphs (lines that dont end with sentence-ending punctuation)
        # Dont join if next line is a header, list item, or blank
        while (s/([^\n\.\?\!])\n(?!\n)(?!#)(?!\s*[-*])(?!\s*\d+\.)([A-Z][^\n])/$1 $2/g) {}
        
        # Also join lines that are clearly mid-sentence
        while (s/([^\.\?\!\n])\n([a-z][^\n]{0,50})\n/$1 $2\n/g) {}
        
        # Clean up excessive blank lines (more than 2 consecutive newlines)
        s/\n{3,}/\n\n/g;
        
        # Clean up multiple spaces
        s/  +/ /g;
        
        # Ensure blank line before and after headers
        s/([^\n])\n(#)/$1\n\n$2/g;
        s/(#{1,6}[^\n]+)\n([^\n#])/$1\n\n$2/g;
        
        # Remove standalone page numbers (lines with just 1-4 digit numbers)
        s/\n\s*\d{1,4}\s*\n/\n/g;
        
        # Remove common footer patterns (e.g., "Page 5 of 10", "5/10", "- 5 -")
        s/\n\s*(Page\s+\d+\s+(of|of\s+\d+))?\s*\n/\n/gi;
        s/\n\s*-?\s*\d+\s*\/\s*\d+\s*-?\s*\n/\n/g;
        s/\n\s*-\s*\d+\s*-\s*\n/\n/g;
    ' "$file" > "$temp_file"
    
    if [ -s "$temp_file" ]; then
        mv "$temp_file" "$file"
    else
        rm -f "$temp_file"
    fi
    
    # Second pass: remove repeated headers/footers
    perl -i -0777 -pe '
        my %lines;
        my @all_lines = split(/\n/, $_);
        
        # Count occurrences of short lines
        foreach my $line (@all_lines) {
            if (length($line) < 60 && length($line) > 3) {
                $lines{$line}++;
            }
        }
        
        # Remove lines that appear 3+ times (likely headers/footers)
        foreach my $line (keys %lines) {
            if ($lines{$line} >= 3) {
                my $escaped = quotemeta($line);
                s/\n$escaped\n/\n/g;
            }
        }
    ' "$file" 2>/dev/null || true
}

# Show the usage text when called without arguments or with a help flag
if [ $# -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
    show_help
    exit 0
fi

# Positional arguments: the input document and an optional output path
INPUT_FILE="$1"
OUTPUT_FILE="${2:-}"

# The input must be an existing regular file
if [ ! -f "$INPUT_FILE" ]; then
    echo "Error: File not found: $INPUT_FILE" >&2
    exit 1
fi

# Text after the last dot, lower-cased, selects the converter.
# printf is used instead of echo so an unusual value (e.g. "-n") is passed
# through literally.
EXT="${INPUT_FILE##*.}"
EXT_LOWER=$(printf '%s\n' "$EXT" | tr '[:upper:]' '[:lower:]')

# Without an explicit output path the result is staged in a temp file and
# later copied to stdout. TEMP_OUTPUT stays empty when the caller named a file.
TEMP_OUTPUT=""
if [ -z "$OUTPUT_FILE" ]; then
    TEMP_OUTPUT=$(mktemp)
    # set -e can end the script in the middle of a conversion; the EXIT trap
    # makes sure the staging file is removed on every exit path.
    trap 'rm -f "$TEMP_OUTPUT"' EXIT
    OUTPUT_FILE="$TEMP_OUTPUT"
fi

# Convert based on file type.
# Reads INPUT_FILE, EXT, EXT_LOWER, OUTPUT_FILE and TEMP_OUTPUT set earlier in
# the script. Each branch writes Markdown to $OUTPUT_FILE and reports on
# stderr; branches with explicit error handling remove the stdout staging
# file ($TEMP_OUTPUT) before exiting with status 1.
case "$EXT_LOWER" in
    pdf)
        if [ -n "$USE_MARKER" ]; then
            # Use marker for AI-powered conversion (best for complex layouts).
            # The result is looked up next to the input as <name>.md; the
            # suffix is stripped generically so "FILE.PDF" works as well.
            # NOTE(review): where marker_single writes its output depends on
            # the installed marker version - confirm this location.
            MD_FILE="${INPUT_FILE%.*}.md"
            # Test the command inside "if": under set -e a bare failing
            # marker_single would end the script before the error message
            # and the temp-file cleanup below could run.
            if marker_single "$INPUT_FILE" --output_format markdown --disable_image_extraction 2>/dev/null &&
                [ -f "$MD_FILE" ]; then
                # mv refuses to move a file onto itself
                [ "$MD_FILE" = "$OUTPUT_FILE" ] || mv "$MD_FILE" "$OUTPUT_FILE"
                echo "Converted: $INPUT_FILE -> $OUTPUT_FILE (using marker)" >&2
            else
                echo "Error: Marker conversion failed" >&2
                rm -f "$TEMP_OUTPUT"
                exit 1
            fi
        elif [ -n "$USE_PANDOC" ]; then
            # Use pandoc (fallback)
            pandoc "$INPUT_FILE" -t markdown+strikeout --wrap=none -o "$OUTPUT_FILE"
            echo "Converted: $INPUT_FILE -> $OUTPUT_FILE (using pandoc)" >&2
        else
            # Default: use markitdown (best overall)
            markitdown "$INPUT_FILE" > "$OUTPUT_FILE"
            echo "Converted: $INPUT_FILE -> $OUTPUT_FILE (using markitdown)" >&2
        fi
        ;;
    docx|doc)
        # Use pandoc for Word documents
        pandoc "$INPUT_FILE" -t markdown+strikeout --wrap=none -o "$OUTPUT_FILE"
        echo "Converted: $INPUT_FILE -> $OUTPUT_FILE" >&2
        ;;
    pptx|ppt)
        # Use markitdown for PowerPoint when installed, otherwise pandoc
        if command -v markitdown >/dev/null 2>&1; then
            markitdown "$INPUT_FILE" > "$OUTPUT_FILE"
        else
            pandoc "$INPUT_FILE" -t markdown+strikeout --wrap=none -o "$OUTPUT_FILE"
        fi
        echo "Converted: $INPUT_FILE -> $OUTPUT_FILE" >&2
        ;;
    html|htm)
        # Use pandoc for HTML
        pandoc "$INPUT_FILE" -f html -t markdown+strikeout --wrap=none -o "$OUTPUT_FILE"
        echo "Converted: $INPUT_FILE -> $OUTPUT_FILE" >&2
        ;;
    xlsx|xls|csv)
        # Excel/CSV - only markitdown is used for these; there is no fallback
        if command -v markitdown >/dev/null 2>&1; then
            markitdown "$INPUT_FILE" > "$OUTPUT_FILE"
            echo "Converted: $INPUT_FILE -> $OUTPUT_FILE" >&2
        else
            echo "Error: Excel/CSV conversion requires markitdown" >&2
            rm -f "$TEMP_OUTPUT"
            exit 1
        fi
        ;;
    txt|md|markdown)
        # Text files - already text, just copy
        cp "$INPUT_FILE" "$OUTPUT_FILE"
        echo "Copied: $INPUT_FILE -> $OUTPUT_FILE" >&2
        ;;
    *)
        # Unknown extension: try markitdown when it is installed; if it is
        # missing or fails, let pandoc try; give up only when pandoc fails too.
        if command -v markitdown >/dev/null 2>&1 &&
            markitdown "$INPUT_FILE" > "$OUTPUT_FILE" 2>/dev/null; then
            echo "Converted: $INPUT_FILE -> $OUTPUT_FILE" >&2
        elif pandoc "$INPUT_FILE" -t markdown+strikeout --wrap=none -o "$OUTPUT_FILE" 2>/dev/null; then
            echo "Converted: $INPUT_FILE -> $OUTPUT_FILE" >&2
        else
            echo "Error: Unsupported file format: .$EXT" >&2
            echo "Supported: pdf, docx, doc, pptx, html, xlsx, csv, txt, md" >&2
            rm -f "$TEMP_OUTPUT"
            exit 1
        fi
        ;;
esac

# Apply clean mode formatting by default for PDFs (unless CLEAN_MODE=0)
if [ "$CLEAN_MODE" != "0" ] && [ "$EXT_LOWER" = "pdf" ] && [ -f "$OUTPUT_FILE" ]; then
    cleanup_line_breaks "$OUTPUT_FILE"
fi

# Apply legal mode formatting if enabled, then report how many lines now
# carry a [DELETED: ...] marker
if [ "$LEGAL_MODE" = "1" ] && [ -f "$OUTPUT_FILE" ]; then
    apply_legal_formatting "$OUTPUT_FILE"
    # grep -c already prints "0" (and exits 1) when nothing matches, so only
    # its exit status has to be neutralised. Echoing a second "0" would make
    # the value "0<newline>0" and break the numeric test below.
    deletions=$(grep -c '\[DELETED:' "$OUTPUT_FILE" 2>/dev/null || true)
    if [ "${deletions:-0}" -gt 0 ]; then
        echo "Legal mode: $deletions deletion(s) made explicit" >&2
    fi
fi

# If the result was staged in a temp file, copy it to stdout and clean up
if [ -n "$TEMP_OUTPUT" ]; then
    cat "$TEMP_OUTPUT"
    rm -f "$TEMP_OUTPUT"
fi
