import React, { useState } from 'react';
// pdfjs for text extraction
import * as pdfjsLib from 'pdfjs-dist';
// pdf-lib for splitting and creating new PDFs
import { PDFDocument } from 'pdf-lib';
import JSZip from 'jszip';

// Tell pdf.js where to load its web worker script from (here: the cdnjs CDN).
// The version segment in the URL (3.3.122) is hard-coded; adjust it to match
// the pdfjs-dist version this project installs.
// NOTE(review): the protocol-relative `//` URL presumably assumes the page is
// served over http(s) — confirm for other hosting setups.
pdfjsLib.GlobalWorkerOptions.workerSrc =
  '//cdnjs.cloudflare.com/ajax/libs/pdf.js/3.3.122/pdf.worker.min.js';

/**
 * Lookup table keyed by invoice number (the digit string matched in the page
 * text). Each value lists the zero-based indices of the pages on which that
 * invoice number was found.
 */
type InvoiceMap = Record<string, number[]>;

const SplitPdf: React.FC = () => {
  const [files, setFiles] = useState<File[]>([]);
  const [loading, setLoading] = useState<boolean>(false);
  const [customFileName, setCustomFileName] = useState('');
  const [totalPages, setTotalPages] = useState(0);

  /**
   * Helper: Extract the text of every page in a PDF using pdf.js.
   *
   * @param pdfData - Raw bytes of the PDF document.
   * @returns One string per page, in page order (element 0 is page 1). Each
   *   string is the page's text items joined with single spaces.
   * @throws Whatever pdf.js rejects with when the document cannot be loaded
   *   or a page cannot be read.
   */
  const extractPdfText = async (pdfData: ArrayBuffer): Promise<string[]> => {
    const loadingTask = pdfjsLib.getDocument({ data: pdfData });
    try {
      const pdf = await loadingTask.promise;

      const pageTexts: string[] = [];
      // pdf.js numbers pages starting at 1.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();
        const pageText = textContent.items
          // Items without a `str` field contribute an empty string, which is
          // what joining an `undefined` entry produced before.
          .map((item) => ('str' in item ? item.str : ''))
          .join(' ');
        pageTexts.push(pageText);
      }
      return pageTexts;
    } finally {
      // Release the document and its worker resources whether or not
      // extraction succeeded; otherwise every processed file leaks them.
      await loadingTask.destroy();
    }
  };

  /**
   * Helper: Bucket page indices by the invoice number found in each page's text.
   *
   * Whitespace runs in a page's text are collapsed to single spaces before
   * matching. The patterns are tried in order and the first one that matches
   * decides the invoice number. Pages on which no pattern matches are left
   * out of the result.
   *
   * These patterns are only an example; adjust them to your actual invoice
   * format.
   *
   * @param pageTexts - Text of each page, indexed by zero-based page index.
   * @returns Map from invoice number to the page indices that mention it.
   */
  const groupPagesByInvoice = (pageTexts: string[]): InvoiceMap => {
    const patterns: RegExp[] = [
      /Invoice NO:\s?(\d+)/i,
      // Matches "Inv No:", "In% No:", or just "In No:"
      /In(?:v|%)?\s?No:\s?(\d+)/i,
      /Tax Invoice:\s?(\d+)/i,
    ];

    return pageTexts.reduce<InvoiceMap>((grouped, rawText, pageIndex) => {
      const normalized = rawText.replace(/\s+/g, ' ');

      // First pattern (in declaration order) that matches wins.
      let found: RegExpMatchArray | null = null;
      for (const pattern of patterns) {
        found = normalized.match(pattern);
        if (found) break;
      }
      if (!found) return grouped;

      const invoiceNumber = found[1];
      if (!grouped[invoiceNumber]) {
        grouped[invoiceNumber] = [];
      }
      grouped[invoiceNumber].push(pageIndex);
      return grouped;
    }, {});
  };

  /**
   * Helper: Generate the ZIP archive and trigger a browser download of it.
   *
   * The file is named `invoices <date>.zip`, where `<date>` is today's date
   * formatted with the `en-GB` locale and `/` replaced by `-`.
   *
   * @param zip - Archive holding the per-invoice PDFs.
   * @returns Resolves once the download has been triggered.
   */
  const downloadPdf = async (zip: JSZip): Promise<void> => {
    // Generate the ZIP as a blob
    const zipBlob = await zip.generateAsync({ type: 'blob' });
    const url = URL.createObjectURL(zipBlob);

    const datePart = new Date()
      .toLocaleDateString('en-GB')
      .replace(/\//g, '-');

    const link = document.createElement('a');
    link.href = url;
    link.download = `invoices ${datePart}.zip`;
    link.click();

    // Revoke on a later task rather than synchronously: releasing the object
    // URL immediately after click() can abort the download in browsers that
    // have not started reading the blob yet.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
  };

  /**
   * Change handler for the PDF `<input type="file" />`.
   *
   * Stores the chosen files in state. A missing or empty selection is
   * ignored, so the previously stored files stay as they were.
   */
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const picked = e.target.files;
    if (picked && picked.length > 0) {
      // FileList is array-like, not an array; copy it into a real File[].
      setFiles(Array.from(picked));
    }
  };

  /**
   * Main process: for every selected PDF, find which pages belong to which
   * invoice, write one PDF per invoice into a ZIP archive, then download it.
   *
   * Does nothing when no files are selected. Any error is logged to the
   * console, and `loading` is reset in all cases.
   */
  const processPDFs = async () => {
    if (files.length === 0) return;

    const zip = new JSZip();
    // ZIP entry names already taken, so that the same invoice number showing
    // up in two input files does not silently overwrite the earlier entry.
    const usedNames = new Set<string>();
    // 1-based page numbers matched so far, accumulated across all files.
    const pagesFound: number[] = [];

    setLoading(true);
    try {
      for (const file of files) {
        // 1) Read the file's bytes once
        const pdfBytes = await file.arrayBuffer();

        // 2) Extract text with pdfjs-dist. It gets its own copy because it
        //    may take ownership of (detach) the buffer it is handed, and the
        //    bytes are still needed by pdf-lib below.
        const pageTexts = await extractPdfText(pdfBytes.slice(0));

        // 3) Determine the pages for each invoice
        const invoicesByPage = groupPagesByInvoice(pageTexts);

        // 4) Parse the source document once per file, not once per invoice
        const sourcePdf = await PDFDocument.load(pdfBytes);

        // 5) For each invoice, copy just its pages into a new PDF
        for (const [invoiceNumber, pageIndices] of Object.entries(
          invoicesByPage
        )) {
          pagesFound.push(...pageIndices.map((i) => i + 1));

          const invoicePdf = await PDFDocument.create();
          const copiedPages = await invoicePdf.copyPages(
            sourcePdf,
            pageIndices
          );
          copiedPages.forEach((page) => invoicePdf.addPage(page));

          // Name the PDF by the invoice number, disambiguating repeats
          const baseName = `${customFileName.trim()}${invoiceNumber}`;
          let fileName = `${baseName}.pdf`;
          for (let n = 2; usedNames.has(fileName); n++) {
            fileName = `${baseName} (${n}).pdf`;
          }
          usedNames.add(fileName);

          zip.file(fileName, await invoicePdf.save());
        }

        // Show the number of matched pages (the UI labels this value
        // "Total pages found"), accumulated over the files handled so far.
        setTotalPages(pagesFound.length);
        console.log(
          'Pages found:',
          [...pagesFound].sort((a, b) => a - b)
        );
      }

      // Await so that ZIP generation failures reach the catch below and
      // `loading` stays true until the download has been triggered.
      await downloadPdf(zip);
    } catch (error) {
      console.error('Error processing PDFs:', error);
    } finally {
      setLoading(false);
    }
  };

  // Render: a multi-select file input restricted to PDFs, a button that runs
  // processPDFs (disabled while loading or when no files are selected), a text
  // input bound to customFileName, and a line showing the totalPages state.
  return (
    <div
      style={{
        marginTop: '20px',
      }}
    >
      <h2>Invoice Splitter</h2>
      <input
        type='file'
        accept='application/pdf'
        multiple
        onChange={handleFileChange}
      />
      <button onClick={processPDFs} disabled={loading || files.length === 0}>
        {loading ? 'Splitting...' : 'Split by Invoice'}
      </button>
      <div className='mt-3'>
        <p>Custom Files Name</p>
        <input
          type='text'
          value={customFileName}
          onChange={(e) => setCustomFileName(e.target.value)}
        />
      </div>
      <p className='mt-3'>Total pages found: {totalPages}</p>
    </div>
  );
};

export default SplitPdf;
