<?php
// file_extract_functions.php
require_once 'pdfparser-master/alt_autoload.php-dist';
include_once "ai_keys.php";

/**
 * Extracts plain text from a PDF using the Smalot PdfParser library.
 *
 * The raw bytes are written to a temporary file, parsed from there, and the
 * temporary file is removed again whether or not parsing succeeds.
 *
 * Failures are logged and reported as an empty string (the same convention
 * the Word/Excel extractors in this file use) instead of terminating the
 * whole script from inside a helper.
 *
 * @param string $pdf_string Raw binary content of the PDF file.
 * @return string The extracted text, or '' if the temporary file could not be
 *                created/written or the parser threw an exception.
 */
function extractTextFromPDF($pdf_string)
{
    $tempFilePath = tempnam(sys_get_temp_dir(), 'pdfparser_');
    if ($tempFilePath === false) {
        error_log("extractTextFromPDF: Could not create temporary file name.");
        return '';
    }

    if (file_put_contents($tempFilePath, $pdf_string) === false) {
        unlink($tempFilePath); // Clean up the (empty) file tempnam() created
        error_log("extractTextFromPDF: Could not write PDF string to temporary file '{$tempFilePath}'.");
        return '';
    }

    $extractedText = '';

    try {
        $parser = new \Smalot\PdfParser\Parser();
        $pdf = $parser->parseFile($tempFilePath);
        $extractedText = $pdf->getText();
    } catch (\Exception $e) {
        // Best effort: the caller still receives '', but the reason is no longer lost.
        error_log("extractTextFromPDF: Error parsing PDF: " . $e->getMessage());
    } finally {
        // Always remove the temporary copy, even when parsing failed.
        if (file_exists($tempFilePath)) {
            unlink($tempFilePath);
        }
    }

    return $extractedText;
}

/**
 * Extracts text from a PDF with the 'pdftotext' command-line tool, using its
 * layout-preserving mode so that structured PDFs such as invoices keep their
 * column alignment.
 *
 * PRE-REQUISITE: The 'pdftotext' command-line utility (from the Poppler package)
 * must be installed on the server where this script runs.
 *
 * The PDF is piped to the tool's stdin and the text is read back from stdout.
 * stderr is redirected to a temporary file instead of a pipe: with two output
 * pipes read one after the other, a child that fills the stderr pipe while we
 * are still blocked on stdout would stall both processes.
 *
 * @param string $pdfContent The raw binary content of the PDF file.
 * @return string The text extracted from the PDF, with layout preserved; '' on failure.
 */
function extractTextFromPDF_withLayout(string $pdfContent): string {
    // '-layout' keeps the physical layout; the first '-' reads the PDF from
    // stdin and the second '-' writes the text to stdout.
    $command = 'pdftotext -layout -enc UTF-8 - -';

    $stderrFile = tmpfile();
    $descriptorspec = [
        0 => ["pipe", "r"],  // stdin: the child reads the PDF from here
        1 => ["pipe", "w"],  // stdout: the child writes the text here
        // stderr: temp file when available, otherwise fall back to a pipe
        2 => ($stderrFile !== false) ? $stderrFile : ["pipe", "w"],
    ];

    $process = proc_open($command, $descriptorspec, $pipes);

    if (!is_resource($process)) {
        if ($stderrFile !== false) {
            fclose($stderrFile);
        }
        error_log("Failed to open process for pdftotext command.");
        return ''; // Return empty string on failure
    }

    // NOTE(review): writing all of stdin before reading stdout assumes pdftotext
    // buffers its whole input before emitting output — confirm for very large PDFs.
    $written = fwrite($pipes[0], $pdfContent);
    fclose($pipes[0]);
    if ($written === false || $written < strlen($pdfContent)) {
        error_log("pdftotext execution error: could not write the complete PDF to stdin.");
    }

    $extractedText = stream_get_contents($pipes[1]);
    fclose($pipes[1]);

    $errors = '';
    if (isset($pipes[2])) {
        // Pipe fallback: must be drained before the process is closed.
        $errors = (string) stream_get_contents($pipes[2]);
        fclose($pipes[2]);
    }

    $exitCode = proc_close($process);

    if ($stderrFile !== false) {
        // The child has exited, so everything it wrote to stderr is in the file now.
        rewind($stderrFile);
        $errors = (string) stream_get_contents($stderrFile);
        fclose($stderrFile);
    }

    if ($errors !== '') {
        error_log("pdftotext execution error: " . $errors);
    }
    if ($exitCode > 0) {
        error_log("pdftotext exited with code " . $exitCode);
    }

    return ($extractedText === false) ? '' : $extractedText;
}

/**
 * Uploads a full PDF to the Gemini API and asks the model to extract invoice
 * line items as structured JSON.
 *
 * Flow:
 *   1. POST the raw PDF bytes to the Gemini file service and read back the file URI.
 *   2. POST a generateContent request that references that URI together with the
 *      extraction prompt, requesting a JSON response.
 *   3. Unwrap the model's text part, strip an optional ```json fence, and decode it.
 *
 * Requires only PHP's cURL extension on the server.
 *
 * @param string $pdfContent The raw binary content of the PDF file.
 * @param string $fileName   A descriptive name for the file, e.g., 'invoice.pdf'.
 * @return array An associative array containing 'success' (bool), 'data' (array|null), and 'error' (string|null).
 */
function extractDataFromPdfWithGemini(string $pdfContent, string $fileName): array {
    $apiKey = getGeminiApiKey();
    if (empty($apiKey) || $apiKey === 'YOUR_GEMINI_API_KEY_HERE') {
        return ['success' => false, 'data' => null, 'error' => 'Gemini API key is not configured.'];
    }

    // NOTE(review): Google's Files API documentation describes media uploads against
    // an "/upload/v1beta/files" endpoint — confirm this URL accepts a raw PDF body.
    $uploadApiUrl = "https://generativelanguage.googleapis.com/v1beta/files?key=" . $apiKey;
    $generateApiUrl = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=" . $apiKey;

    // The file name ends up in an HTTP header; drop CR/LF so a crafted name
    // cannot inject additional header lines.
    $safeFileName = str_replace(["\r", "\n"], '', $fileName);

    // --- STEP 1: UPLOAD THE FILE TO THE GEMINI FILE SERVICE ---
    $upload_ch = curl_init();
    curl_setopt_array($upload_ch, [
        CURLOPT_URL => $uploadApiUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $pdfContent,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/pdf',
            'x-goog-file-name: ' . $safeFileName
        ],
        CURLOPT_CONNECTTIMEOUT => 20,
        CURLOPT_TIMEOUT => 60,
    ]);
    $upload_response_body = curl_exec($upload_ch);
    $upload_http_code = curl_getinfo($upload_ch, CURLINFO_HTTP_CODE);
    $upload_curl_error = curl_error($upload_ch);
    curl_close($upload_ch);

    if ($upload_response_body === false || $upload_curl_error || $upload_http_code >= 400) {
        $error_message = "Gemini API File Upload Error (HTTP {$upload_http_code}, cURL {$upload_curl_error}): " . $upload_response_body;
        error_log("extractDataFromPdfWithGemini: " . $error_message);
        return ['success' => false, 'data' => null, 'error' => "Failed to upload PDF to Gemini service (HTTP {$upload_http_code})."];
    }

    $upload_data = json_decode($upload_response_body, true);
    $file_uri = $upload_data['file']['uri'] ?? null;

    if (!$file_uri) {
        error_log("extractDataFromPdfWithGemini: Could not get file URI from response. Raw: " . $upload_response_body);
        return ['success' => false, 'data' => null, 'error' => 'Could not get file URI from Gemini API upload response.'];
    }

    // --- STEP 2: MAKE THE PROMPT REQUEST, REFERENCING THE UPLOADED FILE ---
    $prompt = "
    Analyze the provided PDF invoice statement.
    Extract all applicable line items from all pages.
    Fields to extract for EACH item:
        'supplier_name', 'supplier_email_domain', 'tdu_class_number', 'passenger_name', 'confirmation_number',
        'invoice_number', 'service_date', 'invoice_amount', 'description'.
    Output Instructions:
        Return a single, valid JSON array. Each element an object. If a field is not found, use 'N/A'.
        For 'supplier_email_domain', extract the domain from any supplier email address.
        class number starts with a TDU and maybe ends in a G or not depending if its FIT or Group and there will never be whitespace in between example: TDU12345G(correct), TDU88333(correct), TDU 12345(wrong), TDU 12345 G(wrong).
        Our company is Turtle Down Under, it is exempt as supplier, never extract it as a supplier.
    Example Output:
    ```json
    [
      {
        \"supplier_name\": \"Sydney Opera House Management\",
        \"supplier_email_domain\": \"sydneyoperahouse.com\",
        \"tdu_class_number\": \"TDU26972\",
        \"passenger_name\": \"Banwari Lal Rathi\",
        \"confirmation_number\": \"48910505\",
        \"invoice_number\": \"SD051775\",
        \"service_date\": \"2025-05-14\",
        \"invoice_amount\": 180.00,
        \"description\": \"NO SHOW Sydney Opera House Tour/Wed 14 May 2025/Pax:5\"
      }
    ]
    ```";

    $payload = [
        'contents' => [
            [
                'parts' => [
                    ['text' => $prompt],
                    ['file_data' => ['mime_type' => 'application/pdf', 'file_uri' => $file_uri]]
                ]
            ]
        ],
        'generationConfig' => [
            'temperature' => 0.2,
            'response_mime_type' => "application/json"
        ]
    ];

    $encodedPayload = json_encode($payload);
    if ($encodedPayload === false) {
        error_log("extractDataFromPdfWithGemini: Could not encode request payload: " . json_last_error_msg());
        return ['success' => false, 'data' => null, 'error' => 'Could not encode the request payload for the Gemini API.'];
    }

    $generate_ch = curl_init();
    curl_setopt_array($generate_ch, [
        CURLOPT_URL => $generateApiUrl,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $encodedPayload,
        CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
        CURLOPT_CONNECTTIMEOUT => 25,
        CURLOPT_TIMEOUT => 120,
    ]);

    $generate_response_body = curl_exec($generate_ch);
    $generate_http_code = curl_getinfo($generate_ch, CURLINFO_HTTP_CODE);
    $generate_curl_error = curl_error($generate_ch);
    curl_close($generate_ch);

    if ($generate_response_body === false || $generate_curl_error || $generate_http_code >= 400) {
        $error_message = "Gemini API Content Generation Error (HTTP {$generate_http_code}, cURL {$generate_curl_error}): " . $generate_response_body;
        error_log("extractDataFromPdfWithGemini: " . $error_message);
        return ['success' => false, 'data' => null, 'error' => "Gemini API failed to process the PDF (HTTP {$generate_http_code})."];
    }

    $responseData = json_decode($generate_response_body, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        error_log("extractDataFromPdfWithGemini: Failed to decode main API JSON response. Raw: " . $generate_response_body);
        return ['success' => false, 'data' => null, 'error' => 'Gemini API returned an invalid outer JSON response.'];
    }

    if (isset($responseData['promptFeedback']['blockReason'])) {
        $blockReason = $responseData['promptFeedback']['blockReason'];
        error_log("extractDataFromPdfWithGemini: Gemini Content Blocked: " . $blockReason);
        return ['success' => false, 'data' => null, 'error' => 'Gemini: Content blocked by safety settings (' . $blockReason . ').'];
    }

    $extractedJsonText = $responseData['candidates'][0]['content']['parts'][0]['text'] ?? null;
    if ($extractedJsonText === null) {
        error_log("extractDataFromPdfWithGemini: Could not extract text part from Gemini response. Raw: " . $generate_response_body);
        return ['success' => false, 'data' => null, 'error' => 'Gemini API: No text part in the response payload.'];
    }

    // The model sometimes wraps its JSON in a ```json ... ``` markdown fence; strip it.
    $trimmedText = trim($extractedJsonText);
    if (substr($trimmedText, 0, 7) === "```json") {
        $trimmedText = substr($trimmedText, 7);
        if (substr($trimmedText, -3) === "```") {
            $trimmedText = substr($trimmedText, 0, -3);
        }
    }
    $trimmedText = trim($trimmedText);

    $parsedData = json_decode($trimmedText, true);
    // Success means the decode reported NO error AND produced an array.
    // (The previous check used "!== JSON_ERROR_NONE", which could never be true
    // together with is_array(), so every call ended in the failure branch.)
    if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) {
        return ['success' => true, 'data' => $parsedData, 'error' => null];
    }

    error_log("extractDataFromPdfWithGemini: Failed to decode extracted JSON from AI. Error: " . json_last_error_msg() . ". JSON Text: " . $trimmedText);
    return ['success' => false, 'data' => null, 'error' => 'Failed to parse the final JSON data returned by the AI.'];
}

/**
 * Supplies the Gemini API key used by the extraction helpers in this file.
 *
 * This is a thin indirection over apikey(), which is defined outside this
 * function (presumably in ai_keys.php — verify), so the key source can be
 * swapped for a secure configuration or environment lookup in one place.
 *
 * @return string The Gemini API key.
 */
function getGeminiApiKey(): string {
    $geminiKey = apikey();

    return $geminiKey;
}

/**
 * PLAN B extraction: rasterises each PDF page to a JPEG with Imagick and sends
 * the pages one by one to extractDataFromImageAI(), merging the items returned
 * for every page into a single list.
 *
 * A page whose AI call fails is logged and skipped; the overall call only fails
 * when Imagick is unavailable, Imagick throws, or no page yielded any item.
 *
 * Every result — success or failure — carries the same three keys so callers
 * can read 'data' without checking for its presence first.
 *
 * @param string $pdfContent The raw binary content of the PDF file.
 * @return array An associative array containing 'success' (bool), 'data' (array|null), and 'error' (string|null).
 */
function extractDataFromPdfImagesAI(string $pdfContent): array {
    if (!class_exists('Imagick')) {
        return ['success' => false, 'data' => null, 'error' => 'The Imagick PHP extension is not installed on the server. This method is not available.'];
    }

    $imagick = null;

    try {
        $imagick = new Imagick();
        // Resolution must be set before the blob is read; 300 DPI for better OCR accuracy.
        $imagick->setResolution(300, 300);
        // Load all pages from the PDF blob
        $imagick->readImageBlob($pdfContent);

        $allItems = [];
        $pageNumber = 1;

        foreach ($imagick as $pageImage) {
            $pageImage->setImageFormat('jpeg'); // JPEG keeps the request payload small
            $pageImage->setImageCompressionQuality(90);

            $pageResult = extractDataFromImageAI($pageImage->getImageBlob(), $pageNumber);

            if ($pageResult['success'] && is_array($pageResult['data'])) {
                $allItems = array_merge($allItems, $pageResult['data']);
            } else {
                // Keep going: one unreadable page should not discard the others.
                error_log("AI OCR failed for page {$pageNumber}: " . $pageResult['error']);
            }
            $pageNumber++;
        }

        if (empty($allItems)) {
            return ['success' => false, 'data' => null, 'error' => 'AI OCR did not extract any items from any page of the PDF.'];
        }

        return ['success' => true, 'data' => $allItems, 'error' => null];

    } catch (Exception $e) {
        return ['success' => false, 'data' => null, 'error' => 'Imagick processing failed: ' . $e->getMessage()];
    } finally {
        // Release the rasterised pages on every path, including exceptions.
        if ($imagick !== null) {
            $imagick->clear();
        }
    }
}

/**
 * Sends a single page image to the AI endpoint returned by apiURL_Flash() and
 * asks for the invoice line items on that page as a JSON array.
 *
 * The image is embedded base64-encoded in the request body. Transport errors,
 * HTTP error statuses, a missing text part, and undecodable JSON are all
 * reported as a failure result rather than being passed on to json_decode().
 *
 * @param string $imageBlob The binary content of the image file (sent as image/jpeg).
 * @param int $pageNum The page number, included in the prompt for context.
 * @return array An associative array containing 'success' (bool), 'data' (array|null), and 'error' (string|null).
 */
function extractDataFromImageAI(string $imageBlob, int $pageNum): array {
    // Base64-encode the image data to embed it in the JSON payload
    $base64Image = base64_encode($imageBlob);

    $prompt = "
    Analyze the following image of an invoice statement (Page {$pageNum}).
    Extract all applicable line items visible on THIS PAGE.
    Fields to extract for EACH item:
        'supplier_name', 'supplier_email_domain', 'tdu_class_number', 'passenger_name', 'confirmation_number',
        'invoice_number', 'service_date', 'invoice_amount', 'description'.
    Output Instructions:
        Return a single, valid JSON array. Each element an object. If a field is not found, use 'N/A'.
        For 'supplier_email_domain', extract the domain (e.g., 'example.com') from any supplier email address.
        class number starts with a TDU and maybe ends in a G or not depending if its FIT or Group and there will never be whitespace in between example: TDU12345G(correct), TDU88333(correct), TDU 12345(wrong), TDU 12345 G(wrong).
        Our company is Turtle Down Under, it is exempt as supplier, never extract it as a supplier.
    Example:
        [{\"supplier_name\": \"Sydney Opera House\", \"supplier_email_domain\": \"sydneyoperahouse.com\", \"tdu_class_number\": \"TDU27014G\", ...}]";

    $apiUrl = apiURL_Flash();
    $payload = [
        'contents' => [
            [
                'parts' => [
                    ['text' => $prompt],
                    [
                        'inline_data' => [
                            'mime_type' => 'image/jpeg',
                            'data' => $base64Image
                        ]
                    ]
                ]
            ]
        ],
        'generationConfig' => [
            'temperature' => 0.2,
            'response_mime_type' => "application/json"
        ]
    ];

    $ch = curl_init($apiUrl);
    if ($ch === false) {
        error_log("extractDataFromImageAI: curl_init failed for page {$pageNum}.");
        return ['success' => false, 'data' => null, 'error' => 'Could not initialise the AI request.'];
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => json_encode($payload),
        CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
        CURLOPT_TIMEOUT => 300,
    ]);
    $response_body = curl_exec($ch);
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curl_error = curl_error($ch);
    curl_close($ch);

    if ($response_body === false || $curl_error !== '' || $http_code >= 400) {
        error_log("extractDataFromImageAI: AI request failed for page {$pageNum} (HTTP {$http_code}, cURL {$curl_error}): " . $response_body);
        return ['success' => false, 'data' => null, 'error' => "AI request failed (HTTP {$http_code})."];
    }

    $responseData = json_decode($response_body, true);
    $extractedJsonText = $responseData['candidates'][0]['content']['parts'][0]['text'] ?? null;

    // Without a text part there is nothing to decode; do not hand null to json_decode().
    if (!is_string($extractedJsonText)) {
        error_log("extractDataFromImageAI: No text part in AI response for page {$pageNum}. Raw: " . $response_body);
        return ['success' => false, 'data' => null, 'error' => 'Failed to parse AI JSON response.'];
    }

    $parsedData = json_decode($extractedJsonText, true);
    if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) {
        return ['success' => true, 'data' => $parsedData, 'error' => null];
    }

    return ['success' => false, 'data' => null, 'error' => 'Failed to parse AI JSON response.'];
}

/**
 * Lightweight text extraction from a Word document supplied as a raw string.
 *
 * DOCX (ZIP signature 'PK\x03\x04'): the content is written to a temporary
 * file, 'word/document.xml' is read through ZipArchive, its XML tags are
 * stripped, XML entities are decoded and whitespace is collapsed. Paragraph
 * ends, line breaks and tabs are turned into spaces BEFORE the tags are
 * stripped; otherwise the last word of one paragraph is glued to the first
 * word of the next ("HelloWorld").
 *
 * Anything else (legacy binary DOC, unknown data): a crude character filter
 * is applied and a short excerpt of the result is logged for diagnostics only;
 * this branch always returns ''.
 *
 * @param string $content_string The raw file content as a string.
 * @return string Unstructured text for DOCX input; '' on failure, empty input, or non-DOCX input.
 */
function extractTextfromWordDoc(string $content_string)
{
    if (empty($content_string)) {
        return '';
    }

    if (strncmp($content_string, "PK\x03\x04", 4) === 0) {
        // --- DOCX handling using ZipArchive ---
        $extractedText = '';
        $tempFilePath = tempnam(sys_get_temp_dir(), 'docx_');
        if ($tempFilePath === false) {
            error_log("lightweightWordDocExtract: Could not create temp file name for DOCX.");
            return '';
        }

        if (file_put_contents($tempFilePath, $content_string) === false) {
            unlink($tempFilePath);
            error_log("lightweightWordDocExtract: Could not write DOCX content to temp file: " . $tempFilePath);
            return '';
        }

        $zip = new \ZipArchive();
        $zipOpened = false; // close() must only be called on an archive that was actually opened
        try {
            $res = $zip->open($tempFilePath);
            if ($res === true) {
                $zipOpened = true;
                $xmlContent = $zip->getFromName('word/document.xml');

                if ($xmlContent !== false) {
                    // Keep word boundaries: paragraph ends, breaks and tabs become spaces.
                    $separated = preg_replace('/<\/w:p>|<w:(?:br|cr|tab)\b[^>]*>/', ' ', $xmlContent);
                    if ($separated === null) {
                        $separated = $xmlContent; // regex engine failure: fall back to plain stripping
                    }

                    $text = strip_tags($separated);
                    $text = html_entity_decode($text, ENT_QUOTES | ENT_XML1);
                    $text = preg_replace('/\s+/s', ' ', $text);
                    $extractedText = trim((string) $text);
                } else {
                    error_log("lightweightWordDocExtract: 'word/document.xml' not found in the DOCX file: " . $tempFilePath);
                }
            } else {
                error_log("lightweightWordDocExtract: Failed to open DOCX (ZipArchive error code: " . $res . ") file: " . $tempFilePath);
            }
        } catch (\Exception $e) {
            error_log("lightweightWordDocExtract: Exception processing DOCX file: " . $e->getMessage());
        } finally {
            if ($zipOpened) {
                $zip->close();
            }
            if (file_exists($tempFilePath)) {
                unlink($tempFilePath);
            }
        }

        return $extractedText;
    }

    // --- Fallback for potential DOC (binary format) - HIGHLY UNRELIABLE ---
    $docSignature = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"; // OLE compound file signature
    if (strncmp($content_string, $docSignature, 8) !== 0) {
        error_log("lightweightWordDocExtract: File signature doesn't match DOCX or common DOC. Treating as unknown binary/text.");
    } else {
        error_log("lightweightWordDocExtract: Detected potential DOC (binary) signature. Attempting basic text filtering (unreliable).");
    }

    // Keep letters, digits, common punctuation and whitespace; drop everything else.
    // The '/' inside the character class has to be escaped for PCRE: the previous
    // literal produced an unescaped '/', which ended the pattern early, raised an
    // "Unknown modifier" warning and made preg_replace() return null on every call.
    $filtered_text = preg_replace('/[^\p{L}\p{N}\s\.,!?"\'()\[\]{}<>@#$%^&*\-_+=:;~`|\\\\\/\r\n\t]/u', '', $content_string);

    // null also covers input that is not valid UTF-8 (typical for real binary DOC files).
    if ($filtered_text === null || strlen(trim($filtered_text)) < 20) { // Arbitrary minimum length
        error_log("lightweightWordDocExtract: Basic filtering for potential DOC yielded little or no probable text.");
        return '';
    }

    $filtered_text = preg_replace('/\s+/s', ' ', $filtered_text);

    // Diagnostic only: log an excerpt rather than the whole document.
    error_log("non docx filter attempt: " . substr((string) $filtered_text, 0, 100) . "...");
    return "";
}

/**
 * Attempts to extract text content from an XLSX file string using ZipArchive.
 * WARNING: This is a VERY basic and unreliable method without external libraries.
 * It ignores spreadsheet structure (rows/columns) and shared string relationships,
 * concatenating all found text. The fallback for binary XLS is extremely unreliable.
 * Use PhpOffice/PhpSpreadsheet for proper extraction if possible.
 *
 * XLSX (ZIP signature 'PK\x03\x04'): text is taken from 'xl/sharedStrings.xml'
 * and from every '*.xml' entry under 'xl/worksheets/'. A space is inserted after
 * each shared string, cell, formula and row before the tags are stripped, so
 * neighbouring values stay separate ("Name Amount", "1 2") instead of being
 * fused ("NameAmount", "12").
 *
 * Anything else: a character filter keeps letters, digits, common punctuation
 * and whitespace and the result is returned when it is at least 10 bytes long.
 * Input that is not valid UTF-8 (typical for real binary XLS files) makes the
 * filter fail and yields ''.
 *
 * @param string $content_string The raw file content as a string.
 * @return string Attempted extraction of text, likely unstructured. Returns empty string on failure or empty input.
 */
function extractTextFromExcelSheet(string $content_string): string
{
    if (empty($content_string)) {
        return '';
    }

    if (strncmp($content_string, "PK\x03\x04", 4) === 0) {
        // --- XLSX handling using ZipArchive ---
        $extractedText = '';
        $tempFilePath = tempnam(sys_get_temp_dir(), 'xlsx_');
        if ($tempFilePath === false) {
            error_log("extractTextFromExcelSheet: Could not create temp file name for XLSX.");
            return '';
        }

        if (file_put_contents($tempFilePath, $content_string) === false) {
            unlink($tempFilePath);
            error_log("extractTextFromExcelSheet: Could not write XLSX content to temp file: " . $tempFilePath);
            return '';
        }

        // Turns one SpreadsheetML part into plain text while keeping value boundaries.
        $xmlToText = static function (string $xml): string {
            $separated = preg_replace('/<\/(?:si|c|f|row)>/', '$0 ', $xml);
            if ($separated === null) {
                $separated = $xml; // regex engine failure: fall back to plain stripping
            }
            return html_entity_decode(strip_tags($separated), ENT_QUOTES | ENT_XML1);
        };

        $zip = new \ZipArchive();
        $zipOpened = false; // close() must only be called on an archive that was actually opened
        try {
            $res = $zip->open($tempFilePath);
            if ($res === true) {
                $zipOpened = true;
                $fullText = '';

                // a) Shared strings table (where cell text usually lives)
                $sharedStringsXml = $zip->getFromName('xl/sharedStrings.xml');
                if ($sharedStringsXml !== false) {
                    $fullText .= $xmlToText($sharedStringsXml) . ' ';
                    error_log("extractTextFromExcelSheet: Extracted text from sharedStrings.xml in " . $tempFilePath);
                } else {
                    error_log("extractTextFromExcelSheet: 'xl/sharedStrings.xml' not found in the XLSX file: " . $tempFilePath);
                }

                // b) Individual worksheets (inline strings, numbers, formulas)
                for ($i = 0; $i < $zip->numFiles; $i++) {
                    $fileName = $zip->getNameIndex($i);
                    if ($fileName === false) {
                        continue;
                    }
                    if (strpos($fileName, 'xl/worksheets/') === 0 && substr($fileName, -4) === '.xml') {
                        $sheetXml = $zip->getFromIndex($i);
                        if ($sheetXml !== false) {
                            $fullText .= $xmlToText($sheetXml) . ' ';
                            error_log("extractTextFromExcelSheet: Extracted text from sheet " . $fileName . " in " . $tempFilePath);
                        }
                    }
                }

                // Normalize whitespace on the combined text
                $extractedText = trim((string) preg_replace('/\s+/s', ' ', $fullText));
            } else {
                error_log("extractTextFromExcelSheet: Failed to open XLSX (ZipArchive error code: " . $res . ") file: " . $tempFilePath);
            }
        } catch (\Exception $e) {
            error_log("extractTextFromExcelSheet: Exception processing XLSX file: " . $e->getMessage());
        } finally {
            if ($zipOpened) {
                $zip->close();
            }
            if (file_exists($tempFilePath)) {
                unlink($tempFilePath);
            }
        }

        return $extractedText;
    }

    // --- Fallback for potential XLS (binary format) - EXTREMELY UNRELIABLE ---
    $oleSignature = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"; // OLE compound file signature
    if (strncmp($content_string, $oleSignature, 8) !== 0) {
        error_log("extractTextFromExcelSheet: File signature doesn't match XLSX (ZIP) or common XLS (OLE). Treating as unknown binary/text.");
    } else {
        error_log("extractTextFromExcelSheet: Detected potential XLS (binary) signature. Attempting basic text filtering (highly unreliable).");
    }

    // Keep letters, digits, common punctuation and whitespace; drop everything else.
    // The '/' inside the character class has to be escaped for PCRE: the previous
    // literal produced an unescaped '/', which ended the pattern early, raised an
    // "Unknown modifier" warning and made preg_replace() return null — so this
    // branch returned '' for every input.
    $filtered_text = preg_replace('/[^\p{L}\p{N}\s\.,!?"\'()\[\]{}<>@#$%^&*\-_+=:;~`|\\\\\/\r\n\t]/u', '', $content_string);

    if ($filtered_text === null || strlen(trim($filtered_text)) < 10) { // Arbitrary minimum length
        error_log("extractTextFromExcelSheet: Basic filtering for potential XLS yielded little or no probable text.");
        return '';
    }

    // Normalize whitespace on the filtered text
    $filtered_text = trim((string) preg_replace('/\s+/s', ' ', $filtered_text));

    error_log("extractTextFromExcelSheet: Basic filtering for non-XLSX file returned potential text (highly unreliable): " . substr($filtered_text, 0, 100) . "..."); // Log start of text

    // Returned with heavy caveats: the text carries no row/column structure.
    return $filtered_text;
}

/**
 * Maps raw file content to a user-friendly file type name.
 *
 * Works in two stages: determineMimeType() detects the technical MIME type of
 * the content, and translateMimeToFriendlyName() converts that MIME type into
 * the label handed back to the caller.
 *
 * @param string $content_string The raw string content of the file.
 * @return string Whatever translateMimeToFriendlyName() returns for the detected MIME type
 *                (e.g. "PDF Document", "JPEG Image", or "Empty File" for an empty input string —
 *                labels as listed by the original documentation; verify against that helper).
 */
function getFileTypeFromString(string $content_string)
{
    return translateMimeToFriendlyName(determineMimeType($content_string));
}

/**
 * Determines the technical MIME type of raw file content.
 *
 * Detection order:
 *   1. A zero-length string is reported as 'application/x-empty'.
 *   2. The fileinfo extension (finfo_buffer) is used when it is available.
 *   3. Otherwise the start of the content is compared against a table of
 *      well-known magic numbers; generic ZIP and OLE containers are refined by
 *      searching the content for format-specific marker strings.
 *   4. If no signature matched, the content is sniffed for HTML, XML, JSON and CSV.
 *   5. Content containing a NUL byte is 'application/octet-stream'; anything
 *      else is assumed to be 'text/plain'.
 *
 * @param string $content_string The raw string content of the file.
 * @return string The detected MIME type, without parameters such as charset.
 */
function determineMimeType(string $content_string): string
{
    // 1. Handle empty input.
    // Compare against '' explicitly: empty() is also true for the one-byte
    // content "0", which is a real (non-empty) file.
    if ($content_string === '') {
        return 'application/x-empty';
    }

    // 2. Preferred method: use the fileinfo extension if available.
    if (extension_loaded('fileinfo') && function_exists('finfo_open') && function_exists('finfo_buffer')) {
        $finfo = finfo_open(FILEINFO_MIME_TYPE);
        if ($finfo) {
            $determined_mime = finfo_buffer($finfo, $content_string);
            finfo_close($finfo);
            if ($determined_mime !== false) {
                // Drop anything after a ';' so only the bare MIME type is returned.
                return explode(';', $determined_mime, 2)[0];
            }
            error_log("determineMimeType: finfo_buffer failed.");
        } else {
            error_log("determineMimeType: finfo_open failed.");
        }
    }

    // 3. Fallback method: check common magic numbers at the start of the content.
    // Signatures containing \x escapes MUST be double-quoted; PHP does not expand
    // \xNN inside single-quoted strings, so such a signature would never match.
    $signatures = [
        '%PDF-'                             => 'application/pdf',
        "\x89PNG\x0D\x0A\x1A\x0A"            => 'image/png',
        "\xFF\xD8\xFF"                      => 'image/jpeg',
        'GIF87a'                            => 'image/gif',
        'GIF89a'                            => 'image/gif',
        'BM'                                => 'image/bmp',
        "II*\x00"                           => 'image/tiff', // Little-endian TIFF
        "MM\x00*"                           => 'image/tiff', // Big-endian TIFF
        "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"   => 'application/vnd.ms-office', // Generic OLE (DOC, XLS, PPT 97-2003)
        "PK\x03\x04"                        => 'application/zip', // Generic ZIP (DOCX, XLSX, PPTX etc)
        'ID3'                               => 'audio/mpeg',
        '<?xml'                             => 'application/xml', // Case sensitive check here
        "\x1F\x8B"                          => 'application/gzip',
        "RAR!\x1A\x07\x00"                  => 'application/vnd.rar',
        "RAR!\x1A\x07\x01\x00"              => 'application/vnd.rar',
    ];

    foreach ($signatures as $signature => $type) {
        if (strncmp($content_string, $signature, strlen($signature)) !== 0) {
            continue;
        }

        // --- Special handling to refine generic container types ---
        if ($type === 'application/zip') {
            // Look for the part names / mimetype entries that identify specific
            // Office and OpenDocument formats inside the ZIP.
            if (stripos($content_string, 'word/document.xml') !== false) return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; // DOCX
            if (stripos($content_string, 'xl/workbook.xml') !== false) return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'; // XLSX
            if (stripos($content_string, 'ppt/presentation.xml') !== false) return 'application/vnd.openxmlformats-officedocument.presentationml.presentation'; // PPTX
            if (stripos($content_string, 'mimetypeapplication/vnd.oasis.opendocument.text') !== false) return 'application/vnd.oasis.opendocument.text'; // ODT
            if (stripos($content_string, 'mimetypeapplication/vnd.oasis.opendocument.spreadsheet') !== false) return 'application/vnd.oasis.opendocument.spreadsheet'; // ODS
            if (stripos($content_string, 'mimetypeapplication/vnd.oasis.opendocument.presentation') !== false) return 'application/vnd.oasis.opendocument.presentation'; // ODP

            return 'application/zip'; // Generic ZIP if no specific marker was found
        }

        if ($type === 'application/vnd.ms-office') {
            // Attempt to identify specific older Office formats (less reliable).
            // NOTE(review): OLE directory entries normally store stream names as
            // UTF-16LE, so these single-byte searches may miss real files — confirm.
            if (stripos($content_string, 'WordDocument') !== false) return 'application/msword'; // DOC
            if (stripos($content_string, 'Workbook') !== false || stripos($content_string, 'Book') !== false) return 'application/vnd.ms-excel'; // XLS (Checking 'Book' is common too)
            if (stripos($content_string, 'PowerPoint') !== false) return 'application/vnd.ms-powerpoint'; // PPT

            return 'application/vnd.ms-office'; // Generic OLE if no specific hint was found
        }
        // --- End special handling ---

        return $type; // Plain signature match
    }

    // 4. Content sniffing for text-based types (no signature matched).
    $trimmed_content = trim($content_string);

    // HTML: a leading doctype/html tag, or tag-like content that includes a <body> tag.
    if (stripos($trimmed_content, '<!doctype html') === 0 || stripos($trimmed_content, '<html') === 0 || (stripos($trimmed_content, '<') !== false && stripos($trimmed_content, '</') !== false && stripos($trimmed_content, '<body') !== false)) {
        return 'text/html';
    }

    // Case-insensitive XML declaration check.
    if (stripos($trimmed_content, '<?xml') === 0) {
        return 'application/xml';
    }

    // JSON: must be wrapped in {...} or [...] and must actually decode.
    // Only the first and last characters are inspected; no need to reverse the string.
    $first_char = substr($trimmed_content, 0, 1);
    $last_char  = substr($trimmed_content, -1);
    if (($first_char === '{' && $last_char === '}') || ($first_char === '[' && $last_char === ']')) {
        if (json_decode($trimmed_content) !== null) {
            return 'application/json';
        }
    }

    // CSV heuristic: at least two lines within the first 500 bytes, with a comma
    // in each of the first two lines.
    $lines = explode("\n", substr($trimmed_content, 0, 500), 3);
    if (count($lines) > 1 && strpos($lines[0], ',') !== false) {
        if (isset($lines[1]) && strpos($lines[1], ',') !== false) {
            return 'text/csv';
        }
    }

    // 5. Default fallback: a NUL byte means binary, otherwise assume plain text.
    if (strpos($content_string, "\0") !== false) {
        return 'application/octet-stream'; // Generic binary
    }

    return 'text/plain';
}


/**
 * Maps a technical MIME type onto the label shown to users.
 *
 * Known types are looked up in per-category tables (documents, images,
 * text/data, archives, audio, video, special cases). A type that is not in any
 * table yields "Unknown File Type (<mime>)", embedding the MIME type as given.
 *
 * @param string $mime_type The technical MIME type.
 * @return string The user-friendly file type name.
 */
function translateMimeToFriendlyName(string $mime_type)
{
    $documentLabels = [
        'application/pdf' => 'PDF Document',
        'application/msword' => 'Word Document (Old Format - DOC)',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'Word Document (Modern Format - DOCX)',
        'application/vnd.ms-excel' => 'Excel Spreadsheet (Old Format - XLS)',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'Excel Spreadsheet (Modern Format - XLSX)',
        'application/vnd.ms-powerpoint' => 'PowerPoint Presentation (Old Format - PPT)',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'PowerPoint Presentation (Modern Format - PPTX)',
        'application/vnd.oasis.opendocument.text' => 'OpenDocument Text (ODT)',
        'application/vnd.oasis.opendocument.spreadsheet' => 'OpenDocument Spreadsheet (ODS)',
        'application/vnd.oasis.opendocument.presentation' => 'OpenDocument Presentation (ODP)',
        // Generic label for OLE containers that could not be narrowed down.
        'application/vnd.ms-office' => 'Microsoft Office Document (Old Format)',
    ];

    $imageLabels = [
        'image/jpeg' => 'JPEG Image',
        'image/png' => 'PNG Image',
        'image/gif' => 'GIF Image',
        'image/bmp' => 'Bitmap Image',
        'image/tiff' => 'TIFF Image',
        'image/webp' => 'WEBP Image',
        'image/svg+xml' => 'SVG Image',
    ];

    $textLabels = [
        'text/plain' => 'Plain Text File',
        'text/html' => 'HTML Web Page File',
        'text/css' => 'CSS Stylesheet File',
        'text/csv' => 'CSV Data File',
        'application/json' => 'JSON Data File',
        // Both XML spellings share one label.
        'application/xml' => 'XML Data File',
        'text/xml' => 'XML Data File',
    ];

    $archiveLabels = [
        'application/zip' => 'ZIP Archive File',
        // Both RAR spellings share one label.
        'application/x-rar-compressed' => 'RAR Archive File',
        'application/vnd.rar' => 'RAR Archive File',
        'application/gzip' => 'Gzip Archive File',
        'application/x-tar' => 'TAR Archive File',
        'application/x-7z-compressed' => '7-Zip Archive File',
    ];

    $audioLabels = [
        'audio/mpeg' => 'MP3 Audio File',
        'audio/ogg' => 'Ogg Vorbis Audio File',
        'audio/wav' => 'WAV Audio File',
        'audio/aac' => 'AAC Audio File',
        'audio/flac' => 'FLAC Audio File',
    ];

    $videoLabels = [
        'video/mp4' => 'MP4 Video File',
        'video/webm' => 'WebM Video File',
        'video/ogg' => 'Ogg Video File',
        'video/mpeg' => 'MPEG Video File',
        'video/quicktime' => 'QuickTime Video File (MOV)',
        'video/x-msvideo' => 'AVI Video File',
    ];

    $specialLabels = [
        'application/octet-stream' => 'Unknown Binary File',
        // Two conventions exist for zero-length content.
        'application/x-empty' => 'Empty File',
        'inode/x-empty' => 'Empty File',
    ];

    // The category tables have disjoint keys, so array union simply concatenates them.
    $labelsByMime = $documentLabels
        + $imageLabels
        + $textLabels
        + $archiveLabels
        + $audioLabels
        + $videoLabels
        + $specialLabels;

    if (isset($labelsByMime[$mime_type])) {
        return $labelsByMime[$mime_type];
    }

    // Unmapped type: surface the raw MIME type inside a generic description.
    return 'Unknown File Type (' . $mime_type . ')';
}
?>