// Source: AI-Tools/ocr.cpp (main branch), GitHub repository LynxGeekNYC/AI-Tools
#include <algorithm>
#include <atomic>
#include <cctype>
#include <exception>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <system_error>
#include <thread>
#include <vector>

#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
#include <poppler-document.h>
#include <poppler-page.h>
#include <opencv2/opencv.hpp>
#include <nlohmann/json.hpp>

// Shorthand for the nlohmann JSON value type used to build the result document.
using json = nlohmann::json;
// Shared by all worker threads: processFile() locks this before inserting an entry
// into the JSON result object.
std::mutex jsonMutex; // Mutex for thread-safe JSON access

// Preprocess image using OpenCV
std::string preprocessAndOCR(const std::string &imagePath) {
    cv::Mat img = cv::imread(imagePath, cv::IMREAD_GRAYSCALE);
    if (img.empty()) {
        std::cerr << "Failed to read image: " << imagePath << std::endl;
        return "";
    }

    // Apply preprocessing: thresholding, noise removal
    cv::Mat processedImg;
    cv::threshold(img, processedImg, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);
    cv::imwrite("temp_processed.png", processedImg); // Save processed image

    // Perform OCR on the processed image
    tesseract::TessBaseAPI ocr;
    if (ocr.Init(nullptr, "eng")) {
        std::cerr << "Could not initialize Tesseract." << std::endl;
        return "";
    }

    Pix *image = pixRead("temp_processed.png");
    if (!image) {
        std::cerr << "Failed to open processed image file." << std::endl;
        return "";
    }

    ocr.SetImage(image);
    std::string text = ocr.GetUTF8Text();
    ocr.End();
    pixDestroy(&image);

    // Clean up temporary file
    std::remove("temp_processed.png");
    return text;
}

// Extract text from PDF
std::string extractPDFText(const std::string &pdfPath) {
    auto doc = poppler::document::load_from_file(pdfPath);
    if (!doc) {
        std::cerr << "Failed to open PDF file: " << pdfPath << std::endl;
        return "";
    }

    std::string pdfText;
    for (int i = 0; i < doc->pages(); ++i) {
        auto page = doc->create_page(i);
        if (page) {
            pdfText += page->text().to_latin1();
        }
    }
    return pdfText;
}

// Process individual file
// Extract the text of one input file and record it in the shared JSON result.
//
// @param file    Path of the input. A ".pdf" extension (matched case-insensitively)
//                selects PDF text extraction; anything else is treated as an image
//                and sent through OCR.
// @param result  JSON object shared between worker threads. On success an entry
//                result[file] = {"type": "PDF"|"Image", "text": <extracted text>}
//                is added while holding jsonMutex.
//
// Files that yield no text leave `result` untouched.
void processFile(const std::string &file, json &result) {
    // Only a dot inside the last path component marks an extension; a path without
    // one (or with dots only in directory names) gets an empty extension.
    const std::size_t lastSeparator = file.find_last_of("/\\");
    const std::size_t lastDot = file.find_last_of('.');
    std::string extension;
    if (lastDot != std::string::npos &&
        (lastSeparator == std::string::npos || lastDot > lastSeparator)) {
        extension = file.substr(lastDot + 1);
    }
    // Lower-case so that "scan.PDF" is routed to the PDF path as well.
    std::transform(extension.begin(), extension.end(), extension.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    const bool isPdf = (extension == "pdf");
    const std::string extractedText = isPdf ? extractPDFText(file) : preprocessAndOCR(file);
    if (extractedText.empty()) {
        return;
    }

    // Lock mutex to safely modify JSON; extraction above runs outside the lock.
    std::lock_guard<std::mutex> lock(jsonMutex);
    result[file] = {
        {"type", isPdf ? "PDF" : "Image"},
        {"text", extractedText}
    };
}

// Multithreaded document processing
void processDocuments(const std::vector<std::string> &files, const std::string &outputJson) {
    json result;
    std::vector<std::thread> threads;

    for (const auto &file : files) {
        threads.emplace_back(processFile, file, std::ref(result));
    }

    // Wait for all threads to complete
    for (auto &t : threads) {
        if (t.joinable()) {
            t.join();
        }
    }

    // Save result to JSON file
    std::ofstream outFile(outputJson);
    if (!outFile) {
        std::cerr << "Failed to write JSON file: " << outputJson << std::endl;
        return;
    }
    outFile << result.dump(4); // Pretty print with indentation
    std::cout << "Data saved to " << outputJson << std::endl;
}

// Entry point. Expects the output JSON path followed by one or more input files;
// prints a usage line and exits with status 1 when fewer arguments are given.
int main(int argc, char *argv[]) {
    const bool hasRequiredArgs = (argc >= 3);
    if (!hasRequiredArgs) {
        std::cerr << "Usage: " << argv[0] << " <output_json> <files...>" << std::endl;
        return 1;
    }

    // argv[1] is the destination; everything after it is an input document.
    const std::string jsonPath(argv[1]);
    std::vector<std::string> inputPaths;
    for (int argIndex = 2; argIndex < argc; ++argIndex) {
        inputPaths.emplace_back(argv[argIndex]);
    }

    processDocuments(inputPaths, jsonPath);
    return 0;
}