Experimental-Projects/ImageToTextExtractor.java at main · TaurusSilver201/Experimental-Projects · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;

/**
 * Batch OCR tool: walks an input directory, runs Tesseract (via Tess4J) on every
 * {@code .png} / {@code .jpg} file found, and writes the recognised words of each
 * image to a {@code .txt} file in the output directory.
 *
 * <p>Work is distributed with a producer/consumer scheme: {@link #main} enqueues
 * image paths on a shared queue, and a fixed pool of {@link ImageProcessingWorker}
 * instances drains it. Each worker stops when it takes the {@link #POISON_PILL}
 * sentinel from the queue.
 *
 * <p>NOTE: output files are named after the image's file name only, so two images
 * with the same name in different sub-directories write to the same output file.
 */
public class ImageToTextExtractor {
    private static final Logger logger = Logger.getLogger(ImageToTextExtractor.class.getName());
    private static final AtomicInteger processedCount = new AtomicInteger();
    private static final AtomicInteger errorCount = new AtomicInteger();

    /**
     * Number of OCR workers. Captured once so the pool size, the number of workers
     * started and the number of poison pills sent can never disagree.
     */
    private static final int WORKER_COUNT = Runtime.getRuntime().availableProcessors();

    private static final ExecutorService executor = Executors.newFixedThreadPool(WORKER_COUNT);
    private static final BlockingQueue<Path> imageQueue = new LinkedBlockingQueue<>();

    /**
     * Sentinel telling a worker to stop. {@link LinkedBlockingQueue} rejects
     * {@code null} elements, so a dedicated instance is used and compared by identity.
     */
    private static final Path POISON_PILL = Paths.get("");

    /** Word-level page iterator level (mirrors Tess4J's {@code TessPageIteratorLevel.RIL_WORD}). */
    private static final int RIL_WORD = 3;

    /**
     * Runs the extraction over the configured directories and logs a summary.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String imagePath = "/path/to/images";
        String outputPath = "/path/to/output";
        String dataPath = "/path/to/tessdata";
        String language = "eng"; // Set the language code for OCR

        boolean finished = false;
        try {
            // Workers write straight into this directory, so it must exist up front.
            Files.createDirectories(Paths.get(outputPath));

            // Start worker threads
            for (int i = 0; i < WORKER_COUNT; i++) {
                executor.execute(new ImageProcessingWorker(outputPath, dataPath, language));
            }

            // Traverse the input directory and add files to the queue
            enqueueImages(Paths.get(imagePath));

            // One poison pill per worker so that every worker terminates
            for (int i = 0; i < WORKER_COUNT; i++) {
                imageQueue.put(POISON_PILL);
            }

            // Wait for all tasks to complete
            executor.shutdown();
            boolean terminated = executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            finished = true;

            if (terminated) {
                logger.info("Text extraction completed. Processed " + processedCount.get() + " files.");
                if (errorCount.get() > 0) {
                    logger.severe("Errors occurred during processing " + errorCount.get() + " files.");
                }
            } else {
                logger.severe("Text extraction was interrupted.");
            }
        } catch (IOException | UncheckedIOException e) {
            // Files.walk reports traversal failures lazily as UncheckedIOException.
            logger.log(Level.SEVERE, "Error occurred: " + e.getMessage(), e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.log(Level.SEVERE, "Error occurred: " + e.getMessage(), e);
        } finally {
            if (!finished) {
                // Without this, workers blocked on take() would keep the JVM alive forever.
                executor.shutdownNow();
            }
        }
    }

    /**
     * Walks {@code root} recursively and enqueues every supported image file.
     *
     * @param root directory to traverse
     * @throws IOException          if the traversal cannot be started
     * @throws InterruptedException if the calling thread is interrupted while enqueueing
     */
    private static void enqueueImages(Path root) throws IOException, InterruptedException {
        // Files.walk holds open directory handles; the stream must be closed.
        try (Stream<Path> paths = Files.walk(root)) {
            Stream<Path> images = paths.filter(ImageToTextExtractor::isSupportedImage);
            for (Path image : (Iterable<Path>) images::iterator) {
                imageQueue.put(image);
            }
        }
    }

    /** Returns whether {@code path} is a regular file with a PNG or JPG extension (any case). */
    private static boolean isSupportedImage(Path path) {
        if (!Files.isRegularFile(path)) {
            return false;
        }
        String name = path.toString();
        return endsWithIgnoreCase(name, ".png") || endsWithIgnoreCase(name, ".jpg");
    }

    /** Case-insensitive {@link String#endsWith(String)}. */
    private static boolean endsWithIgnoreCase(String text, String suffix) {
        int offset = text.length() - suffix.length();
        return text.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /**
     * Derives the output file name by swapping the final extension for {@code .txt}.
     * Only the last extension is replaced, so {@code a.png.b.jpg} becomes {@code a.png.b.txt}.
     */
    private static String outputFileNameFor(String imageFileName) {
        int dot = imageFileName.lastIndexOf('.');
        String baseName = dot >= 0 ? imageFileName.substring(0, dot) : imageFileName;
        return baseName + ".txt";
    }

    /**
     * Queue consumer: takes image paths from the shared queue until it receives the
     * poison pill or is interrupted, and OCRs each one with its own Tesseract instance.
     */
    private static class ImageProcessingWorker implements Runnable {
        private final String outputDir;
        private final String dataPath;
        private final String language;

        /**
         * @param outputDir directory the {@code .txt} results are written to
         * @param dataPath  Tesseract data directory passed to {@code setDatapath}
         * @param language  language code passed to {@code setLanguage}
         */
        public ImageProcessingWorker(String outputDir, String dataPath, String language) {
            this.outputDir = outputDir;
            this.dataPath = dataPath;
            this.language = language;
        }

        @Override
        public void run() {
            // One Tesseract instance per worker; the instance is never shared between threads.
            Tesseract tesseract = new Tesseract();
            tesseract.setDatapath(dataPath);
            tesseract.setLanguage(language);
            tesseract.setPageSegMode(1); // Automatic page segmentation with OSD
            tesseract.setOcrEngineMode(3); // Default engine mode (based on what is available)

            while (true) {
                Path imagePath;
                try {
                    imagePath = imageQueue.take();
                } catch (InterruptedException e) {
                    logger.log(Level.SEVERE, "Worker thread interrupted", e);
                    Thread.currentThread().interrupt();
                    break;
                }

                if (imagePath == POISON_PILL) {
                    break; // Poison pill received, terminate worker
                }

                processImage(tesseract, imagePath);
            }
        }

        /**
         * OCRs a single image and writes the result. Failures are counted and logged,
         * never propagated, so one bad image cannot take the worker down.
         */
        private void processImage(Tesseract tesseract, Path imagePath) {
            try {
                File imageFile = imagePath.toFile();
                BufferedImage image = ImageIO.read(imageFile);
                if (image == null) {
                    // ImageIO.read returns null when no registered reader can decode the file.
                    throw new IOException("Unsupported or corrupt image file");
                }
                List<Word> words = tesseract.getWords(image, RIL_WORD);

                String outputFileName = outputFileNameFor(imagePath.getFileName().toString());
                Path outputFile = Paths.get(outputDir, outputFileName);
                Files.write(outputFile, extractTextFromWords(words).getBytes(StandardCharsets.UTF_8));

                processedCount.incrementAndGet();
                logger.info("Processed: " + imagePath);
            } catch (Exception e) {
                // Per-image boundary: catch broadly so runtime failures are also counted.
                errorCount.incrementAndGet();
                logger.log(Level.SEVERE, "Error occurred while processing " + imagePath, e);
            }
        }

        /**
         * Joins the text of all recognised words with single spaces.
         *
         * @param words recognised words, in recognition order
         * @return the joined text with surrounding whitespace trimmed; empty if there are no words
         */
        private String extractTextFromWords(List<Word> words) {
            StringBuilder extractedText = new StringBuilder();
            for (Word word : words) {
                extractedText.append(word.getText()).append(" ");
            }
            return extractedText.toString().trim();
        }
    }
}