-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathImageToTextExtractor.java
More file actions
129 lines (113 loc) · 5.35 KB
/
ImageToTextExtractor.java
File metadata and controls
129 lines (113 loc) · 5.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;
import javax.imageio.ImageIO;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;
/**
 * Batch OCR tool: walks an image directory, runs Tesseract (through Tess4J) on every
 * PNG/JPG file found, and writes the recognized words of each image to a {@code .txt}
 * file in the output directory.
 *
 * <p>Work is distributed through a shared queue: the main thread enqueues image paths,
 * a fixed set of workers consumes them, and one sentinel entry per worker signals
 * "no more work".
 *
 * <p>NOTE(review): output files are named after the image's file name only, so images
 * with the same base name in different sub-directories (or {@code a.png} and
 * {@code a.jpg}) write to the same output file. Confirm whether that is acceptable.
 */
public class ImageToTextExtractor {
    private static final Logger logger = Logger.getLogger(ImageToTextExtractor.class.getName());

    /** File-name suffixes (matched case-insensitively) that are treated as images. */
    private static final String[] IMAGE_EXTENSIONS = {".png", ".jpg"};
    private static final String OUTPUT_EXTENSION = ".txt";

    /**
     * End-of-work marker placed on the queue once per worker. It is recognized by
     * identity ({@code ==}); a dedicated instance is required because
     * {@link LinkedBlockingQueue} rejects {@code null} elements.
     */
    private static final Path POISON_PILL = Paths.get("");

    /** Sampled once so the number of workers and the number of sentinels always match. */
    private static final int WORKER_COUNT = Runtime.getRuntime().availableProcessors();

    private static final AtomicInteger processedCount = new AtomicInteger();
    private static final AtomicInteger errorCount = new AtomicInteger();
    private static final ExecutorService executor = Executors.newFixedThreadPool(WORKER_COUNT);
    private static final BlockingQueue<Path> imageQueue = new LinkedBlockingQueue<>();

    /**
     * Runs the extraction.
     *
     * @param args optional positional overrides: {@code [0]} image directory,
     *             {@code [1]} output directory, {@code [2]} tessdata directory,
     *             {@code [3]} OCR language code; any missing argument falls back to
     *             the built-in default
     */
    public static void main(String[] args) {
        String imagePath = argOrDefault(args, 0, "/path/to/images");
        String outputPath = argOrDefault(args, 1, "/path/to/output");
        String dataPath = argOrDefault(args, 2, "/path/to/tessdata");
        String language = argOrDefault(args, 3, "eng"); // Language code for OCR

        try {
            // Every per-image write would fail if the target directory were missing.
            Files.createDirectories(Paths.get(outputPath));

            // Start worker threads
            for (int i = 0; i < WORKER_COUNT; i++) {
                executor.execute(new ImageProcessingWorker(outputPath, dataPath, language));
            }

            try {
                enqueueImages(Paths.get(imagePath));
            } finally {
                // Always release the workers, even when the directory walk failed;
                // otherwise they would block on the queue forever and keep the JVM alive.
                // offer() never blocks on this unbounded queue and ignores interruption.
                for (int i = 0; i < WORKER_COUNT; i++) {
                    imageQueue.offer(POISON_PILL);
                }
            }

            // Wait for all tasks to complete
            executor.shutdown();
            boolean terminated = executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            if (terminated) {
                logger.info("Text extraction completed. Processed " + processedCount.get() + " files.");
                if (errorCount.get() > 0) {
                    logger.severe("Errors occurred during processing " + errorCount.get() + " files.");
                }
            } else {
                logger.severe("Text extraction was interrupted.");
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error occurred: " + e.getMessage(), e);
        } catch (InterruptedException e) {
            logger.log(Level.SEVERE, "Error occurred: " + e.getMessage(), e);
            Thread.currentThread().interrupt(); // preserve the interrupt for the caller
        } finally {
            // Idempotent; covers the paths that fail before the regular shutdown above.
            executor.shutdown();
        }
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

    /**
     * Walks {@code root} recursively and queues every regular file with an image extension.
     *
     * @throws IOException          if the directory tree cannot be traversed
     * @throws InterruptedException if the calling thread is interrupted while queueing
     */
    private static void enqueueImages(Path root) throws IOException, InterruptedException {
        // Files.walk holds open directory handles until the stream is closed.
        try (Stream<Path> tree = Files.walk(root)) {
            Iterator<Path> images = tree
                    .filter(Files::isRegularFile)
                    .filter(ImageToTextExtractor::hasImageExtension)
                    .iterator();
            while (images.hasNext()) {
                imageQueue.put(images.next());
            }
        } catch (UncheckedIOException e) {
            // Lazy traversal reports I/O failures unchecked; surface them as IOException.
            throw e.getCause();
        }
    }

    /** Tells whether the file name of {@code path} ends with one of {@link #IMAGE_EXTENSIONS}. */
    private static boolean hasImageExtension(Path path) {
        Path fileName = path.getFileName();
        if (fileName == null) {
            return false;
        }
        String name = fileName.toString();
        for (String extension : IMAGE_EXTENSIONS) {
            if (endsWithIgnoreCase(name, extension)) {
                return true;
            }
        }
        return false;
    }

    /** Case-insensitive {@link String#endsWith(String)}. */
    private static boolean endsWithIgnoreCase(String text, String suffix) {
        int offset = text.length() - suffix.length();
        return offset >= 0 && text.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /**
     * Derives the output file name by replacing only the trailing image extension with
     * {@code .txt}; occurrences of ".png"/".jpg" elsewhere in the name are left alone.
     */
    private static String toOutputFileName(String imageFileName) {
        for (String extension : IMAGE_EXTENSIONS) {
            if (endsWithIgnoreCase(imageFileName, extension)) {
                int stemLength = imageFileName.length() - extension.length();
                return imageFileName.substring(0, stemLength) + OUTPUT_EXTENSION;
            }
        }
        return imageFileName + OUTPUT_EXTENSION;
    }

    /**
     * Queue consumer: owns one {@link Tesseract} instance and processes image paths
     * until it takes the end-of-work marker or is interrupted.
     */
    private static class ImageProcessingWorker implements Runnable {
        private final String outputDir;
        private final String dataPath;
        private final String language;

        /**
         * @param outputDir directory the {@code .txt} results are written to
         * @param dataPath  Tesseract data (tessdata) directory
         * @param language  Tesseract language code
         */
        public ImageProcessingWorker(String outputDir, String dataPath, String language) {
            this.outputDir = outputDir;
            this.dataPath = dataPath;
            this.language = language;
        }

        @Override
        public void run() {
            Tesseract tesseract = new Tesseract();
            tesseract.setDatapath(dataPath);
            tesseract.setLanguage(language);
            tesseract.setPageSegMode(1); // Automatic page segmentation with OSD
            tesseract.setOcrEngineMode(3); // Tesseract's default engine selection

            while (true) {
                Path imagePath;
                try {
                    imagePath = imageQueue.take();
                } catch (InterruptedException e) {
                    logger.log(Level.SEVERE, "Worker thread interrupted", e);
                    Thread.currentThread().interrupt();
                    break;
                }
                if (imagePath == POISON_PILL) {
                    break; // Poison pill received, terminate worker
                }
                processImage(tesseract, imagePath);
            }
        }

        /**
         * Runs OCR on one image and writes the result as UTF-8 text. Failures are
         * logged and counted so a single bad file does not stop the worker.
         */
        private void processImage(Tesseract tesseract, Path imagePath) {
            try {
                File imageFile = imagePath.toFile();
                BufferedImage image = ImageIO.read(imageFile);
                if (image == null) {
                    // ImageIO.read signals "no reader can decode this" with null, not an exception.
                    throw new IOException("Unsupported or unreadable image: " + imagePath);
                }
                // NOTE(review): confirm the Tess4J version in use provides this
                // single-argument getWords overload.
                List<Word> words = tesseract.getWords(image);
                String outputFileName = toOutputFileName(imagePath.getFileName().toString());
                Path outputFile = Paths.get(outputDir, outputFileName);
                Files.write(outputFile, extractTextFromWords(words).getBytes(StandardCharsets.UTF_8));
                processedCount.incrementAndGet();
                logger.info("Processed: " + imagePath);
            } catch (TesseractException | IOException e) {
                errorCount.incrementAndGet();
                logger.log(Level.SEVERE, "Error occurred while processing " + imagePath, e);
            }
        }

        /** Joins the text of all words with single spaces and trims the result. */
        private String extractTextFromWords(List<Word> words) {
            StringBuilder extractedText = new StringBuilder();
            for (Word word : words) {
                extractedText.append(word.getText()).append(" ");
            }
            return extractedText.toString().trim();
        }
    }
}