81 lines
2.3 KiB
JavaScript
81 lines
2.3 KiB
JavaScript
const Tesseract = require('tesseract.js');
|
|
const path = require('path');
|
|
|
|
/**
 * Extracts text (and monetary amounts) from uploaded documents.
 *
 * Images are run through Tesseract OCR with the German language model;
 * PDFs currently return placeholder text only.
 */
class OCRService {
  /**
   * Processes a single document and returns a result object instead of throwing.
   *
   * @param {string} filePath - Path of the file to process; the file type is
   *   derived from its extension (case-insensitive).
   * @returns {Promise<{success: boolean, extracted?: object, error?: string}>}
   *   `success: true` with an `extracted` payload, or `success: false` with an
   *   `error` message for unsupported formats and for any error raised while
   *   processing.
   */
  async processDocument(filePath) {
    try {
      console.log(`[OCR] Verarbeite: ${filePath}`);

      // Detect the file type from the extension.
      const ext = path.extname(filePath).toLowerCase();

      if (ext === '.pdf') {
        // PDFs: text extraction without OCR where possible.
        // NOTE: this is still a placeholder; a PDF text extractor such as
        // pdf-parse could be used here.
        return {
          success: true,
          extracted: { text: 'PDF Text extrahiert (Platzhalter)', type: 'pdf' }
        };
      }

      // Images: Tesseract OCR.
      if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'].includes(ext)) {
        const result = await Tesseract.recognize(
          filePath,
          'deu', // German language model
          {
            logger: m => console.log(`[OCR] ${m.status}: ${Math.round(m.progress * 100)}%`)
          }
        );

        // Pull potential monetary amounts out of the recognised text.
        const amounts = this.extractAmounts(result.data.text);

        return {
          success: true,
          extracted: {
            text: result.data.text,
            confidence: result.data.confidence,
            amounts: amounts,
            type: 'image'
          }
        };
      }

      return {
        success: false,
        error: 'Nicht unterstütztes Dateiformat'
      };
    } catch (error) {
      // Errors are logged and reported through the result object.
      console.error('[OCR] Fehler:', error);
      return {
        success: false,
        error: error.message
      };
    }
  }

  /**
   * Finds monetary amounts with exactly two decimal places in free text.
   *
   * Recognised notations:
   *   - German: `1.234,56` or `1234,56` (currency symbol optional)
   *   - English, only when prefixed by `€` or `$`: `€ 1,234.56` or `€ 1234.56`
   *
   * Each notation is parsed with its own thousands/decimal separators, and
   * fragments of longer numbers (e.g. the `234,56` inside `1234,56`, or the
   * `1,23` inside `1,234.56`) are not reported as amounts.
   *
   * @param {string} text - Text to scan; non-string or empty input yields `[]`.
   * @returns {number[]} Unique amounts greater than zero, in order of first
   *   appearance in the text.
   */
  extractAmounts(text) {
    if (typeof text !== 'string' || text === '') {
      return [];
    }

    const hits = [];

    // Runs one pattern over the text. Group 1 is the integer part (possibly
    // with thousands separators), group 2 the two decimal digits.
    const collect = (pattern, thousandsSeparator, rejectNumberFragments) => {
      let match;
      while ((match = pattern.exec(text)) !== null) {
        if (rejectNumberFragments) {
          // A match directly preceded by a digit, or by a digit plus a
          // separator, is the tail of a longer number and not an amount.
          const preceding = text.slice(Math.max(0, match.index - 2), match.index);
          if (/\d[.,]?$/.test(preceding)) {
            continue;
          }
        }
        const integerPart = match[1].split(thousandsSeparator).join('');
        hits.push({
          index: match.index,
          value: parseFloat(`${integerPart}.${match[2]}`)
        });
      }
    };

    // German notation; `(?!\d)` rejects values with more than two decimals.
    collect(/(\d{1,3}(?:\.\d{3})+|\d+),(\d{2})(?!\d)/g, '.', true);
    // English notation; the mandatory currency prefix already anchors the start.
    collect(/[€$]\s*(\d{1,3}(?:,\d{3})+|\d+)\.(\d{2})(?!\d)/g, ',', false);

    // Report amounts in reading order, then drop zero values and duplicates.
    hits.sort((a, b) => a.index - b.index);
    const values = hits.map(hit => hit.value).filter(value => value > 0);
    return [...new Set(values)];
  }
}
|
|
|
|
// Export a single shared OCRService instance rather than the class itself.
module.exports = new OCRService();
|