Initial commit - Stand 26.04.2026
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
const Tesseract = require('tesseract.js');
|
||||
const path = require('path');
|
||||
|
||||
/**
 * OCR service that extracts text, and for images also monetary amounts,
 * from a document on disk.
 *
 * Images are passed to Tesseract using the German ('deu') language model.
 * PDFs currently receive a fixed placeholder result.
 */
class OCRService {
  /**
   * Extract text from the file at `filePath`, dispatching on its extension.
   *
   * Errors thrown while processing are caught, logged and reported as
   * `{ success: false, error }` instead of being propagated.
   *
   * @param {string} filePath - Path of the file to process.
   * @returns {Promise<{success: boolean, extracted?: object, error?: string}>}
   *   On success `extracted` holds `text` and `type` ('pdf' or 'image');
   *   for images it additionally holds `confidence` and `amounts`.
   */
  async processDocument(filePath) {
    try {
      console.log(`[OCR] Verarbeite: ${filePath}`);

      // Determine the file type from the (case-insensitive) extension.
      const ext = path.extname(filePath).toLowerCase();

      if (ext === '.pdf') {
        // TODO: no real extraction happens yet; a library such as pdf-parse
        // could be used here. Callers currently get placeholder text.
        return {
          success: true,
          extracted: { text: 'PDF Text extrahiert (Platzhalter)', type: 'pdf' }
        };
      }

      // Images: run Tesseract OCR.
      if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'].includes(ext)) {
        const result = await Tesseract.recognize(
          filePath,
          'deu', // German language model
          {
            logger: m => console.log(`[OCR] ${m.status}: ${Math.round(m.progress * 100)}%`)
          }
        );

        // Pull candidate monetary amounts out of the recognised text.
        const amounts = this.extractAmounts(result.data.text);

        return {
          success: true,
          extracted: {
            text: result.data.text,
            confidence: result.data.confidence,
            amounts: amounts,
            type: 'image'
          }
        };
      }

      return {
        success: false,
        error: 'Nicht unterstütztes Dateiformat'
      };
    } catch (error) {
      console.error('[OCR] Fehler:', error);
      return {
        success: false,
        error: error.message
      };
    }
  }

  /**
   * Find monetary amounts in free text and return them as numbers.
   *
   * Recognised notations:
   *   - German, with or without currency sign: `1.234,56`, `1234,56`, `12,50 €`
   *   - English, only when prefixed by `€` or `$`: `€ 1,234.56`, `$ 1234.56`
   *
   * Each amount is matched as a whole (never as the tail of a longer number)
   * and normalised according to the notation it was written in.
   *
   * @param {string} text - Text to scan; non-string or empty input yields `[]`.
   * @returns {number[]} Unique amounts greater than zero, in order of first
   *   appearance in `text`.
   */
  extractAmounts(text) {
    if (typeof text !== 'string' || text.length === 0) {
      return [];
    }

    // Group 1: German notation. The lookbehind stops a match from starting in
    //          the middle of a longer number (e.g. "234,56" inside "1.234,56").
    // Group 2: English notation, accepted only after a currency sign.
    // The trailing (?!\d) rejects values with more than two decimal digits,
    // such as the "1,23" prefix of "1,234.56".
    const amountPattern =
      /(?<![\d.,])(\d{1,3}(?:\.\d{3})+,\d{2}|\d+,\d{2})(?!\d)|[€$]\s*(\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})(?!\d)/g;

    const seen = new Set();
    for (const [, german, english] of text.matchAll(amountPattern)) {
      // Strip the thousands separators of the matched notation and make sure
      // the decimal separator is a dot before parsing.
      const normalized = german !== undefined
        ? german.replace(/\./g, '').replace(',', '.')
        : english.replace(/,/g, '');
      const value = Number.parseFloat(normalized);
      if (value > 0) {
        seen.add(value);
      }
    }

    return [...seen];
  }
}
|
||||
|
||||
// Export one ready-made OCRService instance rather than the class itself.
module.exports = new OCRService();
|
||||
Reference in New Issue
Block a user