🏠 Home | 📚 Documentation | 📋 Examples | 🔌 API | 💻 CLI |
This document provides practical examples of how to use InvOCR for various document processing tasks.
# Clone the repository and change into the project directory
git clone https://github.com/fin-officer/invocr.git
cd invocr
# Install the project's dependencies with Poetry
poetry install
# Install Tesseract OCR with the Polish and English language packs (Ubuntu/Debian)
sudo apt-get install tesseract-ocr tesseract-ocr-pol tesseract-ocr-eng
from invocr.core import PDFProcessor
# Initialize the processor
processor = PDFProcessor()

# Process a single PDF; the call returns a (success, error) pair
success, error = processor.process_pdf("invoice.pdf", "output")
if success:
    print("PDF processed successfully!")
else:
    print(f"Error: {error}")
from invocr.core.pdf_processor import PDFProcessor
processor = PDFProcessor()

# extract_text returns a (text, error) pair
text, error = processor.extract_text("document.pdf")
if text:
    print(text[:500])  # Print first 500 characters
elif error:
    # Report the failure instead of silently printing nothing
    print(f"Error: {error}")
from invocr.utils.validation import is_valid_pdf, is_valid_pdf_simple
# Simple check (fast)
if is_valid_pdf_simple("document.pdf"):
    print("File appears to be a valid PDF")

# Detailed validation; returns an (is_valid, error) pair
is_valid, error = is_valid_pdf("document.pdf", min_size=1024)  # 1KB minimum
if not is_valid:
    print(f"Invalid PDF: {error}")
from invocr.core.ocr import create_ocr_engine
# Build an OCR engine for the languages to recognize
languages = ["en", "pl", "de"]
engine = create_ocr_engine(languages)

# Run OCR on the image and print the "text" entry of the result
ocr_output = engine.extract_text("receipt.jpg")
print(ocr_output["text"])
from invocr.core import PDFProcessor
# Run the processor over the invoices/ directory, writing to output/
batch_processor = PDFProcessor()
summary = batch_processor.process_directory("invoices/", "output/")

# Report the "succeeded" and "failed" entries of the returned summary
print(f"Processed: {summary['succeeded']} files")
print(f"Failed: {summary['failed']} files")
# Start the API app with auto-reload, listening on all interfaces at port 8000
uvicorn invocr.api.main:app --reload --host 0.0.0.0 --port 8000
# POST invoice.pdf as a multipart form upload to the /api/v1/process endpoint
curl -X 'POST' \
'http://localhost:8000/api/v1/process' \
-H 'accept: application/json' \
-H 'Content-Type: multipart/form-data' \
-F 'file=@invoice.pdf;type=application/pdf'
from pathlib import Path
from invocr.core import PDFProcessor
from invocr.core.ocr import create_ocr_engine
from invocr.core.extractor import create_extractor
def custom_pipeline(pdf_path, output_dir):
    """Extract text from a PDF, parse invoice data from it, and save it as JSON.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory that receives ``<pdf stem>.json``; it is
            created if it does not exist yet.

    Returns:
        ``(True, output_path)`` on success, or ``(False, message)`` when
        text extraction reports an error.
    """
    # Local import keeps this example runnable on its own.
    import json

    # Initialize components
    pdf_processor = PDFProcessor()
    # NOTE: the engine is constructed here but not used by the steps below.
    ocr_engine = create_ocr_engine(["en", "pl"])
    extractor = create_extractor()

    # Process PDF
    text, error = pdf_processor.extract_text(pdf_path)
    if error:
        return False, f"Text extraction failed: {error}"

    # Extract structured data
    data = extractor.extract_invoice_data(text)

    # Save results; make sure the target directory exists before writing
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    output_path = target_dir / f"{Path(pdf_path).stem}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    return True, str(output_path)
See detailed validation examples in validation_examples.md
# Install system dependencies: Tesseract OCR and its Polish language pack
sudo apt-get install tesseract-ocr tesseract-ocr-pol
Use `is_valid_pdf_simple()` for quick validation before full processing.