feat(pdf): gemeinsames pdfutil — Scan-PDFs als Bild + OCR (Files & Mail)
Scan-/bildbasierte PDFs werden jetzt von Files-MCP (read_file) UND Mail-MCP (read_attachment) ueber das gemeinsame Modul pdfutil.py verarbeitet: Die Seiten werden via PyMuPDF als PNG gerendert (150 dpi, max. 20 Seiten) und zusaetzlich als OCR-Text (tesseract, deu+eng) geliefert. Verschluesselte/kaputte PDFs werden weiterhin graceful behandelt. Deps: pymupdf, pytesseract (+ System-Paket tesseract-ocr). 76 Tests gruen.
This commit is contained in:
+2
-20
@@ -23,6 +23,7 @@ from starlette.routing import Mount
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
from common import get_current_user, OAUTH_ROUTES, BearerAuthMiddleware
|
||||
from pdfutil import pdf_to_content
|
||||
|
||||
from common import load_config as _lc
|
||||
_cfg = _lc()
|
||||
@@ -302,26 +303,7 @@ def read_attachment(
|
||||
if mime.startswith("text/"):
|
||||
return [TextContent(type="text", text=payload.decode("utf-8", errors="replace")[:100000])]
|
||||
if mime == "application/pdf" or att["filename"].lower().endswith(".pdf"):
|
||||
try:
|
||||
import pdfplumber
|
||||
with pdfplumber.open(io.BytesIO(payload)) as pdf:
|
||||
pages = []
|
||||
for i, page in enumerate(pdf.pages, 1):
|
||||
t = page.extract_text() or ""
|
||||
if t.strip():
|
||||
pages.append(f"--- Seite {i} ---\n{t}")
|
||||
text = "\n\n".join(pages)
|
||||
if text.strip():
|
||||
return [TextContent(type="text", text=f"[PDF: {att['filename']}]\n\n{text[:200000]}")]
|
||||
# Kein extrahierbarer Text -> vermutlich Scan/Bild-PDF: als Blob zurueck
|
||||
return [
|
||||
TextContent(type="text", text=f"[PDF '{att['filename']}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"),
|
||||
EmbeddedResource(type="resource", resource=BlobResourceContents(
|
||||
uri=f"mail://attachment/{att['filename']}",
|
||||
blob=base64.b64encode(payload).decode(), mimeType=mime)),
|
||||
]
|
||||
except Exception as e:
|
||||
return [TextContent(type="text", text=f"PDF '{att['filename']}' konnte nicht gelesen werden: {e}")]
|
||||
return pdf_to_content(payload, att["filename"], mime, uri=f"mail://attachment/{att['filename']}")
|
||||
return [EmbeddedResource(type="resource", resource=BlobResourceContents(
|
||||
uri=f"mail://attachment/{att['filename']}",
|
||||
blob=base64.b64encode(payload).decode(), mimeType=mime))]
|
||||
|
||||
Reference in New Issue
Block a user