feat(pdf): gemeinsames pdfutil — Scan-PDFs als Bild + OCR (Files & Mail)
Scan-/bildbasierte PDFs werden jetzt von Files-MCP (read_file) UND Mail-MCP (read_attachment) ueber das gemeinsame Modul pdfutil.py verarbeitet: Seiten via PyMuPDF als PNG (150dpi, max 20) + OCR-Text (tesseract deu+eng). Verschluesselte/kaputte PDFs bleiben graceful. Deps: pymupdf, pytesseract (+ system tesseract-ocr). 76 Tests gruen.
This commit is contained in:
+2
-32
@@ -13,6 +13,7 @@ from starlette.routing import Mount
|
|||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||||
from common import get_current_user, OAUTH_ROUTES, BearerAuthMiddleware
|
from common import get_current_user, OAUTH_ROUTES, BearerAuthMiddleware
|
||||||
|
from pdfutil import pdf_to_content
|
||||||
|
|
||||||
from common import load_config as _lc
|
from common import load_config as _lc
|
||||||
_cfg = _lc()
|
_cfg = _lc()
|
||||||
@@ -116,38 +117,7 @@ def read_file(
|
|||||||
if any(t in ct for t in TEXT_HINTS):
|
if any(t in ct for t in TEXT_HINTS):
|
||||||
return [TextContent(type="text", text=r.text[:100000])]
|
return [TextContent(type="text", text=r.text[:100000])]
|
||||||
if ct == "application/pdf" or path.lower().endswith(".pdf"):
|
if ct == "application/pdf" or path.lower().endswith(".pdf"):
|
||||||
try:
|
return pdf_to_content(r.content, path, ct)
|
||||||
import pdfplumber
|
|
||||||
with pdfplumber.open(io.BytesIO(r.content)) as pdf:
|
|
||||||
pages = []
|
|
||||||
for i, page in enumerate(pdf.pages, 1):
|
|
||||||
t = page.extract_text() or ""
|
|
||||||
if t.strip():
|
|
||||||
pages.append(f"--- Seite {i} ---\n{t}")
|
|
||||||
text = "\n\n".join(pages)
|
|
||||||
if text.strip():
|
|
||||||
return [TextContent(type="text", text=f"[PDF: {path}]\n\n{text[:200000]}")]
|
|
||||||
# Kein extrahierbarer Text -> bildbasiert/gescannt: Seiten als Bilder
|
|
||||||
# rendern, damit das LLM sie per Vision lesen kann (statt nutzloser Rohbytes).
|
|
||||||
try:
|
|
||||||
import fitz # PyMuPDF
|
|
||||||
doc = fitz.open(stream=r.content, filetype="pdf")
|
|
||||||
n = doc.page_count
|
|
||||||
MAX_PAGES = 20
|
|
||||||
out = [TextContent(type="text", text=f"[PDF '{path}' ist bildbasiert/gescannt ({n} Seite(n)) — als Bilder gerendert:]")]
|
|
||||||
for i in range(min(n, MAX_PAGES)):
|
|
||||||
pix = doc[i].get_pixmap(dpi=150)
|
|
||||||
out.append(ImageContent(type="image", data=base64.b64encode(pix.tobytes("png")).decode(), mimeType="image/png"))
|
|
||||||
if n > MAX_PAGES:
|
|
||||||
out.append(TextContent(type="text", text=f"[... {n - MAX_PAGES} weitere Seiten ausgelassen (Limit {MAX_PAGES}).]"))
|
|
||||||
return out
|
|
||||||
except Exception as e:
|
|
||||||
return [
|
|
||||||
TextContent(type="text", text=f"[PDF '{path}' ist bildbasiert; Rendern fehlgeschlagen ({e}). Rohdaten folgen.]"),
|
|
||||||
EmbeddedResource(type="resource", resource=BlobResourceContents(uri=f"file://{path}", blob=base64.b64encode(r.content).decode(), mimeType=ct)),
|
|
||||||
]
|
|
||||||
except Exception as e:
|
|
||||||
return [TextContent(type="text", text=f"PDF '{path}' konnte nicht gelesen werden: {e}")]
|
|
||||||
try:
|
try:
|
||||||
text = r.content.decode("utf-8")
|
text = r.content.decode("utf-8")
|
||||||
return [TextContent(type="text", text=text[:100000])]
|
return [TextContent(type="text", text=text[:100000])]
|
||||||
|
|||||||
+2
-20
@@ -23,6 +23,7 @@ from starlette.routing import Mount
|
|||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||||
from common import get_current_user, OAUTH_ROUTES, BearerAuthMiddleware
|
from common import get_current_user, OAUTH_ROUTES, BearerAuthMiddleware
|
||||||
|
from pdfutil import pdf_to_content
|
||||||
|
|
||||||
from common import load_config as _lc
|
from common import load_config as _lc
|
||||||
_cfg = _lc()
|
_cfg = _lc()
|
||||||
@@ -302,26 +303,7 @@ def read_attachment(
|
|||||||
if mime.startswith("text/"):
|
if mime.startswith("text/"):
|
||||||
return [TextContent(type="text", text=payload.decode("utf-8", errors="replace")[:100000])]
|
return [TextContent(type="text", text=payload.decode("utf-8", errors="replace")[:100000])]
|
||||||
if mime == "application/pdf" or att["filename"].lower().endswith(".pdf"):
|
if mime == "application/pdf" or att["filename"].lower().endswith(".pdf"):
|
||||||
try:
|
return pdf_to_content(payload, att["filename"], mime, uri=f"mail://attachment/{att['filename']}")
|
||||||
import pdfplumber
|
|
||||||
with pdfplumber.open(io.BytesIO(payload)) as pdf:
|
|
||||||
pages = []
|
|
||||||
for i, page in enumerate(pdf.pages, 1):
|
|
||||||
t = page.extract_text() or ""
|
|
||||||
if t.strip():
|
|
||||||
pages.append(f"--- Seite {i} ---\n{t}")
|
|
||||||
text = "\n\n".join(pages)
|
|
||||||
if text.strip():
|
|
||||||
return [TextContent(type="text", text=f"[PDF: {att['filename']}]\n\n{text[:200000]}")]
|
|
||||||
# Kein extrahierbarer Text -> vermutlich Scan/Bild-PDF: als Blob zurueck
|
|
||||||
return [
|
|
||||||
TextContent(type="text", text=f"[PDF '{att['filename']}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"),
|
|
||||||
EmbeddedResource(type="resource", resource=BlobResourceContents(
|
|
||||||
uri=f"mail://attachment/{att['filename']}",
|
|
||||||
blob=base64.b64encode(payload).decode(), mimeType=mime)),
|
|
||||||
]
|
|
||||||
except Exception as e:
|
|
||||||
return [TextContent(type="text", text=f"PDF '{att['filename']}' konnte nicht gelesen werden: {e}")]
|
|
||||||
return [EmbeddedResource(type="resource", resource=BlobResourceContents(
|
return [EmbeddedResource(type="resource", resource=BlobResourceContents(
|
||||||
uri=f"mail://attachment/{att['filename']}",
|
uri=f"mail://attachment/{att['filename']}",
|
||||||
blob=base64.b64encode(payload).decode(), mimeType=mime))]
|
blob=base64.b64encode(payload).decode(), mimeType=mime))]
|
||||||
|
|||||||
+70
@@ -0,0 +1,70 @@
|
|||||||
|
"""Gemeinsame PDF-Verarbeitung fuer Files- und Mail-MCP.
|
||||||
|
|
||||||
|
Text-PDFs -> extrahierter Text. Bildbasierte/gescannte PDFs -> Seiten als PNG
|
||||||
|
gerendert (PyMuPDF) + OCR-Text (tesseract), damit das LLM sie per Vision liest
|
||||||
|
UND durchsuchbaren Text bekommt. Verschluesselte/kaputte PDFs -> graceful.
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
|
||||||
|
from mcp.types import TextContent, ImageContent, EmbeddedResource, BlobResourceContents
|
||||||
|
|
||||||
|
DPI = 150
|
||||||
|
MAX_PAGES = 20
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr(png_bytes):
|
||||||
|
try:
|
||||||
|
import pytesseract
|
||||||
|
from PIL import Image
|
||||||
|
return pytesseract.image_to_string(Image.open(io.BytesIO(png_bytes)), lang="deu+eng").strip()
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_content(content, label, ct="application/pdf", uri=None):
    """Convert raw PDF bytes into a list of MCP content items.

    Processing happens in up to three stages:

    1. Extract the text layer with pdfplumber. If any page yields text, a
       single ``TextContent`` with the per-page text (truncated to 200000
       characters) is returned.
    2. Otherwise render up to ``MAX_PAGES`` pages with PyMuPDF as PNG images
       at ``DPI`` and run ``_ocr`` on each of them. The result is a header
       ``TextContent``, an optional ``TextContent`` with the OCR text,
       the ``ImageContent`` items, and a trailing note if pages were omitted.
    3. If rendering fails, a ``TextContent`` describing the error followed by
       an ``EmbeddedResource`` carrying the raw bytes is returned.

    Args:
        content: The PDF file as bytes.
        label: Name shown in the generated messages (file path or filename).
        ct: MIME type attached to the raw-bytes fallback resource.
        uri: URI for the fallback resource; defaults to ``file://<label>``.

    Returns:
        A list of MCP content objects. Exceptions raised during extraction
        or rendering are caught and never propagate to the caller.
    """
    import logging

    log = logging.getLogger(__name__)
    uri = uri or f"file://{label}"

    # 1. Try to extract a real text layer.
    try:
        import pdfplumber

        with pdfplumber.open(io.BytesIO(content)) as pdf:
            pages = []
            for i, page in enumerate(pdf.pages, 1):
                t = page.extract_text() or ""
                if t.strip():
                    pages.append(f"--- Seite {i} ---\n{t}")
        text = "\n\n".join(pages)
        if text.strip():
            return [TextContent(type="text", text=f"[PDF: {label}]\n\n{text[:200000]}")]
    except Exception as exc:
        # Best effort: fall through to rendering, which has its own fallback,
        # but leave a trace instead of discarding the error silently.
        log.debug("pdfplumber text extraction failed for %r: %s", label, exc)

    # 2. Image-based/scanned PDF -> render pages and OCR them.
    try:
        import fitz  # PyMuPDF

        images, ocr_pages = [], []
        # Context manager guarantees the document is closed even if a page
        # fails to render.
        with fitz.open(stream=content, filetype="pdf") as doc:
            n = doc.page_count
            for i in range(min(n, MAX_PAGES)):
                png = doc[i].get_pixmap(dpi=DPI).tobytes("png")
                images.append(ImageContent(type="image", data=base64.b64encode(png).decode(), mimeType="image/png"))
                t = _ocr(png)
                if t:
                    ocr_pages.append(f"--- Seite {i + 1} (OCR) ---\n{t}")

        header = f"[PDF '{label}' ist bildbasiert/gescannt ({n} Seite(n)) — als Bilder gerendert"
        header += " (+ OCR-Text)" if ocr_pages else ""
        header += ":]"
        out = [TextContent(type="text", text=header)]
        if ocr_pages:
            out.append(TextContent(type="text", text="\n\n".join(ocr_pages)[:200000]))
        out += images
        if n > MAX_PAGES:
            out.append(TextContent(type="text", text=f"[... {n - MAX_PAGES} weitere Seiten ausgelassen (Limit {MAX_PAGES}).]"))
        return out
    except Exception as e:
        # 3. Rendering failed (e.g. encrypted or corrupt file): hand back the
        # raw bytes together with the reason.
        return [
            TextContent(type="text", text=f"[PDF '{label}' konnte nicht verarbeitet werden ({e}). Rohdaten folgen.]"),
            EmbeddedResource(type="resource", resource=BlobResourceContents(
                uri=uri, blob=base64.b64encode(content).decode(), mimeType=ct)),
        ]
|
||||||
@@ -5,4 +5,5 @@ openpyxl==3.1.5
|
|||||||
pdfplumber==0.11.9
|
pdfplumber==0.11.9
|
||||||
pillow==12.2.0
|
pillow==12.2.0
|
||||||
PyMuPDF==1.27.2.3
|
PyMuPDF==1.27.2.3
|
||||||
|
pytesseract==0.3.13
|
||||||
python-docx==1.2.0
|
python-docx==1.2.0
|
||||||
|
|||||||
+5
-2
@@ -100,5 +100,8 @@ Frueher gab `read_file` bei Scan-PDFs (kein extrahierbarer Text) nur Rohbytes
|
|||||||
(`EmbeddedResource`) zurueck — claude.ai konnte den Inhalt nicht lesen. Jetzt werden
|
(`EmbeddedResource`) zurueck — claude.ai konnte den Inhalt nicht lesen. Jetzt werden
|
||||||
solche PDFs mit **PyMuPDF** seitenweise als **PNG-Bilder** (150 dpi, max 20 Seiten)
|
solche PDFs mit **PyMuPDF** seitenweise als **PNG-Bilder** (150 dpi, max 20 Seiten)
|
||||||
gerendert und als `ImageContent` zurueckgegeben -> das LLM liest sie per Vision.
|
gerendert und als `ImageContent` zurueckgegeben -> das LLM liest sie per Vision.
|
||||||
Produktiv-Feature (gilt fuer alle User). Test: `TestFileTypes` scanned.pdf -> `image`.
|
Zusaetzlich **OCR** (tesseract deu+eng) -> durchsuchbarer Text neben den Bildern.
|
||||||
Runtime-Dep: `pymupdf` (siehe `requirements-extra.txt`).
|
Gemeinsames Modul `pdfutil.py` wird von Files-MCP (`read_file`) UND Mail-MCP
|
||||||
|
(`read_attachment`) genutzt -> Scan-PDF-Mailanhaenge werden genauso gerendert.
|
||||||
|
Produktiv-Feature (alle User). Test: `TestFileTypes` scanned.pdf -> `image`.
|
||||||
|
Runtime-Deps: `pymupdf`, `pytesseract` + System `tesseract-ocr`/`-deu` (s. `requirements-extra.txt`).
|
||||||
|
|||||||
Reference in New Issue
Block a user