files: PDF-Dateien als extrahierten Text zurueckgeben

read_file extrahiert PDF-Text serverseitig via pdfplumber (wie mail/server.py) und liefert TextContent statt eines EmbeddedResource-Blobs. Scan-PDFs ohne extrahierbaren Text: Hinweis + Blob-Fallback. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 09:51:11 +02:00
parent bced937a24
commit c41fb89b06
1 changed file with 20 additions and 2 deletions
@@ -1,6 +1,6 @@
 """MCP Files Server — browse and read files via WebDAV/oCIS."""
-import os, sys, contextlib, base64
+import os, sys, contextlib, base64, io
 from xml.etree import ElementTree as ET
 from typing import Annotated
@@ -102,7 +102,7 @@ def _guess_mime(path, ct):
 def read_file(
    path: Annotated[str, Field(description="Full file path, e.g. '/Documents/notes.txt', '/report.pdf', '/photo.jpg'")],
 ) -> list[TextContent | ImageContent | EmbeddedResource]:
-    """Read a file. Text files return content directly. Images are displayed inline. Documents (PDF, docx, xlsx, pptx) are passed as binary for the client to process. Max 25 MB."""
+    """Read a file. Text files return content directly. Images inline. PDFs as extracted text. Other documents (docx, xlsx, pptx) as binary. Max 25 MB."""
    user = get_current_user()
    if not user: return [TextContent(type="text", text="Error: not authenticated")]
    r = httpx.get(_dav(user, path), auth=_auth(user), timeout=60)
@@ -115,6 +115,24 @@ def read_file(
        return [ImageContent(type="image", data=base64.b64encode(r.content).decode(), mimeType=ct)]
    if any(t in ct for t in TEXT_HINTS):
        return [TextContent(type="text", text=r.text[:100000])]
    if ct == "application/pdf" or path.lower().endswith(".pdf"):
        try:
            import pdfplumber
            with pdfplumber.open(io.BytesIO(r.content)) as pdf:
                pages = []
                for i, page in enumerate(pdf.pages, 1):
                    t = page.extract_text() or ""
                    if t.strip():
                        pages.append(f"--- Seite {i} ---\n{t}")
                text = "\n\n".join(pages)
            if text.strip():
                return [TextContent(type="text", text=f"[PDF: {path}]\n\n{text[:200000]}")]
            return [
                TextContent(type="text", text=f"[PDF '{path}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"),
                EmbeddedResource(type="resource", resource=BlobResourceContents(uri=f"file://{path}", blob=base64.b64encode(r.content).decode(), mimeType=ct)),
            ]
        except Exception as e:
            return [TextContent(type="text", text=f"PDF '{path}' konnte nicht gelesen werden: {e}")]
    try:
        text = r.content.decode("utf-8")
        return [TextContent(type="text", text=text[:100000])]