files: PDF-Dateien als extrahierten Text zurueckgeben

read_file extrahiert PDF-Text serverseitig via pdfplumber (wie mail/server.py) und
liefert ihn als TextContent statt als EmbeddedResource-Blob. Scan-PDFs ohne extrahierbaren Text: Hinweis + Blob-Fallback.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
root
2026-06-17 09:51:11 +02:00
parent bced937a24
commit c41fb89b06
+20 -2
View File
@@ -1,6 +1,6 @@
"""MCP Files Server — browse and read files via WebDAV/oCIS.""" """MCP Files Server — browse and read files via WebDAV/oCIS."""
import os, sys, contextlib, base64 import os, sys, contextlib, base64, io
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
from typing import Annotated from typing import Annotated
@@ -102,7 +102,7 @@ def _guess_mime(path, ct):
def read_file( def read_file(
path: Annotated[str, Field(description="Full file path, e.g. '/Documents/notes.txt', '/report.pdf', '/photo.jpg'")], path: Annotated[str, Field(description="Full file path, e.g. '/Documents/notes.txt', '/report.pdf', '/photo.jpg'")],
) -> list[TextContent | ImageContent | EmbeddedResource]: ) -> list[TextContent | ImageContent | EmbeddedResource]:
"""Read a file. Text files return content directly. Images are displayed inline. Documents (PDF, docx, xlsx, pptx) are passed as binary for the client to process. Max 25 MB.""" """Read a file. Text files return content directly. Images inline. PDFs as extracted text. Other documents (docx, xlsx, pptx) as binary. Max 25 MB."""
user = get_current_user() user = get_current_user()
if not user: return [TextContent(type="text", text="Error: not authenticated")] if not user: return [TextContent(type="text", text="Error: not authenticated")]
r = httpx.get(_dav(user, path), auth=_auth(user), timeout=60) r = httpx.get(_dav(user, path), auth=_auth(user), timeout=60)
@@ -115,6 +115,24 @@ def read_file(
return [ImageContent(type="image", data=base64.b64encode(r.content).decode(), mimeType=ct)] return [ImageContent(type="image", data=base64.b64encode(r.content).decode(), mimeType=ct)]
if any(t in ct for t in TEXT_HINTS): if any(t in ct for t in TEXT_HINTS):
return [TextContent(type="text", text=r.text[:100000])] return [TextContent(type="text", text=r.text[:100000])]
if ct == "application/pdf" or path.lower().endswith(".pdf"):
try:
import pdfplumber
with pdfplumber.open(io.BytesIO(r.content)) as pdf:
pages = []
for i, page in enumerate(pdf.pages, 1):
t = page.extract_text() or ""
if t.strip():
pages.append(f"--- Seite {i} ---\n{t}")
text = "\n\n".join(pages)
if text.strip():
return [TextContent(type="text", text=f"[PDF: {path}]\n\n{text[:200000]}")]
return [
TextContent(type="text", text=f"[PDF '{path}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"),
EmbeddedResource(type="resource", resource=BlobResourceContents(uri=f"file://{path}", blob=base64.b64encode(r.content).decode(), mimeType=ct)),
]
except Exception as e:
return [TextContent(type="text", text=f"PDF '{path}' konnte nicht gelesen werden: {e}")]
try: try:
text = r.content.decode("utf-8") text = r.content.decode("utf-8")
return [TextContent(type="text", text=text[:100000])] return [TextContent(type="text", text=text[:100000])]