diff --git a/files/server.py b/files/server.py index 1e0d40f..505c497 100644 --- a/files/server.py +++ b/files/server.py @@ -1,6 +1,6 @@ """MCP Files Server — browse and read files via WebDAV/oCIS.""" -import os, sys, contextlib, base64 +import os, sys, contextlib, base64, io from xml.etree import ElementTree as ET from typing import Annotated @@ -102,7 +102,7 @@ def _guess_mime(path, ct): def read_file( path: Annotated[str, Field(description="Full file path, e.g. '/Documents/notes.txt', '/report.pdf', '/photo.jpg'")], ) -> list[TextContent | ImageContent | EmbeddedResource]: - """Read a file. Text files return content directly. Images are displayed inline. Documents (PDF, docx, xlsx, pptx) are passed as binary for the client to process. Max 25 MB.""" + """Read a file. Text files return content directly. Images inline. PDFs as extracted text. Other documents (docx, xlsx, pptx) as binary. Max 25 MB.""" user = get_current_user() if not user: return [TextContent(type="text", text="Error: not authenticated")] r = httpx.get(_dav(user, path), auth=_auth(user), timeout=60) @@ -115,6 +115,24 @@ def read_file( return [ImageContent(type="image", data=base64.b64encode(r.content).decode(), mimeType=ct)] if any(t in ct for t in TEXT_HINTS): return [TextContent(type="text", text=r.text[:100000])] + if ct == "application/pdf" or path.lower().endswith(".pdf"): + try: + import pdfplumber + with pdfplumber.open(io.BytesIO(r.content)) as pdf: + pages = [] + for i, page in enumerate(pdf.pages, 1): + t = page.extract_text() or "" + if t.strip(): + pages.append(f"--- Seite {i} ---\n{t}") + text = "\n\n".join(pages) + if text.strip(): + return [TextContent(type="text", text=f"[PDF: {path}]\n\n{text[:200000]}")] + return [ + TextContent(type="text", text=f"[PDF '{path}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"), + EmbeddedResource(type="resource", resource=BlobResourceContents(uri=f"file://{path}", blob=base64.b64encode(r.content).decode(), mimeType=ct)), + ] + except Exception as e: + return [TextContent(type="text", text=f"PDF '{path}' konnte nicht gelesen werden: {e}")] try: text = r.content.decode("utf-8") return [TextContent(type="text", text=text[:100000])]