files: PDF-Dateien als extrahierten Text zurueckgeben
read_file extrahiert PDF-Text serverseitig via pdfplumber (wie mail/server.py), liefert TextContent statt EmbeddedResource-Blob.
Scan-PDFs: Hinweis + Blob-Fallback.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+20
-2
@@ -1,6 +1,6 @@
|
|||||||
"""MCP Files Server — browse and read files via WebDAV/oCIS."""
|
"""MCP Files Server — browse and read files via WebDAV/oCIS."""
|
||||||
|
|
||||||
import os, sys, contextlib, base64
|
import os, sys, contextlib, base64, io
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
from typing import Annotated
|
from typing import Annotated
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ def _guess_mime(path, ct):
|
|||||||
def read_file(
|
def read_file(
|
||||||
path: Annotated[str, Field(description="Full file path, e.g. '/Documents/notes.txt', '/report.pdf', '/photo.jpg'")],
|
path: Annotated[str, Field(description="Full file path, e.g. '/Documents/notes.txt', '/report.pdf', '/photo.jpg'")],
|
||||||
) -> list[TextContent | ImageContent | EmbeddedResource]:
|
) -> list[TextContent | ImageContent | EmbeddedResource]:
|
||||||
"""Read a file. Text files return content directly. Images are displayed inline. Documents (PDF, docx, xlsx, pptx) are passed as binary for the client to process. Max 25 MB."""
|
"""Read a file. Text files return content directly. Images inline. PDFs as extracted text. Other documents (docx, xlsx, pptx) as binary. Max 25 MB."""
|
||||||
user = get_current_user()
|
user = get_current_user()
|
||||||
if not user: return [TextContent(type="text", text="Error: not authenticated")]
|
if not user: return [TextContent(type="text", text="Error: not authenticated")]
|
||||||
r = httpx.get(_dav(user, path), auth=_auth(user), timeout=60)
|
r = httpx.get(_dav(user, path), auth=_auth(user), timeout=60)
|
||||||
@@ -115,6 +115,24 @@ def read_file(
|
|||||||
return [ImageContent(type="image", data=base64.b64encode(r.content).decode(), mimeType=ct)]
|
return [ImageContent(type="image", data=base64.b64encode(r.content).decode(), mimeType=ct)]
|
||||||
if any(t in ct for t in TEXT_HINTS):
|
if any(t in ct for t in TEXT_HINTS):
|
||||||
return [TextContent(type="text", text=r.text[:100000])]
|
return [TextContent(type="text", text=r.text[:100000])]
|
||||||
|
if ct == "application/pdf" or path.lower().endswith(".pdf"):
|
||||||
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
with pdfplumber.open(io.BytesIO(r.content)) as pdf:
|
||||||
|
pages = []
|
||||||
|
for i, page in enumerate(pdf.pages, 1):
|
||||||
|
t = page.extract_text() or ""
|
||||||
|
if t.strip():
|
||||||
|
pages.append(f"--- Seite {i} ---\n{t}")
|
||||||
|
text = "\n\n".join(pages)
|
||||||
|
if text.strip():
|
||||||
|
return [TextContent(type="text", text=f"[PDF: {path}]\n\n{text[:200000]}")]
|
||||||
|
return [
|
||||||
|
TextContent(type="text", text=f"[PDF '{path}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"),
|
||||||
|
EmbeddedResource(type="resource", resource=BlobResourceContents(uri=f"file://{path}", blob=base64.b64encode(r.content).decode(), mimeType=ct)),
|
||||||
|
]
|
||||||
|
except Exception as e:
|
||||||
|
return [TextContent(type="text", text=f"PDF '{path}' konnte nicht gelesen werden: {e}")]
|
||||||
try:
|
try:
|
||||||
text = r.content.decode("utf-8")
|
text = r.content.decode("utf-8")
|
||||||
return [TextContent(type="text", text=text[:100000])]
|
return [TextContent(type="text", text=text[:100000])]
|
||||||
|
|||||||
Reference in New Issue
Block a user