diff --git a/mail/server.py b/mail/server.py index 06c90d8..a10a931 100644 --- a/mail/server.py +++ b/mail/server.py @@ -3,6 +3,7 @@ import os import sys import base64 +import io import contextlib import imaplib import mailbox @@ -252,7 +253,7 @@ def read_attachment( key: Annotated[str, Field(description="Message key from search results")], attachment_index: Annotated[int, Field(description="Attachment number from the read_mail attachment list (1-based)")], ) -> list[TextContent | ImageContent | EmbeddedResource]: - """Read an email attachment. Images shown inline, documents (PDF/docx) as binary, text directly. Get the index from read_mail.""" + """Read an email attachment. Images shown inline, PDFs as extracted text, text directly, other documents as binary. Get the index from read_mail.""" user = get_current_user() if not user: return [TextContent(type="text", text="Error: not authenticated")] @@ -275,6 +276,27 @@ def read_attachment( return [ImageContent(type="image", data=base64.b64encode(payload).decode(), mimeType=mime)] if mime.startswith("text/"): return [TextContent(type="text", text=payload.decode("utf-8", errors="replace")[:100000])] + if mime == "application/pdf" or att["filename"].lower().endswith(".pdf"): + try: + import pdfplumber + with pdfplumber.open(io.BytesIO(payload)) as pdf: + pages = [] + for i, page in enumerate(pdf.pages, 1): + t = page.extract_text() or "" + if t.strip(): + pages.append(f"--- Seite {i} ---\n{t}") + text = "\n\n".join(pages) + if text.strip(): + return [TextContent(type="text", text=f"[PDF: {att['filename']}]\n\n{text[:200000]}")] + # Kein extrahierbarer Text -> vermutlich Scan/Bild-PDF: als Blob zurueck + return [ + TextContent(type="text", text=f"[PDF '{att['filename']}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"), + EmbeddedResource(type="resource", resource=BlobResourceContents( + uri=f"mail://attachment/{att['filename']}", + blob=base64.b64encode(payload).decode(), mimeType=mime)), + ] + except Exception as e: + return [TextContent(type="text", text=f"PDF '{att['filename']}' konnte nicht gelesen werden: {e}")] return [EmbeddedResource(type="resource", resource=BlobResourceContents( uri=f"mail://attachment/{att['filename']}", blob=base64.b64encode(payload).decode(), mimeType=mime))]