mail: PDF-Anhaenge als extrahierten Text zurueckgeben
read_attachment extrahiert PDF-Text serverseitig via pdfplumber und liefert ihn als TextContent (statt als EmbeddedResource-Blob, den claude.ai nicht lesen kann). Bei Scan-/Bild-PDFs ohne extrahierbaren Text: Hinweis plus Blob-Fallback. Behebt den Fehler 'Connector kann PDF nicht lesen', z. B. bei Buchungsbestaetigungen. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+23
-1
@@ -3,6 +3,7 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import base64
|
import base64
|
||||||
|
import io
|
||||||
import contextlib
|
import contextlib
|
||||||
import imaplib
|
import imaplib
|
||||||
import mailbox
|
import mailbox
|
||||||
@@ -252,7 +253,7 @@ def read_attachment(
|
|||||||
key: Annotated[str, Field(description="Message key from search results")],
|
key: Annotated[str, Field(description="Message key from search results")],
|
||||||
attachment_index: Annotated[int, Field(description="Attachment number from the read_mail attachment list (1-based)")],
|
attachment_index: Annotated[int, Field(description="Attachment number from the read_mail attachment list (1-based)")],
|
||||||
) -> list[TextContent | ImageContent | EmbeddedResource]:
|
) -> list[TextContent | ImageContent | EmbeddedResource]:
|
||||||
"""Read an email attachment. Images shown inline, documents (PDF/docx) as binary, text directly. Get the index from read_mail."""
|
"""Read an email attachment. Images shown inline, PDFs as extracted text, text directly, other documents as binary. Get the index from read_mail."""
|
||||||
user = get_current_user()
|
user = get_current_user()
|
||||||
if not user:
|
if not user:
|
||||||
return [TextContent(type="text", text="Error: not authenticated")]
|
return [TextContent(type="text", text="Error: not authenticated")]
|
||||||
@@ -275,6 +276,27 @@ def read_attachment(
|
|||||||
return [ImageContent(type="image", data=base64.b64encode(payload).decode(), mimeType=mime)]
|
return [ImageContent(type="image", data=base64.b64encode(payload).decode(), mimeType=mime)]
|
||||||
if mime.startswith("text/"):
|
if mime.startswith("text/"):
|
||||||
return [TextContent(type="text", text=payload.decode("utf-8", errors="replace")[:100000])]
|
return [TextContent(type="text", text=payload.decode("utf-8", errors="replace")[:100000])]
|
||||||
|
if mime == "application/pdf" or att["filename"].lower().endswith(".pdf"):
|
||||||
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
with pdfplumber.open(io.BytesIO(payload)) as pdf:
|
||||||
|
pages = []
|
||||||
|
for i, page in enumerate(pdf.pages, 1):
|
||||||
|
t = page.extract_text() or ""
|
||||||
|
if t.strip():
|
||||||
|
pages.append(f"--- Seite {i} ---\n{t}")
|
||||||
|
text = "\n\n".join(pages)
|
||||||
|
if text.strip():
|
||||||
|
return [TextContent(type="text", text=f"[PDF: {att['filename']}]\n\n{text[:200000]}")]
|
||||||
|
# Kein extrahierbarer Text -> vermutlich Scan/Bild-PDF: als Blob zurueck
|
||||||
|
return [
|
||||||
|
TextContent(type="text", text=f"[PDF '{att['filename']}' enthaelt keinen extrahierbaren Text (vermutlich Scan). Rohdaten folgen.]"),
|
||||||
|
EmbeddedResource(type="resource", resource=BlobResourceContents(
|
||||||
|
uri=f"mail://attachment/{att['filename']}",
|
||||||
|
blob=base64.b64encode(payload).decode(), mimeType=mime)),
|
||||||
|
]
|
||||||
|
except Exception as e:
|
||||||
|
return [TextContent(type="text", text=f"PDF '{att['filename']}' konnte nicht gelesen werden: {e}")]
|
||||||
return [EmbeddedResource(type="resource", resource=BlobResourceContents(
|
return [EmbeddedResource(type="resource", resource=BlobResourceContents(
|
||||||
uri=f"mail://attachment/{att['filename']}",
|
uri=f"mail://attachment/{att['filename']}",
|
||||||
blob=base64.b64encode(payload).decode(), mimeType=mime))]
|
blob=base64.b64encode(payload).decode(), mimeType=mime))]
|
||||||
|
|||||||
Reference in New Issue
Block a user