Coverage for astrocyte/documents/parsers/markitdown.py: 27%
41 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""MarkitdownParser — PDF/DOCX/HTML/PPTX → Markdown via markitdown.
3Microsoft's markitdown library (Apache 2.0, local, no API key required).
4Produces richer section structure than raw text extraction for text-heavy
5PDFs: headings are preserved from document structure (PDF bookmarks,
6DOCX heading styles) rather than inferred from page boundaries.
8Install: pip install markitdown
9 or: pip install 'astrocyte[markitdown]'
10"""
12from __future__ import annotations
14import os
15import tempfile
16from pathlib import Path
18from astrocyte.documents.parsers.base import Parser, UnsupportedFileTypeError
20_SUPPORTED_EXTENSIONS = frozenset({".pdf", ".docx", ".pptx", ".html", ".htm", ".xlsx"})
21_SUPPORTED_MIME_PREFIXES = (
22 "application/pdf",
23 "application/vnd.openxmlformats",
24 "application/vnd.ms-",
25 "text/html",
26 "application/msword",
27)
class MarkitdownParser(Parser):
    """PDF/DOCX/HTML/PPTX → Markdown via markitdown (Microsoft, Apache 2.0).

    Local — no network calls, no API key.

    Compared to the pymupdf fallback in the bench harness, markitdown
    preserves document heading structure (PDF bookmarks → markdown headers)
    so build_markdown_tree produces semantically meaningful section
    boundaries rather than one leaf per page.
    """

    def name(self) -> str:
        """Return this parser's identifier, ``"markitdown"``."""
        return "markitdown"

    def supports(self, filename: str, content_type: str | None = None) -> bool:
        """Report whether this parser accepts the given file.

        Args:
            filename: Name of the file; only its extension is inspected.
            content_type: Optional MIME type. Compared case-insensitively
                against the supported MIME prefixes.

        Returns:
            True if ``content_type`` starts with a supported MIME prefix.
            Otherwise, if ``filename`` is non-empty, whether its lower-cased
            extension is a supported one. False when neither applies.
        """
        # str.startswith accepts a tuple of prefixes, so one call covers
        # every supported MIME prefix.
        if content_type and content_type.lower().startswith(
            _SUPPORTED_MIME_PREFIXES
        ):
            return True
        # An unrecognised (or missing) content type is not a rejection:
        # fall back to the file extension.
        if filename:
            return Path(filename).suffix.lower() in _SUPPORTED_EXTENSIONS
        return False

    async def convert(self, file_data: bytes, filename: str) -> str:
        """Convert file bytes to markdown via markitdown.

        Writes to a temp file (markitdown detects format by extension),
        converts, then cleans up. The temp file is always removed even
        on error, including a failure while writing ``file_data``.

        Args:
            file_data: Raw bytes of the document to convert.
            filename: Original file name; its extension becomes the temp
                file's suffix (``.pdf`` is used when there is none).

        Returns:
            The converted markdown text, or ``""`` when markitdown yields
            no text content.

        Raises:
            UnsupportedFileTypeError: markitdown is not installed.
            RuntimeError: markitdown could not parse this file.
        """
        # Imported lazily so the module can be imported without the
        # optional markitdown dependency installed.
        try:
            from markitdown import MarkItDown  # type: ignore[import-not-found]
        except ImportError as exc:
            raise UnsupportedFileTypeError(
                "markitdown is not installed. "
                "Install: pip install markitdown "
                "or: pip install 'astrocyte[markitdown]'"
            ) from exc

        ext = Path(filename).suffix or ".pdf"
        tmp_path: str | None = None
        try:
            # delete=False: the file must outlive the ``with`` block so that
            # markitdown can open it by path after it has been closed.
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
                # Record the path BEFORE writing. If the write fails (e.g.
                # disk full) the ``finally`` clause below can still unlink
                # the file instead of leaking it.
                tmp_path = tmp.name
                tmp.write(file_data)

            # NOTE(review): md.convert() is called synchronously inside this
            # coroutine, so the event loop is blocked while it runs. Consider
            # offloading to a worker thread if large documents are expected.
            md = MarkItDown()
            result = md.convert(tmp_path)
            return result.text_content or ""
        except UnsupportedFileTypeError:
            # Let this pass through unchanged rather than being wrapped in
            # RuntimeError by the generic handler below.
            raise
        except Exception as exc:
            raise RuntimeError(
                f"markitdown failed to convert {filename!r}: {exc}"
            ) from exc
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    # Best-effort cleanup: a failed unlink must not mask the
                    # conversion result or the original exception.
                    pass