Coverage for astrocyte/documents/parsers/markitdown.py: 27%

"""MarkitdownParser — PDF/DOCX/HTML/PPTX → Markdown via markitdown.

Microsoft's markitdown library (Apache 2.0, local, no API key required).
Produces richer section structure than raw text extraction for text-heavy
PDFs: headings are preserved from document structure (PDF bookmarks,
DOCX heading styles) rather than inferred from page boundaries.

Install: pip install markitdown
     or: pip install 'astrocyte[markitdown]'
"""

12from __future__ import annotations

14import os

15import tempfile

16from pathlib import Path

18from astrocyte.documents.parsers.base import Parser, UnsupportedFileTypeError

# File suffixes (lower-case, including the leading dot) that the parser
# accepts when deciding support from a filename.
_SUPPORTED_EXTENSIONS = frozenset(
    (".pdf", ".docx", ".pptx", ".html", ".htm", ".xlsx")
)

# MIME type prefixes that the parser accepts when deciding support from a
# content type. Kept as a tuple so it can be passed to ``str.startswith``.
_SUPPORTED_MIME_PREFIXES = (
    "application/pdf",
    "application/vnd.openxmlformats",
    "application/vnd.ms-",
    "text/html",
    "application/msword",
)

class MarkitdownParser(Parser):
    """PDF/DOCX/HTML/PPTX → Markdown via markitdown (Microsoft, Apache 2.0).

    Local — no network calls, no API key.

    Compared to the pymupdf fallback in the bench harness, markitdown
    preserves document heading structure (PDF bookmarks → markdown headers)
    so build_markdown_tree produces semantically meaningful section
    boundaries rather than one leaf per page.
    """

    def name(self) -> str:
        """Return the identifier of this parser backend (``"markitdown"``)."""
        return "markitdown"

    def supports(self, filename: str, content_type: str | None = None) -> bool:
        """Report whether this parser accepts the given file.

        A matching ``content_type`` prefix is sufficient on its own. When the
        content type is missing or does not match, the decision falls back to
        the (case-insensitive) filename suffix.

        Args:
            filename: Name of the file; only its suffix is inspected.
            content_type: Optional MIME type, compared case-insensitively
                against the supported prefixes.

        Returns:
            True if either the content type or the suffix is supported.
        """
        # str.startswith accepts a tuple of prefixes, so the content type is
        # lower-cased once and tested against every prefix in one call.
        if content_type and content_type.lower().startswith(
            _SUPPORTED_MIME_PREFIXES
        ):
            return True
        if filename:
            return Path(filename).suffix.lower() in _SUPPORTED_EXTENSIONS
        return False

    async def convert(self, file_data: bytes, filename: str) -> str:
        """Convert file bytes to markdown via markitdown.

        Writes to a temp file (markitdown detects format by extension),
        converts, then cleans up. The temp file is always removed even
        on error.

        The temp-file write and the markitdown conversion are synchronous,
        so they run in the event loop's default executor instead of
        blocking the loop for the duration of the parse.

        Args:
            file_data: Raw bytes of the document.
            filename: Original filename; its suffix becomes the temp file's
                suffix (``.pdf`` is used when there is none).

        Returns:
            The markdown text, or an empty string if markitdown produced
            no text content.

        Raises:
            UnsupportedFileTypeError: markitdown is not installed.
            RuntimeError: markitdown could not parse this file.
        """
        # Function-scope import: asyncio is only needed by this method.
        import asyncio

        try:
            from markitdown import MarkItDown  # type: ignore[import-not-found]
        except ImportError as exc:
            raise UnsupportedFileTypeError(
                "markitdown is not installed. "
                "Install: pip install markitdown "
                "or: pip install 'astrocyte[markitdown]'"
            ) from exc

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._convert_blocking, MarkItDown, file_data, filename
        )

    @staticmethod
    def _convert_blocking(converter_cls: type, file_data: bytes, filename: str) -> str:
        """Synchronously write *file_data* to a temp file and convert it."""
        ext = Path(filename).suffix or ".pdf"
        tmp_path: str | None = None
        try:
            # delete=False so the file survives the ``with`` block and can be
            # reopened by path; it is removed explicitly in ``finally``.
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
                tmp.write(file_data)
                tmp_path = tmp.name

            result = converter_cls().convert(tmp_path)
            return result.text_content or ""
        except UnsupportedFileTypeError:
            raise
        except Exception as exc:
            raise RuntimeError(
                f"markitdown failed to convert {filename!r}: {exc}"
            ) from exc
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not
                    # mask the conversion result or the original error.
                    pass