Coverage for astrocyte/documents/parsers/markitdown.py: 27%

41 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""MarkitdownParser — PDF/DOCX/HTML/PPTX → Markdown via markitdown. 

2 

3Microsoft's markitdown library (Apache 2.0, local, no API key required). 

4Produces richer section structure than raw text extraction for text-heavy 

5PDFs: headings are preserved from document structure (PDF bookmarks, 

6DOCX heading styles) rather than inferred from page boundaries. 

7 

8Install: pip install markitdown 

9 or: pip install 'astrocyte[markitdown]' 

10""" 

11 

12from __future__ import annotations 

13 

14import os 

15import tempfile 

16from pathlib import Path 

17 

18from astrocyte.documents.parsers.base import Parser, UnsupportedFileTypeError 

19 

20_SUPPORTED_EXTENSIONS = frozenset({".pdf", ".docx", ".pptx", ".html", ".htm", ".xlsx"}) 

21_SUPPORTED_MIME_PREFIXES = ( 

22 "application/pdf", 

23 "application/vnd.openxmlformats", 

24 "application/vnd.ms-", 

25 "text/html", 

26 "application/msword", 

27) 

28 

29 

class MarkitdownParser(Parser):
    """PDF/DOCX/HTML/PPTX → Markdown via markitdown (Microsoft, Apache 2.0).

    Local — no network calls, no API key.

    Compared to the pymupdf fallback in the bench harness, markitdown
    preserves document heading structure (PDF bookmarks → markdown headers)
    so build_markdown_tree produces semantically meaningful section
    boundaries rather than one leaf per page.
    """

    def name(self) -> str:
        """Return this parser's identifier, ``"markitdown"``."""
        return "markitdown"

    def supports(self, filename: str, content_type: str | None = None) -> bool:
        """Report whether this parser accepts the given file.

        A non-empty ``content_type`` that starts (case-insensitively) with
        one of ``_SUPPORTED_MIME_PREFIXES`` is accepted immediately.
        Otherwise the decision falls back to the lower-cased suffix of
        ``filename`` being in ``_SUPPORTED_EXTENSIONS``.

        Args:
            filename: Name or path of the file; only its suffix is inspected.
            content_type: Optional MIME type of the file.

        Returns:
            True if either the content type or the filename suffix is
            recognised, False otherwise (including when both are empty).
        """
        # str.startswith accepts a tuple of prefixes, so the content type is
        # lower-cased once and tested against every prefix in a single call.
        if content_type and content_type.lower().startswith(_SUPPORTED_MIME_PREFIXES):
            return True
        if filename:
            return Path(filename).suffix.lower() in _SUPPORTED_EXTENSIONS
        return False

    async def convert(self, file_data: bytes, filename: str) -> str:
        """Convert file bytes to markdown via markitdown.

        Writes to a temp file (markitdown detects format by extension),
        converts, then cleans up. The temp file is always removed even
        on error, including a failure while writing ``file_data``.

        Args:
            file_data: Raw bytes of the document to convert.
            filename: Original file name; its suffix becomes the temp
                file's suffix (``.pdf`` is used when there is none).

        Returns:
            The markdown text produced by markitdown, or ``""`` when the
            result's ``text_content`` is empty or None.

        Raises:
            UnsupportedFileTypeError: markitdown is not installed.
            RuntimeError: markitdown could not parse this file.
        """
        # Imported lazily so the module can be imported without the optional
        # markitdown dependency installed.
        try:
            from markitdown import MarkItDown  # type: ignore[import-not-found]
        except ImportError as exc:
            raise UnsupportedFileTypeError(
                "markitdown is not installed. "
                "Install: pip install markitdown "
                "or: pip install 'astrocyte[markitdown]'"
            ) from exc

        ext = Path(filename).suffix or ".pdf"
        tmp_path: str | None = None
        try:
            # delete=False: the file must outlive the ``with`` block so that
            # markitdown can reopen it by path; removal happens in ``finally``.
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
                # Record the path BEFORE writing: if the write fails, the
                # ``finally`` clause below must still know which file to
                # remove, otherwise the temp file would be leaked.
                tmp_path = tmp.name
                tmp.write(file_data)

            # NOTE(review): this call is synchronous, so it runs on the
            # caller's event loop for the whole duration of the conversion.
            result = MarkItDown().convert(tmp_path)
            return result.text_content or ""
        except UnsupportedFileTypeError:
            # Propagate unchanged instead of wrapping it in RuntimeError.
            raise
        except Exception as exc:
            raise RuntimeError(
                f"markitdown failed to convert {filename!r}: {exc}"
            ) from exc
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    # Best-effort cleanup: a failed removal must not mask the
                    # conversion result or the original exception.
                    pass