Coverage for astrocyte/documents/parsers/markdown.py: 100%

18 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""MarkdownParser — pass-through for markdown / text input. 

2 

3Decodes UTF-8 bytes to a string and returns the markdown unchanged. 

4The cheapest parser; useful for tests, for inline content, and as the 

5zero-configuration default when ingesting text the caller already has. 

6""" 

7 

8from __future__ import annotations 

9 

10from astrocyte.documents.parsers.base import Parser 

11 

12_MARKDOWN_EXTENSIONS = {".md", ".markdown", ".txt", ".rst"} 

13 

14 

class MarkdownParser(Parser):
    """Pass-through parser for markdown and plain-text input.

    Conversion is nothing more than a UTF-8 decode of the raw bytes; the
    resulting text is returned as-is.
    """

    def name(self) -> str:
        """Return the identifier of this parser: ``"markdown"``."""
        return "markdown"

    def supports(self, filename: str, content_type: str | None = None) -> bool:
        """Report whether this parser claims the given input.

        The input is claimed when any of the following holds:

        * ``content_type`` starts (case-insensitively) with ``text/markdown``
          or ``text/plain``;
        * ``filename`` is empty, i.e. there is nothing else to judge by;
        * ``filename`` ends (case-insensitively) with one of the suffixes in
          ``_MARKDOWN_EXTENSIONS``.
        """
        if content_type:
            mime = content_type.lower()
            if mime.startswith(("text/markdown", "text/plain")):
                return True

        if not filename:
            # Unknown origin: claim it rather than turn it away.
            return True

        # str.endswith accepts a tuple of candidates, so a single call checks
        # every known suffix.
        return filename.lower().endswith(tuple(_MARKDOWN_EXTENSIONS))

    async def convert(self, file_data: bytes, filename: str) -> str:
        """Decode ``file_data`` as UTF-8 and return the text untouched.

        Bytes that are not valid UTF-8 do not raise ``UnicodeDecodeError``;
        the ``surrogateescape`` error handler maps each of them to a lone
        surrogate code point, so a slightly broken file still yields a string.

        ``filename`` is not used by this parser.
        """
        # The error handler is consulted only for undecodable bytes, so
        # well-formed UTF-8 comes out exactly as it would under strict
        # decoding; malformed input gets the escape treatment.
        return file_data.decode("utf-8", errors="surrogateescape")