Coverage for astrocyte/documents/parsers/base.py: 94%

16 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Parser ABC — converts raw file bytes to markdown. 

2 

3Hindsight-style abstract base. Concrete parsers (Markdown, Markitdown, 

4LlamaParse) implement ``convert(bytes, filename) -> markdown text``. 

5Their output feeds the Document Engine's tree builders. 

6 

7This is the layer that handles "user uploaded a file" → "we have text we 

8can parse into a tree." Markdown parsers pass through; PDF/DOCX/HTML 

9parsers extract text first. 

10""" 

11 

12from __future__ import annotations 

13 

14from abc import ABC, abstractmethod 

15from dataclasses import dataclass 

16 

17 

class UnsupportedFileTypeError(Exception):
    """Signal that a Parser cannot handle the file type it was given."""

20 

21 

@dataclass
class ConvertResult:
    """Value object describing a successful file → markdown conversion."""

    # The converted text (presumably markdown, per mime_type's default).
    content: str
    # Identifier of the parser that produced the content; presumably the
    # value of that parser's ``name()`` -- confirm against callers.
    parser_name: str
    # MIME type of ``content``; markdown unless the producer says otherwise.
    mime_type: str = "text/markdown"

29 

30 

class Parser(ABC):
    """Interface every file → markdown parser has to satisfy.

    Concrete parsers are required to provide ``convert()`` and ``name()``.
    ``supports()`` ships with a permissive default (always True); override
    it when a parser can rule files out locally, e.g. by extension.
    Parsers backed by a remote service should keep the default and signal
    rejection by raising ``UnsupportedFileTypeError`` from ``convert()``.
    """

    @abstractmethod
    async def convert(self, file_data: bytes, filename: str) -> str:
        """Turn the raw bytes of a file into markdown text.

        Args:
            file_data: the file's raw bytes.
            filename: the original filename; its extension is used for
                type detection.

        Returns:
            The markdown content as a string.

        Raises:
            UnsupportedFileTypeError: the file type is not handled by
                this parser.
            RuntimeError: parsing failed for any other reason.
        """

    @abstractmethod
    def name(self) -> str:
        """Return the parser's short identifier, e.g. ``"markdown"`` or ``"markitdown"``."""

    def supports(self, filename: str, content_type: str | None = None) -> bool:
        """Cheap pre-check: is this parser able to handle the file?

        The base implementation ignores both arguments and answers True,
        i.e. the parser is assumed to handle everything until ``convert()``
        raises ``UnsupportedFileTypeError``. Override to filter statically
        by extension.
        """
        return True