Coverage for astrocyte/documents/parsers/__init__.py: 84%

25 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Parser registry + public re-exports. 

2 

3Usage: 

4 from astrocyte.documents.parsers import ParserRegistry, MarkdownParser 

5 

6 registry = ParserRegistry() 

7 registry.register(MarkdownParser()) 

8 parser = registry.pick("notes.md") 

9 text = await parser.convert(file_bytes, "notes.md") 

10""" 

11 

12from __future__ import annotations 

13 

14import logging 

15 

16from astrocyte.documents.parsers.base import ( 

17 ConvertResult, 

18 Parser, 

19 UnsupportedFileTypeError, 

20) 

21from astrocyte.documents.parsers.markdown import MarkdownParser 

22 

# Public API of this package module: the names bound by
# ``from astrocyte.documents.parsers import *``. ``default_registry`` is
# listed because it is a public factory defined in this module.
__all__ = [
    "ConvertResult",
    "Parser",
    "UnsupportedFileTypeError",
    "MarkdownParser",
    "ParserRegistry",
    "default_registry",
]

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

32 

33 

class ParserRegistry:
    """Ordered collection of parsers; lookup returns the first that matches.

    Parsers are consulted in the order they were registered, so the
    registration order doubles as the preference order. Register the more
    specific parsers first (e.g., a custom PDF parser) and the catch-all
    MarkdownParser last.
    """

    def __init__(self) -> None:
        """Create a registry with no parsers registered."""
        self._parsers: list[Parser] = []

    def register(self, parser: Parser) -> None:
        """Append ``parser`` to the end of the preference order."""
        self._parsers.append(parser)
        # The name is looked up after the append, so the parser is already
        # registered by the time ``name()`` runs.
        registered_name = parser.name()
        logger.debug("ParserRegistry: registered %s", registered_name)

    def pick(self, filename: str, content_type: str | None = None) -> Parser:
        """Return the earliest-registered parser whose ``supports()`` accepts the file.

        Both ``filename`` and ``content_type`` are passed through to each
        parser's ``supports()`` in registration order.

        Raises ``UnsupportedFileTypeError`` when no registered parser
        accepts the file. ``UnsupportedFileTypeError`` is also the right
        exception for ``convert()`` to raise downstream when a parser
        claimed support but then couldn't handle a specific file.
        """
        for candidate in self._parsers:
            if not candidate.supports(filename, content_type):
                continue
            return candidate

        message = f"No registered parser supports filename={filename!r} content_type={content_type!r}"
        raise UnsupportedFileTypeError(message)

    def names(self) -> list[str]:
        """Return each registered parser's ``name()``, in registration order."""
        return [registered.name() for registered in self._parsers]

    def __len__(self) -> int:
        """Return how many parsers are registered."""
        count = len(self._parsers)
        return count

69 

70 

def default_registry() -> ParserRegistry:
    """Build a fresh registry holding a single MarkdownParser (Phase 1).

    Phase 6 will add MarkitdownParser + LlamaParseParser ahead of
    MarkdownParser so PDF/DOCX files route to richer parsers first
    and markdown falls through to the catch-all.
    """
    registry = ParserRegistry()
    markdown_parser = MarkdownParser()
    registry.register(markdown_parser)
    return registry