Coverage for astrocyte/documents/parsers/__init__.py: 84%
25 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Parser registry + public re-exports.
3Usage:
4 from astrocyte.documents.parsers import ParserRegistry, MarkdownParser
6 registry = ParserRegistry()
7 registry.register(MarkdownParser())
8 parser = registry.pick("notes.md")
9 text = await parser.convert(file_bytes, "notes.md")
10"""
12from __future__ import annotations
14import logging
16from astrocyte.documents.parsers.base import (
17 ConvertResult,
18 Parser,
19 UnsupportedFileTypeError,
20)
21from astrocyte.documents.parsers.markdown import MarkdownParser
23__all__ = [
24 "ConvertResult",
25 "Parser",
26 "UnsupportedFileTypeError",
27 "MarkdownParser",
28 "ParserRegistry",
29]
31logger = logging.getLogger(__name__)
class ParserRegistry:
    """Ordered collection of parsers; lookups return the earliest match.

    Parsers are consulted in the order they were registered, so the
    registration order is also the preference order. Register the more
    specific parsers first (e.g., a custom PDF parser) and the
    catch-all MarkdownParser last.
    """

    def __init__(self) -> None:
        # Kept in registration order; earlier entries win in ``pick``.
        self._parsers: list[Parser] = []

    def register(self, parser: Parser) -> None:
        """Append ``parser`` to the end of the preference order."""
        self._parsers.append(parser)
        logger.debug("ParserRegistry: registered %s", parser.name())

    def pick(self, filename: str, content_type: str | None = None) -> Parser:
        """Select the earliest-registered parser that supports the file.

        Each parser's ``supports(filename, content_type)`` is queried in
        registration order and the first one answering truthily is
        returned.

        Raises ``UnsupportedFileTypeError`` when none of the registered
        parsers supports the file. The same exception type is the one a
        parser's ``convert()`` should raise downstream if it claimed
        support but then could not handle a specific file.
        """
        for candidate in self._parsers:
            if candidate.supports(filename, content_type):
                break
        else:
            # Loop ran to completion (or the registry is empty): no match.
            raise UnsupportedFileTypeError(
                f"No registered parser supports filename={filename!r} content_type={content_type!r}",
            )
        return candidate

    def names(self) -> list[str]:
        """Return each registered parser's ``name()``, in registration order."""
        return [registered.name() for registered in self._parsers]

    def __len__(self) -> int:
        """Return how many parsers are currently registered."""
        return len(self._parsers)
def default_registry() -> ParserRegistry:
    """Build the Phase 1 default registry, which holds only MarkdownParser.

    Phase 6 is planned to register MarkitdownParser and LlamaParseParser
    ahead of MarkdownParser, so that PDF/DOCX files route to the richer
    parsers first while markdown falls through to the catch-all.
    """
    registry = ParserRegistry()
    markdown_fallback = MarkdownParser()
    registry.register(markdown_fallback)
    return registry