Coverage for astrocyte/documents/parsers/base.py: 94%
16 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Parser ABC — converts raw file bytes to markdown.
3Hindsight-style abstract base. Concrete parsers (Markdown, Markitdown,
4LlamaParse) implement ``convert(bytes, filename) -> markdown text``.
5Their output feeds the Document Engine's tree builders.
7This is the layer that handles "user uploaded a file" → "we have text we
8can parse into a tree." Markdown parsers pass through; PDF/DOCX/HTML
9parsers extract text first.
10"""
12from __future__ import annotations
14from abc import ABC, abstractmethod
15from dataclasses import dataclass
class UnsupportedFileTypeError(Exception):
    """Signals that a Parser was handed a file type it cannot handle."""
@dataclass
class ConvertResult:
    """Outcome of a file → markdown conversion that succeeded."""

    # The converted text (presumably the markdown itself — confirm with callers).
    content: str
    # Name of the parser that did the conversion (presumably what
    # ``Parser.name()`` returns — confirm with callers).
    parser_name: str
    # MIME type of ``content``; markdown unless the caller says otherwise.
    mime_type: str = "text/markdown"
class Parser(ABC):
    """Common interface for components that turn a file into markdown.

    A concrete parser has to provide both ``convert()`` and ``name()``;
    they are abstract here. ``supports()`` ships with a permissive default
    (always ``True``) and only needs overriding when a parser can rule
    files out locally, e.g. by extension. A parser that hands the work to
    a remote service should keep that default and signal rejection by
    raising ``UnsupportedFileTypeError`` from ``convert()``.
    """

    @abstractmethod
    async def convert(self, file_data: bytes, filename: str) -> str:
        """Produce markdown text from the raw bytes of a file.

        Args:
            file_data: The file's contents as raw bytes.
            filename: The name the file originally had; its extension is
                what type detection relies on.

        Returns:
            The markdown content, as a string.

        Raises:
            UnsupportedFileTypeError: The file type is not one this parser
                can handle.
            RuntimeError: Parsing failed for any other reason.
        """

    @abstractmethod
    def name(self) -> str:
        """Return the parser's short identifier, e.g. ``"markdown"``."""

    def supports(self, filename: str, content_type: str | None = None) -> bool:
        """Cheaply answer whether this parser can take the given file.

        The base implementation ignores both arguments and says yes to
        everything: a parser is assumed capable until ``convert()`` raises
        ``UnsupportedFileTypeError``. Subclasses that can decide statically
        (for instance from the extension) should override this.
        """
        return True