Coverage for astrocyte/policy/ner_scanner.py: 55%
22 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
"""NER-based PII scanner — uses spaCy for name, address, and location detection.

spaCy is an optional dependency. Install with: pip install astrocyte[ner]
"""
6from __future__ import annotations
8import logging
10from astrocyte.types import PiiMatch
# Module logger; the name is the fixed string "astrocyte.pii" rather than
# this module's own dotted path.
logger = logging.getLogger("astrocyte.pii")

# spaCy entity label -> (PII type, redaction placeholder).
# GPE, LOC and FAC are all reported as "address" PII and share one placeholder.
_ENTITY_PII_MAP: dict[str, tuple[str, str]] = {
    "PERSON": ("name", "[NAME_REDACTED]"),
    **dict.fromkeys(("GPE", "LOC", "FAC"), ("address", "[ADDRESS_REDACTED]")),
}
class NerPiiScanner:
    """spaCy-based NER for names, addresses, and locations.

    Requires spaCy and a language model (e.g. en_core_web_sm).
    Sync — spaCy inference is CPU-bound, no async needed.
    """

    def __init__(self, model: str = "en_core_web_sm") -> None:
        """Load the spaCy pipeline used for entity detection.

        Args:
            model: Name of the spaCy model to load.

        Raises:
            ImportError: If spaCy is not installed, or if loading ``model``
                fails with ``OSError`` (reported as "model not found").
        """
        # Keep the two failure modes in separate ``try`` blocks: an
        # ImportError raised while *loading the model* must not be
        # relabelled as "spaCy is not installed".
        try:
            import spacy
        except ImportError:
            raise ImportError(
                "NER PII detection requires spaCy. Install with: pip install astrocyte[ner]"
            ) from None

        try:
            self._nlp = spacy.load(model)
        except OSError as err:
            # Chain the original error: the message below says "not found",
            # but any OSError from spacy.load lands here, so keep the real
            # cause visible in the traceback.
            raise ImportError(
                f"spaCy model '{model}' not found. Install with: python -m spacy download {model}"
            ) from err

    def scan(self, text: str) -> list[PiiMatch]:
        """Detect PERSON, GPE, LOC, FAC entities as PII.

        Args:
            text: The text to run through the spaCy pipeline.

        Returns:
            One ``PiiMatch`` per entity whose label appears in
            ``_ENTITY_PII_MAP``, in the order spaCy reports the entities.
            Entities with any other label are ignored.
        """
        doc = self._nlp(text)
        matches: list[PiiMatch] = []

        for ent in doc.ents:
            # Single lookup; labels that are not mapped to a PII type are skipped.
            mapping = _ENTITY_PII_MAP.get(ent.label_)
            if mapping is None:
                continue

            pii_type, replacement = mapping
            matches.append(
                PiiMatch(
                    pii_type=pii_type,
                    start=ent.start_char,
                    end=ent.end_char,
                    matched_text=ent.text,
                    replacement=replacement,
                )
            )

        return matches