Coverage for astrocyte/policy/ner_scanner.py: 55%

22 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""NER-based PII scanner — uses spaCy for name, address, and location detection.

2 

3spaCy is an optional dependency. Install with: pip install astrocyte[ner] 

4""" 

5 

6from __future__ import annotations 

7 

8import logging 

9 

10from astrocyte.types import PiiMatch 

11 

logger = logging.getLogger("astrocyte.pii")

# spaCy entity label -> (PII type, redaction placeholder).
# GPE, LOC and FAC are all reported under the single "address" PII type.
_ENTITY_PII_MAP: dict[str, tuple[str, str]] = {
    "PERSON": ("name", "[NAME_REDACTED]"),
    **dict.fromkeys(("GPE", "LOC", "FAC"), ("address", "[ADDRESS_REDACTED]")),
}

21 

22 

class NerPiiScanner:
    """spaCy-based NER scanner for names, addresses, and locations.

    Requires spaCy and a language model (e.g. ``en_core_web_sm``).
    Sync — spaCy inference is CPU-bound, no async needed.
    """

    def __init__(self, model: str = "en_core_web_sm") -> None:
        """Import spaCy and load the pipeline used by :meth:`scan`.

        Args:
            model: Name of the spaCy model passed to ``spacy.load``.

        Raises:
            ImportError: If spaCy cannot be imported, or if ``spacy.load``
                raises ``OSError`` for ``model`` (the original ``OSError``
                is attached as ``__cause__``).
        """
        # Keep the two failure modes in separate ``try`` blocks so that an
        # ImportError raised while *loading the model* is not misreported as
        # "spaCy is not installed".
        try:
            import spacy
        except ImportError:
            raise ImportError(
                "NER PII detection requires spaCy. Install with: pip install astrocyte[ner]"
            ) from None

        try:
            self._nlp = spacy.load(model)
        except OSError as err:
            # Chain the cause: OSError is not exclusively "model missing", and
            # the underlying error is needed to diagnose anything else.
            raise ImportError(
                f"spaCy model '{model}' not found. Install with: python -m spacy download {model}"
            ) from err

    def scan(self, text: str) -> list[PiiMatch]:
        """Detect PERSON, GPE, LOC, FAC entities as PII.

        Args:
            text: Text to run through the loaded spaCy pipeline.

        Returns:
            One ``PiiMatch`` per recognised entity whose label appears in
            ``_ENTITY_PII_MAP``, in the order the entities appear in
            ``doc.ents``. Offsets are the entity's ``start_char`` /
            ``end_char`` within ``text``.
        """
        matches: list[PiiMatch] = []

        for ent in self._nlp(text).ents:
            mapping = _ENTITY_PII_MAP.get(ent.label_)
            if mapping is None:
                # Entity type is not treated as PII.
                continue

            pii_type, replacement = mapping
            matches.append(
                PiiMatch(
                    pii_type=pii_type,
                    start=ent.start_char,
                    end=ent.end_char,
                    matched_text=ent.text,
                    replacement=replacement,
                )
            )

        return matches