Source code for matmmextract.shared.sentence_utils

from __future__ import annotations

import re

# Abbreviations that end with a period but must NOT trigger a sentence split.
ABBREV_RE = re.compile(
    r"\b(Fig|Figs|fig|figs|e\.g|i\.e|et al|vs|approx|Dr|Prof|cf|Eq|Eqs|No|Vol|pp)\."
)

# A sentence boundary: whitespace that follows terminal punctuation and
# precedes an uppercase ASCII letter or an opening bracket.
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\(\[])")


def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences, protecting scientific abbreviations.

    Strategy: record the position of every period that belongs to an
    abbreviation matched by ``ABBREV_RE``, then split on whitespace that
    follows ``[.!?]`` and precedes an uppercase letter or opening bracket,
    skipping any boundary whose preceding period is one of the recorded
    positions.

    No placeholder text is substituted into the string, so input that
    happens to contain a marker-like token (e.g. a literal ``<DOT>``) is
    returned unchanged.

    Parameters
    ----------
    text:
        Plain text to split.

    Returns
    -------
    list[str]
        Non-empty sentence strings, each stripped of surrounding whitespace.

    Examples
    --------
    >>> split_sentences("As shown in Fig. 3, the yield is high. See also Fig. 4.")
    ['As shown in Fig. 3, the yield is high.', 'See also Fig. 4.']
    """
    # Indices of every "." that is part of an abbreviation (including the
    # internal dots of "e.g." / "i.e."); these must never end a sentence.
    protected_dots = {
        match.start() + offset
        for match in ABBREV_RE.finditer(text)
        for offset, char in enumerate(match.group(0))
        if char == "."
    }

    pieces = []
    start = 0
    for boundary in _SENTENCE_BOUNDARY_RE.finditer(text):
        # The lookbehind guarantees the character just before the boundary
        # is one of ".!?"; ignore the boundary if that is an abbreviation dot.
        if boundary.start() - 1 in protected_dots:
            continue
        pieces.append(text[start:boundary.start()])
        start = boundary.end()
    pieces.append(text[start:])

    return [piece.strip() for piece in pieces if piece.strip()]