Source code for matmmextract.shared.sentence_utils
from __future__ import annotations
import re
# Abbreviations that end with a period but must NOT trigger a sentence split.
ABBREV_RE = re.compile(
    r"\b(Fig|Figs|fig|figs|e\.g|i\.e|et al|vs|approx|Dr|Prof|cf|Eq|Eqs|No|Vol|pp)\."
)

# Candidate sentence boundary: a whitespace run that directly follows ".",
# "!" or "?" and is directly followed by an ASCII uppercase letter or an
# opening round/square bracket.
_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\(\[])")


def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences, protecting scientific abbreviations.

    Strategy: locate every abbreviation matched by ``ABBREV_RE`` and remember
    the offset just past its trailing period.  Then walk over the candidate
    boundaries (``[.!?]`` followed by whitespace + uppercase/bracket) and cut
    only at those that do not sit directly after an abbreviation.  The input
    is never rewritten, so no placeholder token can collide with characters
    that are already present in *text*.

    Parameters
    ----------
    text:
        Plain text to split.

    Returns
    -------
    list[str]
        Non-empty sentence strings, each stripped of surrounding whitespace.

    Examples
    --------
    >>> split_sentences("As shown in Fig. 3, the yield is high. See also Fig. 4.")
    ['As shown in Fig. 3, the yield is high.', 'See also Fig. 4.']
    """
    # Offsets immediately after the final period of each abbreviation.  A
    # boundary candidate begins at exactly such an offset when the whitespace
    # follows an abbreviation dot, which is the case that must not split.
    abbrev_ends = {match.end() for match in ABBREV_RE.finditer(text)}

    pieces: list[str] = []
    cursor = 0
    for gap in _BOUNDARY_RE.finditer(text):
        if gap.start() in abbrev_ends:
            # Period belongs to an abbreviation such as "Fig." or "et al.".
            continue
        pieces.append(text[cursor:gap.start()])
        cursor = gap.end()
    # Whatever follows the last accepted boundary is the final sentence.
    pieces.append(text[cursor:])

    # Drop empty / whitespace-only fragments (e.g. for empty input).
    return [sentence for sentence in (p.strip() for p in pieces) if sentence]