Source code for matmmextract.shared.doi_utils
from __future__ import annotations
import re
from pathlib import Path
[docs]
def doi_to_filename(doi: str, suffix: str = ".xml") -> str:
    """Build a filesystem-friendly filename from a DOI.

    Any character that is not a word character, ``-``, ``_`` or ``.`` is
    swapped for an underscore; *suffix* is then appended unchanged.

    Parameters
    ----------
    doi:
        DOI to sanitise, e.g. ``"10.1016/j.actamat.2020.01.001"``.
    suffix:
        Extension appended verbatim to the sanitised DOI
        (default ``".xml"``).

    Returns
    -------
    str
        Sanitised name, e.g. ``"10.1016_j.actamat.2020.01.001.xml"``.
    """
    # Everything outside the allowed set (notably the "/" separating the
    # registrant prefix from the rest of the DOI) collapses to "_".
    unsafe_chars = re.compile(r"[^\w\-_.]")
    sanitised = unsafe_chars.sub("_", doi)
    return sanitised + suffix
[docs]
def filename_to_doi(filename: str | Path) -> str:
    """Recover a DOI from a filename produced by ``doi_to_filename``.

    The final extension is dropped, and the first ``_`` that follows the
    registrant prefix (``10.`` plus 4-9 digits) is turned back into ``/``.
    Any later underscores are left as they are.

    Parameters
    ----------
    filename:
        File path or bare filename, e.g.
        ``"10.1007_s42114-026-01633-w.xml"``.

    Returns
    -------
    str
        The DOI, e.g. ``"10.1007/s42114-026-01633-w"``; ``""`` when the
        stem does not start with a registrant prefix followed by ``_``.
    """
    # Only the last suffix is removed, so dots inside the DOI survive.
    base_name = Path(str(filename)).stem
    parsed = re.match(r"^(10\.\d{4,9})_(.+)$", base_name)
    if parsed is None:
        return ""
    registrant, remainder = parsed.groups()
    return "/".join((registrant, remainder))
[docs]
def load_set(path: str | Path) -> set[str]:
    """Read a newline-delimited UTF-8 text file into a set of strings.

    Each line is stripped of surrounding whitespace, and lines that are
    empty after stripping are skipped. A path that does not exist yields
    an empty set.

    Parameters
    ----------
    path:
        Path to the text file.

    Returns
    -------
    set[str]
        The distinct non-blank, stripped lines of the file.
    """
    source = Path(path)
    if not source.exists():
        return set()

    entries = set()
    for raw_line in source.read_text(encoding="utf-8").splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            entries.add(cleaned)
    return entries
[docs]
def append_line(path: str | Path, line: str) -> None:
    """Write *line* followed by a newline to the end of a text file.

    The file is opened in append mode with UTF-8 encoding, so it is
    created when it does not exist yet.

    Parameters
    ----------
    path:
        Destination file path.
    line:
        Text to append; the trailing newline is added here.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        record = line + "\n"
        sink.write(record)