# Source code for matmmextract.elsevier.extractor

from __future__ import annotations

import argparse
import os
import re
import warnings
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

from ..shared.sentence_utils import split_sentences

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)


# Alphabetic ``refid`` prefixes (compared lower-cased) that mark a
# cross-reference as pointing at a figure.
FIGURE_REFID_PREFIXES: frozenset[str] = frozenset({"fig", "f", "appsec"})
# Tag names collected as paragraphs when scanning the document body.
PARA_TAGS: list[str] = ["ce:para", "ce:simple-para"]
# Tag names searched for cross-references inside a paragraph.
XREF_TAGS: list[str] = ["ce:cross-ref", "ce:cross-refs"]


def build_image_map(soup: BeautifulSoup) -> dict[str, str]:
    """Build ``{ref_key: url}`` from every ``<object>`` element in *soup*.

    The preference is applied **per reference key**: a ``category='high'``
    object wins for its ``ref``; a ref that has no high-category object keeps
    the first non-high object seen for it.  (Choosing one whole dictionary
    over the other would drop every ref that only exists in a lower-quality
    category as soon as a single high-category object is present.)

    Objects without a ``ref`` attribute or with empty text are ignored, so an
    empty entry can never shadow a usable URL.

    Parameters
    ----------
    soup:
        Parsed document; only ``find_all("object")`` is used.

    Returns
    -------
    dict
        Mapping of stripped ``ref`` attribute to stripped element text.
    """
    high: dict[str, str] = {}
    fallback: dict[str, str] = {}
    for obj in soup.find_all("object"):
        ref = obj.get("ref", "").strip()
        url = obj.text.strip()
        if not ref or not url:
            continue
        if obj.get("category") == "high":
            # Last high-category object for a ref wins.
            high[ref] = url
        else:
            # First non-high object for a ref wins.
            fallback.setdefault(ref, url)
    # High-category entries override the fallback entry of the same ref.
    return {**fallback, **high}
def resolve_image_url(fig_tag, image_map: dict[str, str]) -> str | None:
    """Look up the image URL of one figure element in *image_map*.

    Three lookups are tried in order and the first non-empty hit is returned:

    1. the value of any ``href``-like attribute on a ``ce:graphic`` /
       ``graphic`` child,
    2. the ``locator`` attribute of a ``ce:link`` child,
    3. the key ``gr<N>``, where ``N`` is the first run of digits in the
       figure's ``id`` (leading zeros dropped).

    Returns ``None`` when none of them yields a URL.
    """
    # 1) graphic element: any attribute whose name contains "href".
    graphic_el = fig_tag.find(["ce:graphic", "graphic"])
    if graphic_el:
        for attr_name, attr_value in graphic_el.attrs.items():
            if "href" not in attr_name:
                continue
            hit = image_map.get(attr_value.strip())
            if hit:
                return hit

    # 2) link element: its locator is used directly as the map key.
    link_el = fig_tag.find("ce:link")
    locator = link_el.get("locator", "").strip() if link_el else ""
    if locator:
        hit = image_map.get(locator)
        if hit:
            return hit

    # 3) derive "gr<N>" from the first number embedded in the figure id.
    first_number = re.search(r"\d+", fig_tag.get("id", ""))
    if first_number:
        return image_map.get(f"gr{int(first_number.group())}") or None
    return None
def extract_figures(soup: BeautifulSoup, image_map: dict[str, str]) -> dict[str, dict]:
    """Collect every ``ce:figure`` element that carries a non-empty ``id``.

    Returns ``{fig_id: {"caption", "image_url", "fig_num"}}`` where

    * ``caption`` is the text of the figure's ``ce:caption`` (a direct child
      is preferred over a nested one; ``""`` when there is none),
    * ``image_url`` is whatever :func:`resolve_image_url` returns,
    * ``fig_num`` is the first run of digits in the id as an ``int``, or
      ``None`` when the id contains no digits.
    """
    # NOTE(review): fig_num comes from the id, not from the printed label —
    # confirm the two agree for ids such as ``fig0005``.
    collected: dict[str, dict] = {}
    for figure_el in soup.find_all("ce:figure"):
        figure_id = figure_el.get("id", "").strip()
        if not figure_id:
            continue

        caption_el = figure_el.find("ce:caption", recursive=False) or figure_el.find("ce:caption")
        caption_text = caption_el.get_text(" ", strip=True) if caption_el else ""

        url = resolve_image_url(figure_el, image_map)

        digits = re.search(r"\d+", figure_id)
        collected[figure_id] = {
            "caption": caption_text,
            "image_url": url,
            "fig_num": int(digits.group()) if digits else None,
        }
    return collected
def get_merged_paragraphs(soup: BeautifulSoup) -> list[tuple]:
    """Return ``(paragraph_tag, text)`` pairs, joining text split by a float anchor.

    Paragraphs are looked up inside the first ``ce:body`` / ``body`` /
    ``ce:sections`` element.  When a paragraph contains a
    ``ce:float-anchor`` and another paragraph follows it in the collected
    list, the follower's text is appended (space-separated) and the follower
    is not emitted on its own.  Without a body element every paragraph in
    the document is returned unmerged.
    """
    container = soup.find(["ce:body", "body", "ce:sections"])
    if not container:
        return [(p, p.get_text(" ", strip=True)) for p in soup.find_all(PARA_TAGS)]

    paragraphs = container.find_all(PARA_TAGS)
    total = len(paragraphs)
    result: list[tuple] = []
    idx = 0
    while idx < total:
        head = paragraphs[idx]
        combined = head.get_text(" ", strip=True)
        step = 1
        if head.find("ce:float-anchor") and idx + 1 < total:
            # Absorb the next paragraph and jump over it.
            combined = combined + " " + paragraphs[idx + 1].get_text(" ", strip=True)
            step = 2
        result.append((head, combined))
        idx += step
    return result
def extract_reference_sentences(
    soup: BeautifulSoup,
    figures: dict[str, dict],
) -> dict[str, list[str]]:
    """Find the body sentences that cite each figure.

    Every paragraph returned by :func:`get_merged_paragraphs` is split into
    sentences and handled in one of two modes:

    * **Mode A (structured)** - the paragraph tag contains cross-reference
      elements whose ``refid`` names a known figure.  Only the figure numbers
      of those figures are searched for in the sentences.
    * **Mode B (regex fallback)** - no such cross-reference exists; every
      known figure number is searched for.

    A sentence cites figure number ``N`` when it contains ``Fig N``,
    ``Fig. N``, ``Figs. N``, ``Figure N`` or ``Figures N`` (case-insensitive)
    and ``N`` is not followed by another digit.  Sub-panel citations such as
    ``Fig. 1a`` therefore count as citations of figure 1, while ``Fig. 10``
    does not.

    Parameters
    ----------
    soup:
        Parsed document.
    figures:
        Mapping as produced by :func:`extract_figures`; each value must hold
        a ``"fig_num"`` entry (``int`` or ``None``).

    Returns
    -------
    dict
        ``{fig_id: [sentence, ...]}`` with one key per entry in *figures*.
        Sentences may repeat; de-duplication is left to the caller.
    """
    ref_map: dict[str, list[str]] = {fid: [] for fid in figures}

    # Several ids may share one number; a match is credited to all of them.
    num_to_fids: dict[int, list[str]] = {}
    for fid, data in figures.items():
        n = data["fig_num"]
        if n is not None:
            num_to_fids.setdefault(n, []).append(fid)

    # Compile once per figure number.  ``(?!\d)`` instead of a trailing
    # ``\b`` keeps "Fig. 1" from matching "Fig. 10" but still accepts
    # "Fig. 1a", which a word boundary rejects (digit -> letter is no boundary).
    patterns: dict[int, re.Pattern[str]] = {
        n: re.compile(rf"\b(?:Figs?\.?|Figures?)\s*{n}(?!\d)", re.IGNORECASE)
        for n in num_to_fids
    }

    for para_tag, text in get_merged_paragraphs(soup):
        if not text:
            continue
        sentences = split_sentences(text)

        # Mode A: figure ids named by structured cross-references.
        cited_ids: set[str] = set()
        for xref in para_tag.find_all(XREF_TAGS):
            for rid in xref.get("refid", "").split():
                prefix = re.match(r"[A-Za-z]+", rid)
                if (
                    prefix
                    and prefix.group().lower() in FIGURE_REFID_PREFIXES
                    and rid in ref_map
                ):
                    cited_ids.add(rid)

        if cited_ids:
            nums_needed = {
                figures[fid]["fig_num"]
                for fid in cited_ids
                if figures[fid]["fig_num"] is not None
            }
            for sentence in sentences:
                for n in nums_needed:
                    if patterns[n].search(sentence):
                        for fid in num_to_fids.get(n, []):
                            ref_map[fid].append(sentence)
            # Structured references take precedence over the text scan.
            continue

        # Mode B: plain-text scan for every known figure number.
        for n, fids in num_to_fids.items():
            pattern = patterns[n]
            # Cheap whole-paragraph check before going sentence by sentence.
            if not pattern.search(text):
                continue
            for sentence in sentences:
                if pattern.search(sentence):
                    for fid in fids:
                        ref_map[fid].append(sentence)

    return ref_map
def process_file(xml_path: str | Path) -> list[dict]:
    """Turn one Elsevier XML file into a list of per-figure row dicts.

    Each row holds the file's base name, the figure id, its caption, its
    image URL (``""`` when unresolved), the number of distinct citing
    sentences and the first five of them joined with ``" || "``.
    """
    with open(xml_path, "rb") as handle:
        soup = BeautifulSoup(handle, "lxml")

    figures = extract_figures(soup, build_image_map(soup))
    citations = extract_reference_sentences(soup, figures)
    file_name = os.path.basename(xml_path)

    records: list[dict] = []
    for figure_id, info in figures.items():
        # dict.fromkeys drops repeats while keeping first-seen order.
        distinct = list(dict.fromkeys(citations.get(figure_id, [])))
        records.append(
            {
                "xml_file": file_name,
                "figure_id": figure_id,
                "caption": info["caption"],
                "image_url": info["image_url"] or "",
                "num_references": len(distinct),
                "reference_sentences": " || ".join(distinct[:5]),
            }
        )
    return records
# Column layout of the figure rows; used to give an empty result a schema.
_FIGURE_COLUMNS = (
    "xml_file",
    "figure_id",
    "caption",
    "image_url",
    "num_references",
    "reference_sentences",
)


def extract_all(
    xml_dir: str | Path,
    output_csv: str | Path | None = None,
    verbose: bool = True,
) -> tuple[pd.DataFrame, list[tuple[str, str]]]:
    """Process every XML file in *xml_dir* and return a figures DataFrame.

    Files are handled in sorted order.  A failure in one file is recorded
    and does not stop the run.

    Parameters
    ----------
    xml_dir:
        Directory containing Elsevier ``.xml`` files.
    output_csv:
        If provided, write the DataFrame here.
    verbose:
        Print per-file progress.

    Returns
    -------
    df : pd.DataFrame
        One row per figure.  When nothing was extracted the frame is empty
        but still carries the regular columns, so a written CSV always has
        a header line.
    errors : list of (filename, error_message)
    """
    xml_dir = Path(xml_dir)
    xml_files = sorted(f for f in xml_dir.iterdir() if f.suffix == ".xml")
    if verbose:
        print(f"Found {len(xml_files)} XML files in '{xml_dir}'")

    all_rows: list[dict] = []
    errors: list[tuple[str, str]] = []
    for path in xml_files:
        try:
            rows = process_file(path)
        except Exception as exc:
            # Best-effort batch run: remember the failure and keep going.
            errors.append((path.name, str(exc)))
            if verbose:
                print(f" ✗ {path.name} ERROR: {exc}")
        else:
            all_rows.extend(rows)
            if verbose:
                print(f" ✓ {path.name} ({len(rows)} figures)")

    if all_rows:
        df = pd.DataFrame(all_rows)
    else:
        # A frame built from an empty list has no columns at all.
        df = pd.DataFrame(columns=list(_FIGURE_COLUMNS))

    if output_csv is not None:
        df.to_csv(output_csv, index=False)
        if verbose:
            print(f"\n{len(all_rows)} rows → {output_csv}")
    return df, errors
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def _parse_args() -> argparse.Namespace:
    """Parse the ``--xml-dir`` and ``--output-csv`` command-line options."""
    parser = argparse.ArgumentParser(
        description="Extract figures from Elsevier XML files."
    )
    parser.add_argument("--xml-dir", default="alloys_elsevier")
    parser.add_argument("--output-csv", default="alloy_elsevier_img_details.csv")
    return parser.parse_args()


def main() -> None:
    """Script entry point: run the extraction with the parsed options."""
    options = _parse_args()
    extract_all(xml_dir=options.xml_dir, output_csv=options.output_csv)


if __name__ == "__main__":
    main()