# Source code for matmmextract.elsevier.downloader

from __future__ import annotations

import argparse
import os

import pandas as pd

from matmmextract.shared.downloader import run_downloads


# Defaults shared by download_all() and the command-line interface.
# DEFAULT_CSV is only used as the CLI default for --csv.
DEFAULT_CSV = "elsevier_img_details.csv"  # input figure CSV
DEFAULT_OUTPUT_DIR = "elsevier_contents"  # directory for downloaded images
DEFAULT_OUTPUT_CSV = "elsevier_figures_with_images.csv"  # updated figure CSV
DEFAULT_LOG_FILE = "download_log_elsevier.csv"  # per-URL download log
DEFAULT_NAME_PREFIX = "img"  # image filename prefix
DEFAULT_MAX_WORKERS = 4  # thread pool size


def build_candidate_urls(image_url: str, xml_file: str) -> list[str]:
    """Return candidate download URLs for an Elsevier image.

    Elsevier image URLs in the figure CSV are absolute CDN URLs, so this is
    a direct passthrough of the whitespace-stripped URL.

    Parameters
    ----------
    image_url:
        URL taken from the figure CSV. Non-string values (for example the
        float ``NaN`` pandas produces for an empty cell, or ``None``) are
        treated as "no URL".
    xml_file:
        Unused; kept for interface compatibility with the shared engine.

    Returns
    -------
    list[str]
        A one-element list holding the stripped URL, or an empty list when
        no usable URL is available.
    """
    # Empty CSV cells are not strings, so calling ``.strip()`` on them
    # would raise AttributeError; report "no candidates" instead.
    if not isinstance(image_url, str):
        return []

    url = image_url.strip()
    return [url] if url else []
def download_all(
    csv_path: str,
    output_dir: str = DEFAULT_OUTPUT_DIR,
    output_csv: str = DEFAULT_OUTPUT_CSV,
    log_file: str = DEFAULT_LOG_FILE,
    api_key: str | None = None,
    inst_token: str | None = None,
    name_prefix: str = DEFAULT_NAME_PREFIX,
    max_workers: int = DEFAULT_MAX_WORKERS,
    verbose: bool = True,
) -> pd.DataFrame:
    """Fetch every Elsevier figure image listed in *csv_path*.

    The figure CSV is loaded into a DataFrame and handed to the shared
    download engine together with Elsevier-specific request headers.

    Parameters
    ----------
    csv_path:
        Figure CSV (output of
        :func:`~matmmextract.elsevier.extractor.extract_all`).
    output_dir:
        Directory to save downloaded images.
    output_csv:
        Updated CSV written after each batch.
    log_file:
        Per-URL download log for resume support.
    api_key:
        Elsevier API key; when falsy, the ``ELSEVIER_API_KEY`` environment
        variable is used (empty string if that is unset too).
    inst_token:
        Elsevier institutional token; when falsy, the
        ``ELSEVIER_INST_TOKEN`` environment variable is used (empty string
        if that is unset too).
    name_prefix:
        Image filename prefix.
    max_workers:
        Thread pool size.
    verbose:
        Print progress.

    Returns
    -------
    pd.DataFrame
        Updated figure DataFrame with download status columns added.
    """
    # Explicit arguments win; otherwise fall back to the environment.
    if not api_key:
        api_key = os.getenv("ELSEVIER_API_KEY", "")
    if not inst_token:
        inst_token = os.getenv("ELSEVIER_INST_TOKEN", "")

    request_headers = {
        "X-ELS-APIKey": api_key,
        "X-ELS-Insttoken": inst_token,
        "Accept": "image/jpeg",
    }

    figures = pd.read_csv(csv_path)

    return run_downloads(
        df=figures,
        output_dir=output_dir,
        output_csv=output_csv,
        log_file=log_file,
        build_candidate_urls=build_candidate_urls,
        request_headers=request_headers,
        name_prefix=name_prefix,
        max_workers=max_workers,
        verbose=verbose,
    )
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the Elsevier image downloader.

    Parameters
    ----------
    argv:
        Argument strings to parse. ``None`` (the default) lets argparse
        read ``sys.argv[1:]``, which matches the previous behaviour.

    Returns
    -------
    argparse.Namespace
        Parsed options: ``csv``, ``output_dir``, ``output_csv``,
        ``log_file``, ``api_key``, ``inst_token``, ``name_prefix`` and
        ``workers``.
    """
    parser = argparse.ArgumentParser(description="Download Elsevier figure images.")
    parser.add_argument("--csv", default=DEFAULT_CSV)
    parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR)
    parser.add_argument("--output-csv", default=DEFAULT_OUTPUT_CSV)
    parser.add_argument("--log-file", default=DEFAULT_LOG_FILE)
    # Credentials default to the environment so they need not appear on the
    # command line.
    parser.add_argument("--api-key", default=os.getenv("ELSEVIER_API_KEY"))
    parser.add_argument("--inst-token", default=os.getenv("ELSEVIER_INST_TOKEN"))
    parser.add_argument("--name-prefix", default=DEFAULT_NAME_PREFIX)
    parser.add_argument("--workers", type=int, default=DEFAULT_MAX_WORKERS)
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse options and run :func:`download_all`.

    Parameters
    ----------
    argv:
        Optional argument list forwarded to the parser; ``None`` means use
        the process command line.
    """
    args = _parse_args(argv)
    download_all(
        csv_path=args.csv,
        output_dir=args.output_dir,
        output_csv=args.output_csv,
        log_file=args.log_file,
        api_key=args.api_key,
        inst_token=args.inst_token,
        name_prefix=args.name_prefix,
        max_workers=args.workers,
    )


if __name__ == "__main__":
    main()