# Source code for matmmextract.springer.downloader
from __future__ import annotations
import argparse
from urllib.parse import quote, urlparse
import pandas as pd
from ..shared.doi_utils import filename_to_doi
from ..shared.downloader import extension_from_url, run_downloads
# Root of the Springer Nature static-image CDN. Relative image paths are
# appended to this base (after an article key) by build_candidate_urls().
SPRINGER_STATIC_BASE: str = "https://media.springernature.com/full/springer-static/image"

# Default file-system locations, shared by download_all() and the CLI.
DEFAULT_CSV: str = "springer_figure_details.csv"
DEFAULT_OUTPUT_DIR: str = "springer_images_flat"
DEFAULT_OUTPUT_CSV: str = "springer_figure_details_with_images.csv"
DEFAULT_LOG_FILE: str = "download_log_springer.csv"

# Default filename prefix for saved images and default worker count.
DEFAULT_NAME_PREFIX: str = "img"
DEFAULT_MAX_WORKERS: int = 6
def _is_absolute(url: str) -> bool:
parsed = urlparse(url)
return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
def build_candidate_urls(image_url: str, xml_file: str) -> list[str]:
    """Build ordered candidate URLs for a Springer image.

    If *image_url* is already absolute, return it directly.
    Otherwise, build CDN candidates using the DOI reconstructed from
    *xml_file*.

    Parameters
    ----------
    image_url:
        Raw URL string from the figure CSV. Missing values (``None`` or a
        NaN float, which is how an empty CSV cell is typically loaded) are
        treated as "no URL".
    xml_file:
        Source XML filename (used to reconstruct the DOI for relative URLs).

    Returns
    -------
    list[str]
        Ordered candidate URLs to try, without duplicates. Empty when
        *image_url* is missing, blank, or consists only of slashes.
    """
    # str(None) / str(nan) would otherwise become the literal paths "None" /
    # "nan" and yield bogus download candidates. NaN is the only float that
    # is not equal to itself, so this check needs no extra imports.
    if image_url is None or (
        isinstance(image_url, float) and image_url != image_url
    ):
        return []

    url = str(image_url).strip()
    if not url:
        return []
    if _is_absolute(url):
        return [url]

    # Relative path: drop leading slashes so it can be joined onto a base.
    rel = url.lstrip("/")
    if not rel:
        # Only slashes were given; there is no file name to request.
        return []

    candidates: list[str] = []
    doi = filename_to_doi(xml_file)
    if doi:
        # The article key is "art:" + DOI, fully percent-encoded ("%3A" is
        # the encoded colon; safe="" also encodes the "/" inside the DOI).
        article_key = "art%3A" + quote(doi, safe="")
        candidates.append(f"{SPRINGER_STATIC_BASE}/{article_key}/{rel}")
        if not rel.startswith("MediaObjects/"):
            candidates.append(
                f"{SPRINGER_STATIC_BASE}/{article_key}/MediaObjects/{rel}"
            )
    # Last resort that does not depend on the DOI: path directly on the host.
    candidates.append(f"https://media.springernature.com/{rel}")
    return list(dict.fromkeys(candidates))  # deduplicate preserving order
def download_all(
    csv_path: str,
    output_dir: str = DEFAULT_OUTPUT_DIR,
    output_csv: str = DEFAULT_OUTPUT_CSV,
    log_file: str = DEFAULT_LOG_FILE,
    name_prefix: str = DEFAULT_NAME_PREFIX,
    max_workers: int = DEFAULT_MAX_WORKERS,
    verbose: bool = True,
) -> pd.DataFrame:
    """Download every Springer figure image listed in *csv_path*.

    The CSV is loaded into a DataFrame and handed to the shared
    ``run_downloads`` driver together with Springer-specific request headers
    and :func:`build_candidate_urls` as the URL builder. All remaining
    arguments are forwarded to the driver unchanged.

    Parameters
    ----------
    csv_path:
        Figure CSV (output of
        :func:`~matmmextract.springer.extractor.extract_all`).
    output_dir:
        Directory to save downloaded images.
    output_csv:
        Updated CSV written after each batch.
    log_file:
        Per-URL download log for resume support.
    name_prefix:
        Image filename prefix.
    max_workers:
        Thread pool size.
    verbose:
        Print progress.

    Returns
    -------
    pd.DataFrame
        Updated figure DataFrame with download status columns added
        (whatever ``run_downloads`` returns).
    """
    figures = pd.read_csv(csv_path)

    # Browser-like headers: advertise image content types and identify
    # the client with a dedicated User-Agent string.
    request_headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 SpringerImageDownloader/1.0",
    }

    updated = run_downloads(
        df=figures,
        output_dir=output_dir,
        output_csv=output_csv,
        log_file=log_file,
        build_candidate_urls=build_candidate_urls,
        request_headers=request_headers,
        name_prefix=name_prefix,
        max_workers=max_workers,
        verbose=verbose,
    )
    return updated
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
    """Parse the downloader's command-line options.

    Every option is optional and falls back to the matching module-level
    ``DEFAULT_*`` constant; ``--workers`` is converted to ``int``.
    """
    parser = argparse.ArgumentParser(description="Download Springer figure images.")

    # Plain string options, registered in the order they appear in --help.
    string_options = (
        ("--csv", DEFAULT_CSV),
        ("--output-dir", DEFAULT_OUTPUT_DIR),
        ("--output-csv", DEFAULT_OUTPUT_CSV),
        ("--log-file", DEFAULT_LOG_FILE),
        ("--name-prefix", DEFAULT_NAME_PREFIX),
    )
    for flag, fallback in string_options:
        parser.add_argument(flag, default=fallback)

    parser.add_argument("--workers", type=int, default=DEFAULT_MAX_WORKERS)
    return parser.parse_args()
def main() -> None:
    """Command-line entry point: parse options and run the download job."""
    cli = _parse_args()
    # Map parsed option names onto download_all()'s keyword arguments
    # (--csv becomes csv_path, --workers becomes max_workers).
    job = {
        "csv_path": cli.csv,
        "output_dir": cli.output_dir,
        "output_csv": cli.output_csv,
        "log_file": cli.log_file,
        "name_prefix": cli.name_prefix,
        "max_workers": cli.workers,
    }
    download_all(**job)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()