"""
matmmextract.inference.crop_csv_builder
==========================================
Bridge between the cropper and the captioner.
The figures CSV has xml_file + figure_id but no downloaded_image_name.
The downloaded images are named img1.jpg, img2.jpg, ... sequentially.
This module joins them by ROW ORDER — the N-th row in the figures CSV
corresponds to imgN.jpg — which is how the Elsevier downloader assigned names.
If your downloader CSV DOES have a downloaded_image_name column, pass
that CSV as figures_csv and set use_row_order=False.
"""
from __future__ import annotations
import argparse
import os
import re
from pathlib import Path
import pandas as pd
IMG_RE = re.compile(
r"^(img\d+)_(single|common|[A-Z])(?:_(\d+))?\.jpg$",
re.IGNORECASE,
)
# Matches the numeric part from "img14" → 14
IMG_NUM_RE = re.compile(r"^img(\d+)$", re.IGNORECASE)
[docs]
def build_crop_csv(
crops_dir: str | Path,
figures_csv: str | Path,
output_csv: str | Path = "crops_for_captioning.csv",
use_row_order: bool = True,
image_name_col: str = "downloaded_image_name",
caption_col: str = "caption",
reference_col: str = "reference_sentences",
verbose: bool = True,
) -> pd.DataFrame:
"""Build a CSV mapping each crop to its figure caption and references.
Parameters
----------
crops_dir:
Directory of cropped panel images (output of cropper.run).
Files named like img1_A.jpg, img1_single.jpg, img2_B.jpg ...
figures_csv:
Figure-level CSV from elsevier/springer extractor.
Must have ``caption`` and ``reference_sentences`` columns.
output_csv:
Where to write the resulting CSV.
use_row_order:
If True (default): join by row position — img1 = row 0,
img2 = row 1, etc. Use this when the figures CSV has no
downloaded_image_name column (standard extractor output).
If False: join by the downloaded_image_name column value.
image_name_col:
Only used when use_row_order=False. Column holding image stems.
caption_col / reference_col:
Column names for caption and references in figures_csv.
verbose:
Print summary.
Returns
-------
pd.DataFrame
One row per cropped image with columns:
downloaded_image_name, caption, reference_sentences
"""
crops_dir = Path(crops_dir)
figures_df = pd.read_csv(figures_csv)
if verbose:
print(f"[crop_csv_builder] figures CSV: {len(figures_df)} rows")
print(f"[crop_csv_builder] columns: {list(figures_df.columns)}")
# Collect all crop filenames and their parent img_id
crop_files = sorted(
f for f in os.listdir(crops_dir)
if IMG_RE.match(f)
)
if use_row_order:
# img1 → row index 0, img2 → row index 1, ...
# Build lookup: img_number (1-based) → figure row
lookup: dict[int, dict] = {}
for idx, row in figures_df.iterrows():
img_num = idx + 1 # 1-based
lookup[img_num] = {
caption_col: row.get(caption_col, ""),
reference_col: row.get(reference_col, ""),
}
else:
# Join by downloaded_image_name column
if image_name_col not in figures_df.columns:
raise KeyError(
f"Column '{image_name_col}' not found. "
f"Available: {list(figures_df.columns)}. "
f"Try use_row_order=True instead."
)
figures_df[image_name_col] = (
figures_df[image_name_col].astype(str).str.strip()
.apply(lambda x: Path(x).stem)
)
lookup_by_name: dict[str, dict] = (
figures_df.set_index(image_name_col)[[caption_col, reference_col]]
.to_dict("index")
)
rows = []
unmatched = []
for fname in crop_files:
m = IMG_RE.match(fname)
if not m:
continue
img_id = m.group(1) # e.g. "img1"
stem = Path(fname).stem # e.g. "img1_A"
if use_row_order:
num_match = IMG_NUM_RE.match(img_id)
if not num_match:
unmatched.append(fname)
rows.append({"downloaded_image_name": stem,
"caption": "", "reference_sentences": ""})
continue
img_num = int(num_match.group(1))
fig_data = lookup.get(img_num)
else:
fig_data = lookup_by_name.get(img_id)
if fig_data is None:
unmatched.append(fname)
rows.append({"downloaded_image_name": stem,
"caption": "", "reference_sentences": ""})
else:
rows.append({
"downloaded_image_name": stem,
"caption": fig_data.get(caption_col, ""),
"reference_sentences": fig_data.get(reference_col, ""),
})
df = pd.DataFrame(rows)
output_csv = Path(output_csv)
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)
if verbose:
print(f"[crop_csv_builder] {len(df)} crops → {output_csv}")
print(f"[crop_csv_builder] matched={len(df) - len(unmatched)} unmatched={len(unmatched)}")
if unmatched:
print(f"[crop_csv_builder] unmatched samples: {unmatched[:5]}")
return df
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the crop CSV builder.

    Returns
    -------
    argparse.Namespace
        With attributes ``crops_dir``, ``figures_csv``, ``output_csv``,
        ``no_row_order`` and ``image_name_col``.
    """
    # The summary must go in ``description``: the first positional argument
    # of ArgumentParser is ``prog``, which would put this sentence in the
    # usage line in place of the program name.
    parser = argparse.ArgumentParser(
        description="Build captioning CSV from crops + figures CSV",
    )
    parser.add_argument("--crops-dir", required=True)
    parser.add_argument("--figures-csv", required=True)
    parser.add_argument("--output-csv", default="crops_for_captioning.csv")
    parser.add_argument(
        "--no-row-order",
        action="store_true",
        help="Join by downloaded_image_name column instead of row order",
    )
    parser.add_argument("--image-name-col", default="downloaded_image_name")
    return parser.parse_args()
def main() -> None:
    """Command-line entry point: parse the options and build the crop CSV."""
    cli = _parse_args()
    options = {
        "crops_dir": cli.crops_dir,
        "figures_csv": cli.figures_csv,
        "output_csv": cli.output_csv,
        # The flag is negative on the command line, positive in the API.
        "use_row_order": not cli.no_row_order,
        "image_name_col": cli.image_name_col,
    }
    build_crop_csv(**options)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()