# Source code for matmmextract.inference.crop_csv_builder

"""
matmmextract.inference.crop_csv_builder
==========================================
Bridge between the cropper and the captioner.

The figures CSV has xml_file + figure_id but no downloaded_image_name.
The downloaded images are named img1.jpg, img2.jpg, ... sequentially.

This module joins them by ROW ORDER — the N-th row in the figures CSV
corresponds to imgN.jpg — which is how the Elsevier downloader assigned names.

If your downloader CSV DOES have a downloaded_image_name column, pass
that CSV as figures_csv and set use_row_order=False.
"""

from __future__ import annotations

import argparse
import os
import re
from pathlib import Path

import pandas as pd

# Crop filename pattern (case-insensitive), e.g. "img1_A.jpg",
# "img1_single.jpg" or "img2_common_3.jpg":
#   group 1 - parent image id ("img" + digits)
#   group 2 - crop tag: "single", "common" or a single letter
#   group 3 - optional numeric suffix (None when absent)
IMG_RE = re.compile(
    r"^(img\d+)_(single|common|[A-Z])(?:_(\d+))?\.jpg$", flags=re.I
)

# Parent image id pattern: captures the digits of "img14" -> "14".
IMG_NUM_RE = re.compile(r"^img(\d+)$", flags=re.I)



# [docs]
def build_crop_csv(
    crops_dir: str | Path,
    figures_csv: str | Path,
    output_csv: str | Path = "crops_for_captioning.csv",
    use_row_order: bool = True,
    image_name_col: str = "downloaded_image_name",
    caption_col: str = "caption",
    reference_col: str = "reference_sentences",
    verbose: bool = True,
) -> pd.DataFrame:
    """Build a CSV mapping each crop to its figure caption and references.

    Every file in ``crops_dir`` whose name matches ``IMG_RE`` becomes one
    output row, processed in lexicographic filename order. The crop's parent
    image id (``imgN``) is used to find the figure row it belongs to.

    Parameters
    ----------
    crops_dir:
        Directory of cropped panel images (output of cropper.run).
        Files named like img1_A.jpg, img1_single.jpg, img2_B.jpg ...
    figures_csv:
        Figure-level CSV from elsevier/springer extractor.
        Expected to have the ``caption_col`` and ``reference_col`` columns;
        a missing column is tolerated and its values are left blank.
    output_csv:
        Where to write the resulting CSV. Parent directories are created.
    use_row_order:
        If True (default): join by row position — img1 = first row,
        img2 = second row, etc. Use this when the figures CSV has no
        downloaded_image_name column (standard extractor output).
        If False: join by the ``image_name_col`` column value.
    image_name_col:
        Only used when use_row_order=False. Column holding image names;
        values are stripped and reduced to their file stem before joining.
        If a name occurs more than once, the first row wins.
    caption_col / reference_col:
        Column names for caption and references in figures_csv.
    verbose:
        Print summary and warnings.

    Returns
    -------
    pd.DataFrame
        One row per cropped image with columns:
        downloaded_image_name, caption, reference_sentences.
        These columns are present even when no crop was found.
        Crops with no matching figure row get empty strings.

    Raises
    ------
    KeyError
        If use_row_order=False and ``image_name_col`` is not in figures_csv.
    """
    out_columns = ["downloaded_image_name", "caption", "reference_sentences"]

    crops_dir = Path(crops_dir)
    figures_df = pd.read_csv(figures_csv)

    if verbose:
        print(f"[crop_csv_builder] figures CSV: {len(figures_df)} rows")
        print(f"[crop_csv_builder] columns: {list(figures_df.columns)}")

    # (filename, parent image id) for every directory entry that looks like
    # a crop; sorting the tuples orders them by filename.
    crops = sorted(
        (m.string, m.group(1))
        for m in map(IMG_RE.match, os.listdir(crops_dir))
        if m
    )

    # One join key per figure row, in row order.
    if use_row_order:
        # Positional, 1-based: imgN <-> N-th row, independent of index labels.
        keys = list(range(1, len(figures_df) + 1))
    else:
        if image_name_col not in figures_df.columns:
            raise KeyError(
                f"Column '{image_name_col}' not found. "
                f"Available: {list(figures_df.columns)}. "
                f"Try use_row_order=True instead."
            )
        keys = (
            figures_df[image_name_col].astype(str).str.strip()
            .map(lambda name: Path(name).stem)
            .tolist()
        )

    def _values(col: str) -> list:
        # Treat an absent column as all-blank, identically in both join modes.
        if col in figures_df.columns:
            return figures_df[col].tolist()
        return [""] * len(figures_df)

    missing_cols = [
        col for col in (caption_col, reference_col)
        if col not in figures_df.columns
    ]
    if missing_cols and verbose:
        print(f"[crop_csv_builder] WARNING: column(s) {missing_cols} "
              f"not in figures CSV; values left blank")

    # key -> (caption, references). First occurrence wins so that repeated
    # image names do not abort the whole run.
    lookup: dict[int | str, tuple] = {}
    duplicate_keys = 0
    for key, caption, references in zip(
        keys, _values(caption_col), _values(reference_col)
    ):
        if key in lookup:
            duplicate_keys += 1
        else:
            lookup[key] = (caption, references)
    if duplicate_keys and verbose:
        print(f"[crop_csv_builder] WARNING: {duplicate_keys} figure row(s) "
              f"repeat an earlier image name; first occurrence kept")

    rows = []
    unmatched = []

    for fname, img_id in crops:
        if use_row_order:
            # IMG_RE guarantees img_id is "img" + digits, so this always matches.
            key = int(IMG_NUM_RE.match(img_id).group(1))
        else:
            key = img_id

        if key not in lookup:
            unmatched.append(fname)
        caption, references = lookup.get(key, ("", ""))
        rows.append({
            "downloaded_image_name": Path(fname).stem,  # e.g. "img1_A"
            "caption":               caption,
            "reference_sentences":   references,
        })

    # Explicit columns keep the header present even when there are no rows.
    df = pd.DataFrame(rows, columns=out_columns)
    output_csv = Path(output_csv)
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)

    if verbose:
        print(f"[crop_csv_builder] {len(df)} crops → {output_csv}")
        print(f"[crop_csv_builder] matched={len(df) - len(unmatched)}  unmatched={len(unmatched)}")
        if unmatched:
            print(f"[crop_csv_builder] unmatched samples: {unmatched[:5]}")

    return df



def _parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser("Build captioning CSV from crops + figures CSV")
    p.add_argument("--crops-dir",    required=True)
    p.add_argument("--figures-csv",  required=True)
    p.add_argument("--output-csv",   default="crops_for_captioning.csv")
    p.add_argument("--no-row-order", action="store_true",
                   help="Join by downloaded_image_name column instead of row order")
    p.add_argument("--image-name-col", default="downloaded_image_name")
    return p.parse_args()


def main() -> None:
    """Command-line entry point: parse the options and build the crop CSV."""
    cli = _parse_args()
    options = {
        "crops_dir": cli.crops_dir,
        "figures_csv": cli.figures_csv,
        "output_csv": cli.output_csv,
        # The flag is negative (--no-row-order); the API parameter is positive.
        "use_row_order": not cli.no_row_order,
        "image_name_col": cli.image_name_col,
    }
    build_crop_csv(**options)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()