Examples

Elsevier Full Pipeline (OpenAlex → Elsevier → Detection → Azure → Dataset)

 1from matmmextract.preprocess.pipeline import load_csvs
 2from matmmextract.inference.detector import detect
 3from matmmextract.inference.cropper import crop
 4from matmmextract.inference.crop_csv_builder import build_crop_csv
 5from matmmextract.inference.captioner_azure import captioner as azure_caption
 6from matmmextract.inference.dataset_builder import build
 7from matmmextract.openalex.fetcher import fetch_elsevier
 8from matmmextract.elsevier.extractor import extract_all as elsevier_extract
 9from matmmextract.elsevier.downloader import download_all as elsevier_download
10from matmmextract.elsevier.fetcher import fetch_all as elsevier_fetch
11
12
13elsevier_result = fetch_elsevier(
14    license_="cc-by",
15    keywords=["titanium"],
16    from_year=2020, to_year=2024,
17    max_results=1,
18    output_csv="output/elsevier_papers.csv",
19    api_key="",
20)
21
22elsevier_df = load_csvs(["output/elsevier_papers.csv"])
23
24elsevier_fetch(
25    df=elsevier_df,
26    api_key="",
27    inst_token="",
28    output_dir="_elsevier",
29)
30
31elsevier_figs, _ = elsevier_extract(
32    "_elsevier",
33    output_csv="output/elsevier_figures.csv",
34)
35
36elsevier_download(
37    csv_path="output/elsevier_figures.csv",
38    output_dir="images/elsevier",
39    api_key="",
40    inst_token="",
41)
42
43
44# Step 8: detect
45detect(
46    image_dir="images/elsevier",
47    output_dir="inference_results",
48    checkpoint="https://huggingface.co/CMEG-IITR/yolo12_unique_multimat",
49    conf=0.6, iou=0.4, imgsz=1024,
50)
51
52# Step 9: crop
53crop(
54    image_dir="images/elsevier",
55    json_dir="inference_results",
56    output_dir="crops",
57)
58
59# Step 9.5: build captioning CSV
60build_crop_csv(
61    crops_dir="crops",
62    figures_csv="output/elsevier_figures.csv",
63    output_csv="output/crops_for_captioning.csv",
64)
65
66# Step 10: generate sub-captions via Azure
67azure_caption(
68    csv_path="output/crops_for_captioning.csv",
69    output_dir="subcaptions",
70    api_key="",
71    azure_endpoint="",
72    model_name="Mistral-Large-3",
73    image_name_col="downloaded_image_name",   # matches crops_for_captioning.csv
74    caption_col="caption",
75    reference_col="reference_sentences",
76)
77
78# Step 11: link crops + captions → final dataset
79build(
80    images_dir="crops",
81    json_dir="subcaptions",
82    output_csv="elsevier_linked_dataset.csv",
83)
84

Elsevier from Scopus Export (Scopus → Elsevier)

 1from matmmextract.preprocess import (
 2    load_csvs,
 3    drop_duplicate_dois,
 4    filter_open_access,
 5    save_csv,
 6)
 7
 8from matmmextract.preprocess import (
 9    scan_directory,
10    filter_figures_cc_by,
11)
12
13from matmmextract.elsevier import (
14    fetch_all as elsevier_fetch,
15    extract_all as elsevier_extract,
16    download_all as elsevier_download,
17)
18
19
20# ── Step 1: Load CSVs ────────────────────────────────────────────────────────
21elsevier_df = load_csvs(["scopus.csv"])
22elsevier_df = drop_duplicate_dois(elsevier_df)
23
24elsevier_df = filter_open_access(elsevier_df)
25
26save_csv(elsevier_df, "output/elsevier_papers.csv")
27
28
29# ── Step 2: Fetch Elsevier XMLs ──────────────────────────────────────────────
30elsevier_fetch(
31    df=elsevier_df,
32    api_key="",
33    inst_token="",
34    output_dir="_elsevier",
35)
36
37
38# ── Step 3: Extract figures ──────────────────────────────────────────────────
39elsevier_figs, _ = elsevier_extract(
40    "_elsevier",
41    output_csv="output/elsevier_figures.csv",
42)
43
44
45# ── Step 4: CC-BY filtering ──────────────────────────────────────────────────
46cc_df, _ = scan_directory(
47    "_elsevier",
48    output_csv="output/elsevier_cc.csv",
49)
50
51elsevier_figs_ccby = filter_figures_cc_by(
52    elsevier_figs,
53    cc_df,
54    output_csv="output/elsevier_figures_ccby.csv",
55)
56
57
58# ── Step 5: Download images ──────────────────────────────────────────────────
59elsevier_download(
60    csv_path="output/elsevier_figures_ccby.csv",
61    output_dir="images/elsevier",
62    api_key="",
63    inst_token="",
64)

Springer Full Pipeline (OpenAlex → Springer → Detection (model checkpoint from Hugging Face Hub) → Gemini → Dataset)

 1from matmmextract.openalex import fetch_springer
 2from matmmextract.preprocess import load_csvs
 3
 4from matmmextract.springer import (
 5    fetch_all as springer_fetch,
 6    extract_all as springer_extract,
 7    download_all as springer_download,
 8)
 9
10from matmmextract.inference import (
11    detect,
12    crop,
13    build_crop_csv,
14    gemini_captioner as gemini_caption,
15    build,
16)
17
18springer_result = fetch_springer(
19    license_=["cc-by", "cc-by-nc"],
20    keywords=["alloy"],
21    max_results=1,
22    output_csv="output/springer_papers.csv",
23    api_key="",
24)
25
26springer_df = load_csvs(["output/springer_papers.csv"])
27
28springer_fetch(
29    df=springer_df,
30    api_key="",
31    output_dir="_springer",
32    use_open_access=True,
33)
34
35
36# ── Step 3: Extract figures from XMLs ────────────────────────────────────────
37springer_figs, _ = springer_extract(
38    "_springer",
39    output_csv="output/springer_figures.csv",
40)
41
42
43# ── Step 4: Download figure images ───────────────────────────────────────────
44springer_download(
45    csv_path="output/springer_figures.csv",
46    output_dir="images/springer",
47)
48
49detect(
50    image_dir="images/springer",
51    output_dir="inference_results",
52    checkpoint="https://huggingface.co/CMEG-IITR/yolo12_unique_multimat",
53    conf=0.6, iou=0.4, imgsz=1024,
54)
55
56# Step 9: crop
57crop(
58    image_dir="images/springer",
59    json_dir="inference_results",
60    output_dir="crops",
61)
62
63# Step 9.5: build captioning CSV
64build_crop_csv(
65    crops_dir="crops",
66    figures_csv="output/springer_figures.csv",
67    output_csv="output/crops_for_captioning.csv",
68)
69
70# Step 10: generate sub-captions via Gemini
71gemini_caption(
72    csv_path="output/crops_for_captioning.csv",
73    output_dir="subcaptions",
74    api_key="",
75)
76
77# Step 11: link crops + captions → final dataset
78build(
79    images_dir="crops",
80    json_dir="subcaptions",
81    output_csv="springer_linked_dataset.csv",
82)

Springer from Scopus Export (Scopus → Springer)

 1from matmmextract.preprocess import (
 2    load_csvs,
 3    drop_duplicate_dois,
 4    save_csv,
 5)
 6
 7from matmmextract.springer import (
 8    fetch_all as springer_fetch,
 9    extract_all as springer_extract,
10    download_all as springer_download,
11)
12
13
14# ── Step 1: Load CSVs ────────────────────────────────────────────────────────
15springer_df = load_csvs(["scopus.csv"])
16springer_df = drop_duplicate_dois(springer_df)
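17# NOTE(review): filter_by_publisher is not imported in this example — presumably it lives in matmmextract.preprocess; confirm and add it to the import above.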
18springer_df = filter_by_publisher(springer_df, pattern=r"\bSpringer\b")
19
20save_csv(springer_df, "output/springer_papers.csv")
21
22
23# ── Step 2: Fetch Springer XMLs ──────────────────────────────────────────────
24springer_fetch(
25    df=springer_df,
26    api_key="",
27    output_dir="_springer",
28    use_open_access=True,
29)
30
31
32# ── Step 3: Extract figures from XMLs ────────────────────────────────────────
33springer_figs, _ = springer_extract(
34    "_springer",
35    output_csv="output/springer_figures.csv",
36)
37
38
39# ── Step 4: Download figure images ───────────────────────────────────────────
40springer_download(
41    csv_path="output/springer_figures.csv",
42    output_dir="images/springer",
43)

Cleanup Intermediate Files

from matmmextract.inference import clean

clean(dry_run=True)
# clean()