Examples
Elsevier Full Pipeline (OpenAlex → Elsevier → Detection → Azure → Dataset)
from matmmextract.preprocess.pipeline import load_csvs
from matmmextract.inference.detector import detect
from matmmextract.inference.cropper import crop
from matmmextract.inference.crop_csv_builder import build_crop_csv
from matmmextract.inference.captioner_azure import captioner as azure_caption
from matmmextract.inference.dataset_builder import build
from matmmextract.openalex.fetcher import fetch_elsevier
from matmmextract.elsevier.extractor import extract_all as elsevier_extract
from matmmextract.elsevier.downloader import download_all as elsevier_download
from matmmextract.elsevier.fetcher import fetch_all as elsevier_fetch


# Files and directories handed from one stage to the next. Each location is
# named once so that a producer and its consumer cannot drift apart.
PAPERS_CSV = "output/elsevier_papers.csv"
XML_DIR = "_elsevier"
FIGURES_CSV = "output/elsevier_figures.csv"
IMAGES_DIR = "images/elsevier"
DETECTIONS_DIR = "inference_results"
CROPS_DIR = "crops"
CROPS_CSV = "output/crops_for_captioning.csv"
SUBCAPTIONS_DIR = "subcaptions"

# Every empty string below (api_key, inst_token, azure_endpoint) is a
# placeholder: fill in your own credentials before running.

# Step 1: search OpenAlex for Elsevier papers and write them to PAPERS_CSV
elsevier_result = fetch_elsevier(
    license_="cc-by",
    keywords=["titanium"],
    from_year=2020,
    to_year=2024,
    max_results=1,
    output_csv=PAPERS_CSV,
    api_key="",
)

# Step 2: load the paper list back in
elsevier_df = load_csvs([PAPERS_CSV])

# Step 3: fetch the Elsevier full texts into XML_DIR
elsevier_fetch(
    df=elsevier_df,
    api_key="",
    inst_token="",
    output_dir=XML_DIR,
)

# Step 4: extract figure records from the fetched files
elsevier_figs, _ = elsevier_extract(
    XML_DIR,
    output_csv=FIGURES_CSV,
)

# Step 5: download the figure images listed in FIGURES_CSV
elsevier_download(
    csv_path=FIGURES_CSV,
    output_dir=IMAGES_DIR,
    api_key="",
    inst_token="",
)

# Step 6: detect (checkpoint is referenced by its Hugging Face URL)
detect(
    image_dir=IMAGES_DIR,
    output_dir=DETECTIONS_DIR,
    checkpoint="https://huggingface.co/CMEG-IITR/yolo12_unique_multimat",
    conf=0.6,
    iou=0.4,
    imgsz=1024,
)

# Step 7: crop
crop(
    image_dir=IMAGES_DIR,
    json_dir=DETECTIONS_DIR,
    output_dir=CROPS_DIR,
)

# Step 8: build captioning CSV
build_crop_csv(
    crops_dir=CROPS_DIR,
    figures_csv=FIGURES_CSV,
    output_csv=CROPS_CSV,
)

# Step 9: generate sub-captions via Azure
azure_caption(
    csv_path=CROPS_CSV,
    output_dir=SUBCAPTIONS_DIR,
    api_key="",
    azure_endpoint="",
    model_name="Mistral-Large-3",
    image_name_col="downloaded_image_name",  # matches crops_for_captioning.csv
    caption_col="caption",
    reference_col="reference_sentences",
)

# Step 10: link crops + captions → final dataset
build(
    images_dir=CROPS_DIR,
    json_dir=SUBCAPTIONS_DIR,
    output_csv="elsevier_linked_dataset.csv",
)
Elsevier from Scopus Export (Scopus → Elsevier)
from matmmextract.preprocess import (
    load_csvs,
    drop_duplicate_dois,
    filter_open_access,
    save_csv,
)

from matmmextract.preprocess import (
    scan_directory,
    filter_figures_cc_by,
)

from matmmextract.elsevier import (
    fetch_all as elsevier_fetch,
    extract_all as elsevier_extract,
    download_all as elsevier_download,
)


# Empty api_key / inst_token strings below are placeholders: fill in your
# own Elsevier credentials before running.

# ── Step 1: Load CSVs ────────────────────────────────────────────────────────
elsevier_df = load_csvs(["scopus.csv"])
elsevier_df = drop_duplicate_dois(elsevier_df)

# Keep only the open-access records. The filtered frame is the one saved and
# fetched below, so the filter actually takes effect.
# NOTE(review): assumes filter_open_access returns the filtered frame — confirm.
elsevier_oa = filter_open_access(elsevier_df)

save_csv(elsevier_oa, "output/elsevier_papers.csv")


# ── Step 2: Fetch Elsevier XMLs ──────────────────────────────────────────────
elsevier_fetch(
    df=elsevier_oa,
    api_key="",
    inst_token="",
    output_dir="_elsevier",
)


# ── Step 3: Extract figures ──────────────────────────────────────────────────
elsevier_figs, _ = elsevier_extract(
    "_elsevier",
    output_csv="output/elsevier_figures.csv",
)


# ── Step 4: CC-BY filtering ──────────────────────────────────────────────────
# Scan the fetched files, then keep only the figures that pass the CC-BY
# filter; the result is also written to elsevier_figures_ccby.csv.
cc_df, _ = scan_directory(
    "_elsevier",
    output_csv="output/elsevier_cc.csv",
)

elsevier_figs_ccby = filter_figures_cc_by(
    elsevier_figs,
    cc_df,
    output_csv="output/elsevier_figures_ccby.csv",
)


# ── Step 5: Download images ──────────────────────────────────────────────────
# Only the CC-BY-filtered figure list is downloaded.
elsevier_download(
    csv_path="output/elsevier_figures_ccby.csv",
    output_dir="images/elsevier",
    api_key="",
    inst_token="",
)
Springer Full Pipeline (OpenAlex → Springer → Detection (model checkpoint from Hugging Face Hub) → Gemini → Dataset)
from matmmextract.openalex import fetch_springer
from matmmextract.preprocess import load_csvs

from matmmextract.springer import (
    fetch_all as springer_fetch,
    extract_all as springer_extract,
    download_all as springer_download,
)

from matmmextract.inference import (
    detect,
    crop,
    build_crop_csv,
    gemini_captioner as gemini_caption,
    build,
)

# Files and directories handed from one stage to the next. Each location is
# named once so that a producer and its consumer cannot drift apart.
PAPERS_CSV = "output/springer_papers.csv"
XML_DIR = "_springer"
FIGURES_CSV = "output/springer_figures.csv"
IMAGES_DIR = "images/springer"
DETECTIONS_DIR = "inference_results"
CROPS_DIR = "crops"
CROPS_CSV = "output/crops_for_captioning.csv"
SUBCAPTIONS_DIR = "subcaptions"

# Every empty api_key string below is a placeholder: fill in your own key
# before running.

# ── Step 1: Search OpenAlex for Springer papers ──────────────────────────────
springer_result = fetch_springer(
    license_=["cc-by", "cc-by-nc"],
    keywords=["alloy"],
    max_results=1,
    output_csv=PAPERS_CSV,
    api_key="",
)

springer_df = load_csvs([PAPERS_CSV])


# ── Step 2: Fetch Springer XMLs ──────────────────────────────────────────────
springer_fetch(
    df=springer_df,
    api_key="",
    output_dir=XML_DIR,
    use_open_access=True,
)


# ── Step 3: Extract figures from XMLs ────────────────────────────────────────
springer_figs, _ = springer_extract(
    XML_DIR,
    output_csv=FIGURES_CSV,
)


# ── Step 4: Download figure images ───────────────────────────────────────────
springer_download(
    csv_path=FIGURES_CSV,
    output_dir=IMAGES_DIR,
)


# ── Step 5: Detect (checkpoint referenced by its Hugging Face URL) ───────────
detect(
    image_dir=IMAGES_DIR,
    output_dir=DETECTIONS_DIR,
    checkpoint="https://huggingface.co/CMEG-IITR/yolo12_unique_multimat",
    conf=0.6,
    iou=0.4,
    imgsz=1024,
)


# ── Step 6: Crop ─────────────────────────────────────────────────────────────
crop(
    image_dir=IMAGES_DIR,
    json_dir=DETECTIONS_DIR,
    output_dir=CROPS_DIR,
)


# ── Step 7: Build captioning CSV ─────────────────────────────────────────────
build_crop_csv(
    crops_dir=CROPS_DIR,
    figures_csv=FIGURES_CSV,
    output_csv=CROPS_CSV,
)


# ── Step 8: Generate sub-captions via Gemini ─────────────────────────────────
gemini_caption(
    csv_path=CROPS_CSV,
    output_dir=SUBCAPTIONS_DIR,
    api_key="",
)


# ── Step 9: Link crops + captions → final dataset ────────────────────────────
build(
    images_dir=CROPS_DIR,
    json_dir=SUBCAPTIONS_DIR,
    output_csv="springer_linked_dataset.csv",
)
Springer from Scopus Export (Scopus → Springer)
# NOTE(review): filter_by_publisher is assumed to be exported from
# matmmextract.preprocess alongside the other helpers — confirm the module.
from matmmextract.preprocess import (
    load_csvs,
    drop_duplicate_dois,
    filter_by_publisher,
    save_csv,
)

from matmmextract.springer import (
    fetch_all as springer_fetch,
    extract_all as springer_extract,
    download_all as springer_download,
)


# The empty api_key string below is a placeholder: fill in your own key
# before running.

# ── Step 1: Load CSVs ────────────────────────────────────────────────────────
springer_df = load_csvs(["scopus.csv"])
springer_df = drop_duplicate_dois(springer_df)

# Filter the de-duplicated frame by publisher with a whole-word "Springer"
# pattern. The frame passed in is springer_df; no other frame exists here.
springer_df = filter_by_publisher(springer_df, pattern=r"\bSpringer\b")

save_csv(springer_df, "output/springer_papers.csv")


# ── Step 2: Fetch Springer XMLs ──────────────────────────────────────────────
springer_fetch(
    df=springer_df,
    api_key="",
    output_dir="_springer",
    use_open_access=True,
)


# ── Step 3: Extract figures from XMLs ────────────────────────────────────────
springer_figs, _ = springer_extract(
    "_springer",
    output_csv="output/springer_figures.csv",
)


# ── Step 4: Download figure images ───────────────────────────────────────────
springer_download(
    csv_path="output/springer_figures.csv",
    output_dir="images/springer",
)
Cleanup Intermediate Files
# Clean up the intermediate files left behind by the pipelines.
from matmmextract.inference import clean
# Start with a dry run. NOTE(review): presumably this only reports what would
# be removed without deleting anything — confirm against the `clean` docs.
clean(dry_run=True)
# Once the dry-run output looks right, uncomment the call below to run the
# cleanup without dry_run.
# clean()