# Source code for matmmextract.inference.cleaner
"""
matmmextract.inference.cleaner
==================================
Delete intermediate files and directories after the full pipeline
has completed successfully, keeping only what matters:
KEPT
----
- crops/ cropped panel images
- images/ original downloaded images
- linked_dataset.csv final output
DELETED
-------
- inference_results/ per-image detection JSONs + _summary.json
- subcaptions/ per-crop captioning JSONs
- output/ all intermediate CSVs
- alloys_elsevier/ fetched Elsevier XMLs
- alloys_springer/ fetched Springer XMLs
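- ``processed_dois_*.txt`` / ``failed_dois_*.txt`` resume files
- ``download_log_*.csv``, ``build_dataset.log`` log files
- ``elsevier_with_xml_paths.csv``, ``springer_with_xml_paths.csv``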
"""
from __future__ import annotations
import argparse
import os
import shutil
from dataclasses import dataclass, field
from pathlib import Path
# [docs]
@dataclass
class CleanResult:
    """Record of what a cleanup pass touched, as lists of stringified paths.

    Each field defaults to its own fresh empty list, so instances never
    share state.
    """

    # Directories that were removed (or would be removed in a dry run).
    deleted_dirs: list[str] = field(default_factory=list)
    # Files that were removed (or would be removed in a dry run).
    deleted_files: list[str] = field(default_factory=list)
    # NOTE(review): nothing in the visible code appends to this list;
    # presumably reserved for reporting preserved paths -- confirm.
    kept: list[str] = field(default_factory=list)
    # Explicitly requested dirs/files that were absent or of the wrong type.
    skipped_missing: list[str] = field(default_factory=list)
# [docs]
def clean(
    base_dir: str | Path = ".",
    delete_dirs: list[str] | None = None,
    delete_files: list[str] | None = None,
    delete_glob_patterns: list[str] | None = None,
    dry_run: bool = False,
    verbose: bool = True,
) -> CleanResult:
    """Delete intermediate pipeline artifacts.

    Parameters
    ----------
    base_dir:
        Root directory to resolve paths from (default: current directory).
    delete_dirs:
        List of directory names/paths to delete recursively.
        Defaults to the standard pipeline intermediates.
    delete_files:
        List of specific file paths to delete.
        Defaults to the two ``*_with_xml_paths.csv`` files.
    delete_glob_patterns:
        Glob patterns relative to base_dir, e.g. ``["*.txt", "output/*.csv"]``.
        Only regular files matching a pattern are deleted.
    dry_run:
        Print what would be deleted without actually deleting.
    verbose:
        Print each deleted item and a final summary line.

    Returns
    -------
    CleanResult
        Paths of the deleted directories and files, plus the explicitly
        requested directories/files that were missing (or of the wrong
        type) and therefore skipped.

    Notes
    -----
    * A symlink that points to a directory is removed as a link only; the
      directory it points to is left untouched. (``shutil.rmtree`` refuses
      to operate on symlinks and would otherwise abort the whole cleanup.)
    * A dry run reports exactly what a real run would report: a path that
      an earlier step already covers (the same path, or anything below a
      directory slated for deletion) is not counted a second time.

    Examples
    --------
    >>> from matmmextract.inference.cleaner import clean
    >>> clean() # delete all standard intermediates
    >>> clean(dry_run=True) # preview without deleting
    >>> clean(delete_dirs=["output"]) # delete only output/
    """
    base_dir = Path(base_dir).resolve()

    # Defaults
    if delete_dirs is None:
        delete_dirs = [
            "inference_results",
            "subcaptions",
            "output",
            "alloys_elsevier",
            "alloys_springer",
        ]
    if delete_files is None:
        delete_files = [
            "elsevier_with_xml_paths.csv",
            "springer_with_xml_paths.csv",
        ]
    if delete_glob_patterns is None:
        delete_glob_patterns = [
            "processed_dois_*.txt",
            "failed_dois_*.txt",
            "download_log_*.csv",
            "build_dataset.log",
        ]

    result = CleanResult()
    action = "Would delete" if dry_run else "Deleted"

    # Every path removed (or, in a dry run, slated for removal) so far.
    # In a real run these no longer exist on disk; tracking them lets a
    # dry run reach the same verdicts without touching the filesystem.
    handled: set[Path] = set()

    def already_gone(path: Path) -> bool:
        """Return True if *path* or one of its ancestors was already handled."""
        return path in handled or any(parent in handled for parent in path.parents)

    # Directories
    for name in delete_dirs:
        target = base_dir / name
        if already_gone(target) or not target.is_dir():
            result.skipped_missing.append(str(target))
            continue
        if not dry_run:
            if target.is_symlink():
                # is_dir() follows symlinks but rmtree() rejects them:
                # drop the link itself and leave its target alone.
                target.unlink()
            else:
                shutil.rmtree(target)
        handled.add(target)
        result.deleted_dirs.append(str(target))
        if verbose:
            print(f"[cleaner] {action} dir : {target}")

    # Specific files
    for name in delete_files:
        target = base_dir / name
        if already_gone(target) or not target.is_file():
            result.skipped_missing.append(str(target))
            continue
        if not dry_run:
            target.unlink()
        handled.add(target)
        result.deleted_files.append(str(target))
        if verbose:
            print(f"[cleaner] {action} file : {target}")

    # Glob patterns
    for pattern in delete_glob_patterns:
        # sorted() materialises the matches before anything is unlinked
        # and keeps the reporting order deterministic.
        for target in sorted(base_dir.glob(pattern)):
            if already_gone(target) or not target.is_file():
                continue
            if not dry_run:
                target.unlink()
            handled.add(target)
            result.deleted_files.append(str(target))
            if verbose:
                print(f"[cleaner] {action} file : {target}")

    if verbose:
        print(
            f"\n[cleaner] {'(dry run) ' if dry_run else ''}"
            f"dirs={len(result.deleted_dirs)} "
            f"files={len(result.deleted_files)} "
            f"skipped={len(result.skipped_missing)}"
        )
    return result
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
description="Delete intermediate pipeline files, keep crops + images + linked_dataset.csv"
)
p.add_argument(
"--base-dir", default=".",
help="Root directory (default: current directory)"
)
p.add_argument(
"--dry-run", action="store_true",
help="Show what would be deleted without deleting"
)
p.add_argument(
"--dirs", nargs="+", default=None,
help="Override default dirs to delete"
)
p.add_argument(
"--files", nargs="+", default=None,
help="Override default files to delete"
)
return p.parse_args()
def main() -> None:
    """Command-line entry point: parse the options and run ``clean``."""
    options = _parse_args()
    # --dirs / --files are None when omitted, so clean() falls back to its
    # built-in defaults for them.
    clean(
        options.base_dir,
        delete_dirs=options.dirs,
        delete_files=options.files,
        dry_run=options.dry_run,
    )
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()