# pray-calc-ml/scripts/process_tess_batch.py

"""
Process all downloaded TESS monthly CSV files using the existing tess_processor.

Run this after download_all_tess.py has fetched the raw files.
Writes one output CSV per month to data/raw/raw_sightings/tess_{month}.csv

Usage:
    python scripts/process_tess_batch.py [--month jan2019 [feb2019 ...]] [--skip-existing]

Without --month, every tess_*.csv found in data/raw/tess/ is processed.
"""
from __future__ import annotations

import csv
import logging
import sys
import time
from pathlib import Path

# Add repo root to path
REPO_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(REPO_ROOT))

from src.collect.tess_processor import process_tess_csv  # noqa: E402

# Module-level logger; its handler, level and format are configured in main().
log = logging.getLogger(__name__)

# Directory searched for the raw monthly input files (tess_*.csv).
RAW_TESS_DIR = REPO_ROOT / "data" / "raw" / "tess"
# Directory that receives one processed tess_{month}.csv per input file.
SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings"
# Header and column order of every output CSV; passed to csv.DictWriter
# in process_month().
FIELDNAMES = ["prayer", "date_local", "time_local", "utc_offset",
              "lat", "lng", "elevation_m", "source", "notes"]


def process_month(csv_path: Path, output_path: Path) -> int:
    """Process one TESS monthly CSV and write its events to ``output_path``.

    Args:
        csv_path: Raw TESS monthly CSV, handed to ``process_tess_csv``.
        output_path: Destination CSV. It is only (re)written when at least
            one record was produced, so an existing file is left untouched
            for an empty or failed month.

    Returns:
        The number of records produced, or 0 if processing raised. Failures
        are logged instead of propagated so that a batch run can carry on
        with the remaining months.
    """
    log.info("Processing: %s", csv_path.name)
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments during a long-running month.
    started = time.perf_counter()
    try:
        records = process_tess_csv(csv_path)
    except Exception as e:
        # Deliberate best-effort boundary for the batch; keep the traceback
        # in the log so the failing month can still be diagnosed.
        log.error("Error processing %s: %s", csv_path.name, e, exc_info=True)
        return 0

    elapsed = time.perf_counter() - started
    log.info("  -> %d events in %.1fs", len(records), elapsed)

    if records:
        # Explicit UTF-8 makes the output independent of the platform's
        # locale encoding and avoids UnicodeEncodeError on non-ASCII text.
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()
            writer.writerows(records)
        log.info("  -> Written: %s", output_path.name)

    return len(records)


def _count_data_rows(path: Path) -> int:
    """Return the number of CSV records in ``path``, excluding the header row."""
    # csv.reader (with newline="") counts records rather than physical lines,
    # so quoted fields containing newlines are not over-counted. Undecodable
    # bytes are replaced because only the row count matters here.
    with open(path, newline="", encoding="utf-8", errors="replace") as f:
        total = sum(1 for row in csv.reader(f) if row)
    # An empty file has no header to subtract; never report a negative count.
    return max(0, total - 1)


def main() -> None:
    """Command-line entry point: process TESS monthly CSVs and print a summary.

    Processes the months named with ``--month``, or every ``tess_*.csv`` in
    ``RAW_TESS_DIR`` when none are given. With ``--skip-existing``, months
    whose output file already exists are not reprocessed; their existing row
    count is reported instead. Exits with status 1 when there is nothing to
    process.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--month", nargs="+", default=None, help="Process specific month(s) (e.g. jan2019 feb2019)")
    parser.add_argument("--skip-existing", action="store_true",
                        help="Skip months already in raw_sightings/")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        stream=sys.stdout,
    )

    SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)

    if args.month:
        files = [RAW_TESS_DIR / f"tess_{m}.csv" for m in args.month]
    else:
        files = sorted(RAW_TESS_DIR.glob("tess_*.csv"))

    if not files:
        log.error("No TESS CSV files found in %s", RAW_TESS_DIR)
        sys.exit(1)

    log.info("Found %d TESS CSV files to process", len(files))

    total_events = 0
    results = []  # (month label, event count, status) rows for the summary

    for csv_path in files:
        # Explicitly requested months may not have been downloaded yet.
        if not csv_path.exists():
            log.warning("File not found: %s", csv_path)
            continue

        month_label = csv_path.stem.replace("tess_", "")
        output_path = SIGHTINGS_DIR / f"tess_{month_label}.csv"

        if args.skip_existing and output_path.exists():
            existing = _count_data_rows(output_path)
            log.info("SKIP (exists): %s (%d rows)", output_path.name, existing)
            total_events += existing
            results.append((month_label, existing, "skipped"))
            continue

        n = process_month(csv_path, output_path)
        total_events += n
        results.append((month_label, n, "processed"))

    # Print summary table
    print("\n" + "=" * 55)
    print(f"{'Month':<12} {'Events':>8}  {'Status'}")
    print("-" * 55)
    for month, n, status in results:
        print(f"{month:<12} {n:>8}  {status}")
    print("-" * 55)
    print(f"{'TOTAL':<12} {total_events:>8}")
    print("=" * 55)


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()