""" Download all available TESS/Stars4All monthly archives from Zenodo, then process each with the existing tess_processor.py. Records discovered via Zenodo API (communities/stars4all): - Sep 2016 through Aug 2020, with one gap (Aug 2018 not published) - Total: ~47 months of data Usage: python scripts/download_all_tess.py [--skip-existing] [--process-only] """ from __future__ import annotations import argparse import csv import logging import os import subprocess import sys import time from pathlib import Path log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Complete manifest of all TESS Zenodo records # Columns: zenodo_id, month_label, filename, size_mb (approx) # --------------------------------------------------------------------------- TESS_RECORDS = [ # 2016 — smaller network, fewer stations, lower data volume # (Sep 2016 is JSON format, not CSV — skipping) # ("163341", "sep2016", "tess-september-2016.json", 12), # JSON, skip ("321291", "oct2016", "tess-october-2016.csv", 28), ("321292", "nov2016", "tess-november-2016.csv", 5), ("321305", "dec2016", "tess-december-2016.csv", 20), # 2017 ("321318", "jan2017", "tess-january-2017.csv", 13), ("375804", "feb2017", "tess-february-2017.csv", 23), ("470930", "mar2017", "tess-march-2017.csv", 33), ("572613", "apr2017", "tess-april-2017.csv", 34), ("824124", "may2017", "tess-may-2017.csv", 30), ("824128", "jun2017", "tess-june-2017.csv", 46), ("996285", "jul2017", "tess-july-2017.csv", 46), # Aug 2017 not found on Zenodo ("1000545", "sep2017", "tess-sept-2017.csv", 64), ("1042903", "oct2017", "tess-oct-2017.csv", 88), ("1072485", "nov2017", "tess-nov-2017.csv", 96), ("1134692", "dec2017", "tess-dec-2017.csv", 97), # 2018 ("1164632", "jan2018", "tess-january-2018.csv", 73), ("1215915", "feb2018", "tess-february-2018.csv", 89), ("1215546", "mar2018", "tess-march-2018.csv", 87), ("1240139", "apr2018", "tess-april-2018.csv", 88), ("1257445", "may2018", "tess-may-2018.csv", 94), ("1306765", "jun2018", "tess-june-2018.csv", 87), ("1332003", "jul2018", "tess-july-2018.csv", 92), # Aug 2018 — not published on Zenodo, skip ("1442525", "sep2018", "tess-september-2018.csv", 113), ("1479009", "oct2018", "tess-october-2018.csv", 133), ("1883080", "nov2018", "tess-november-2018.csv", 134), ("2536130", "dec2018", "tess-december-2018.csv", 151), # 2019 ("2561327", "jan2019", "tess-january-2019.csv", 168), ("2620256", "feb2019", "tess-february-2019.csv", 152), ("2620261", "mar2019", "tess-march-2019.csv", 162), ("3377710", "apr2019", "tess-april-2019.csv", 159), ("3378310", "may2019", "tess-may-2019.csv", 141), # Jun 2019 already downloaded — still include for processing ("3378728", "jun2019", "tess-june-2019.csv", 147), ("3564263", "jul2019", "tess-july-2019.csv", 147), ("3564257", "aug2019", "tess-august-2019.csv", 155), ("3564221", "sep2019", "tess-september-2019.csv", 142), ("3564196", "oct2019", "tess-october-2019.csv", 144), ("3563539", "nov2019", "tess-november-2019.csv", 135), ("3758045", "dec2019", "tess-december-2019.csv", 144), # 2020 ("4264883", "jan2020", "stars4all-january-2020.csv", 359), ("4264914", "feb2020", "stars4all-february-2020.csv", 358), ("4264946", "mar2020", "stars4all-march-2020.csv", 368), ("4264991", "apr2020", "stars4all-april-2020.csv", 338), ("4265020", "may2020", "stars4all-may-2020.csv", 318), ("4265108", "jun2020", "stars4all-june-2020.csv", 323), ("4265155", "jul2020", "stars4all-july-2020.csv", 323), ("4265171", "aug2020", "stars4all-august-2020.csv", 324), ] REPO_ROOT = Path(__file__).parent.parent RAW_TESS_DIR = REPO_ROOT / "data" / "raw" / "tess" SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings" VENV_PYTHON = REPO_ROOT / ".venv" / "bin" / "python" def zenodo_download_url(record_id: str, filename: str) -> str: return f"https://zenodo.org/records/{record_id}/files/{filename}?download=1" def download_file(url: str, dest: Path, size_mb: int) -> bool: """Download url to dest using curl. Returns True on success.""" log.info("Downloading %s -> %s (~%d MB)", url, dest.name, size_mb) tmp = dest.with_suffix(".tmp") cmd = [ "curl", "-L", "--retry", "3", "--retry-delay", "5", "--connect-timeout", "30", "--max-time", "3600", "-o", str(tmp), url, ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: log.error("Download failed: %s", result.stderr[:500]) tmp.unlink(missing_ok=True) return False # Sanity check: file must be at least 1 MB if tmp.stat().st_size < 1_000_000: log.error("Downloaded file too small (%d bytes), likely an error page", tmp.stat().st_size) tmp.unlink(missing_ok=True) return False tmp.rename(dest) log.info("Downloaded %s (%.1f MB)", dest.name, dest.stat().st_size / 1e6) return True def process_month(csv_path: Path, output_path: Path) -> int: """ Run the TESS processor on csv_path, write results to output_path. Returns number of twilight events extracted. """ script = f""" import sys, csv sys.path.insert(0, '{REPO_ROOT}') from src.collect.tess_processor import process_tess_csv from pathlib import Path csv_path = Path('{csv_path}') records = process_tess_csv(csv_path) print(f'RECORDS_COUNT: {{len(records)}}', flush=True) if records: output_path = Path('{output_path}') fieldnames = ['prayer', 'date_local', 'time_local', 'utc_offset', 'lat', 'lng', 'elevation_m', 'source', 'notes'] with open(output_path, 'w', newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(records) print(f'Written to {{output_path}}', flush=True) """ result = subprocess.run( [str(VENV_PYTHON), "-c", script], capture_output=True, text=True, timeout=7200, cwd=str(REPO_ROOT), ) # Extract record count from output count = 0 for line in result.stdout.splitlines(): if line.startswith("RECORDS_COUNT:"): count = int(line.split(":")[1].strip()) if result.returncode != 0: log.error("Processor error for %s: %s", csv_path.name, result.stderr[-1000:]) return 0 log.info("Processed %s: %d twilight events", csv_path.name, count) return count def main(): parser = argparse.ArgumentParser(description="Download and process all TESS monthly archives") parser.add_argument("--skip-existing", action="store_true", help="Skip download if raw CSV already exists") parser.add_argument("--process-only", action="store_true", help="Skip downloads, only process existing CSVs") parser.add_argument("--download-only", action="store_true", help="Only download, do not process") parser.add_argument("--months", nargs="+", default=None, help="Only process these month labels (e.g. jan2019 feb2019)") args = parser.parse_args() logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S", stream=sys.stdout, ) RAW_TESS_DIR.mkdir(parents=True, exist_ok=True) SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True) records_to_process = TESS_RECORDS if args.months: records_to_process = [r for r in TESS_RECORDS if r[1] in args.months] log.info("Filtered to %d months: %s", len(records_to_process), args.months) total_downloaded = 0 total_processed = 0 total_events = 0 results = [] for zenodo_id, month_label, filename, size_mb in records_to_process: raw_path = RAW_TESS_DIR / f"tess_{month_label}.csv" output_path = SIGHTINGS_DIR / f"tess_{month_label}.csv" log.info("=== %s (Zenodo %s) ===", month_label, zenodo_id) # --- Download --- if not args.process_only: if raw_path.exists() and args.skip_existing: log.info("Skipping download (exists): %s", raw_path.name) elif raw_path.exists() and raw_path.stat().st_size > 1_000_000: log.info("File already present: %s (%.1f MB)", raw_path.name, raw_path.stat().st_size / 1e6) else: url = zenodo_download_url(zenodo_id, filename) ok = download_file(url, raw_path, size_mb) if not ok: log.error("FAILED to download %s, skipping processing", month_label) results.append((month_label, "download_failed", 0)) continue total_downloaded += 1 # --- Process --- if not args.download_only: if not raw_path.exists(): log.warning("Raw file missing, cannot process: %s", raw_path) results.append((month_label, "no_file", 0)) continue n_events = process_month(raw_path, output_path) total_processed += 1 total_events += n_events results.append((month_label, "ok", n_events)) else: results.append((month_label, "download_only", 0)) # Summary print("\n" + "=" * 70) print(f"SUMMARY: {total_downloaded} downloaded, {total_processed} processed, " f"{total_events} total twilight events") print("=" * 70) print(f"{'Month':<12} {'Status':<18} {'Events':>8}") print("-" * 40) for month, status, count in results: print(f"{month:<12} {status:<18} {count:>8}") print("-" * 40) print(f"{'TOTAL':<12} {'':<18} {total_events:>8}") if __name__ == "__main__": main()