pray-calc-ml/scripts/download_all_tess.py
Aric Camarata ada08e7ec4 data: expand dataset from 5.9k to 91k records via 6 new SQM sources
Add 6 new data collection pipelines and their processed outputs:

Sources added:
- TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020),
  ~40k raw events from 100+ European stations via Zenodo archives
- Globe at Night citizen science: 26k twilight observations (2006-2024),
  filtered from 308k total observations for solar depression 6-22 deg
- GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025),
  ~12.5k twilight events from 88 stations across 20+ countries
- Galicia SQM network: 14 stations, 1-min resolution, 7.5k events
- Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events
- washetdonker.nl Netherlands: 7 stations, 3.3k morning events
- Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer

Pipeline changes:
- ingest.py: add all new files to APPROVED_RAW_CSVS allowlist,
  fix filter to use allowlist instead of hardcoded exclusions
- .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN,
  washetdonker, Globe at Night downloads)

Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records
Previous: 5,871 Fajr + 46 Isha = 5,917 total records
2026-03-22 16:39:29 -04:00

245 lines
9.7 KiB
Python

"""
Download all available TESS/Stars4All monthly archives from Zenodo,
then process each with the existing tess_processor.py.
Records discovered via Zenodo API (communities/stars4all):
- Sep 2016 through Aug 2020; Sep 2016 is published as JSON only and is
  skipped, and Aug 2017 and Aug 2018 are not available on Zenodo
- Total: 45 monthly CSV archives
Usage:
python scripts/download_all_tess.py [--skip-existing] [--process-only] [--download-only] [--months LABEL [LABEL ...]]
"""
from __future__ import annotations
import argparse
import csv
import logging
import os
import subprocess
import sys
import time
from pathlib import Path
# Module-level logger; its level, format and stream are configured in main()
# through logging.basicConfig.
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Complete manifest of all TESS Zenodo records
# Each entry is a tuple: (zenodo_id, month_label, filename, size_mb)
#   zenodo_id   - Zenodo record ID, used to build the download URL
#   month_label - short label used in local file names and by the --months filter
#   filename    - name of the file inside the Zenodo record
#   size_mb     - approximate size in MB; only shown in the download log message
# ---------------------------------------------------------------------------
TESS_RECORDS: list[tuple[str, str, str, int]] = [
# 2016 — smaller network, fewer stations, lower data volume
# (Sep 2016 is JSON format, not CSV — skipping)
# ("163341", "sep2016", "tess-september-2016.json", 12), # JSON, skip
("321291", "oct2016", "tess-october-2016.csv", 28),
("321292", "nov2016", "tess-november-2016.csv", 5),
("321305", "dec2016", "tess-december-2016.csv", 20),
# 2017
("321318", "jan2017", "tess-january-2017.csv", 13),
("375804", "feb2017", "tess-february-2017.csv", 23),
("470930", "mar2017", "tess-march-2017.csv", 33),
("572613", "apr2017", "tess-april-2017.csv", 34),
("824124", "may2017", "tess-may-2017.csv", 30),
("824128", "jun2017", "tess-june-2017.csv", 46),
("996285", "jul2017", "tess-july-2017.csv", 46),
# Aug 2017 not found on Zenodo
("1000545", "sep2017", "tess-sept-2017.csv", 64),
("1042903", "oct2017", "tess-oct-2017.csv", 88),
("1072485", "nov2017", "tess-nov-2017.csv", 96),
("1134692", "dec2017", "tess-dec-2017.csv", 97),
# 2018
("1164632", "jan2018", "tess-january-2018.csv", 73),
("1215915", "feb2018", "tess-february-2018.csv", 89),
("1215546", "mar2018", "tess-march-2018.csv", 87),
("1240139", "apr2018", "tess-april-2018.csv", 88),
("1257445", "may2018", "tess-may-2018.csv", 94),
("1306765", "jun2018", "tess-june-2018.csv", 87),
("1332003", "jul2018", "tess-july-2018.csv", 92),
# Aug 2018 — not published on Zenodo, skip
("1442525", "sep2018", "tess-september-2018.csv", 113),
("1479009", "oct2018", "tess-october-2018.csv", 133),
("1883080", "nov2018", "tess-november-2018.csv", 134),
("2536130", "dec2018", "tess-december-2018.csv", 151),
# 2019
("2561327", "jan2019", "tess-january-2019.csv", 168),
("2620256", "feb2019", "tess-february-2019.csv", 152),
("2620261", "mar2019", "tess-march-2019.csv", 162),
("3377710", "apr2019", "tess-april-2019.csv", 159),
("3378310", "may2019", "tess-may-2019.csv", 141),
# Jun 2019 already downloaded — still include for processing
("3378728", "jun2019", "tess-june-2019.csv", 147),
("3564263", "jul2019", "tess-july-2019.csv", 147),
("3564257", "aug2019", "tess-august-2019.csv", 155),
("3564221", "sep2019", "tess-september-2019.csv", 142),
("3564196", "oct2019", "tess-october-2019.csv", 144),
("3563539", "nov2019", "tess-november-2019.csv", 135),
("3758045", "dec2019", "tess-december-2019.csv", 144),
# 2020
("4264883", "jan2020", "stars4all-january-2020.csv", 359),
("4264914", "feb2020", "stars4all-february-2020.csv", 358),
("4264946", "mar2020", "stars4all-march-2020.csv", 368),
("4264991", "apr2020", "stars4all-april-2020.csv", 338),
("4265020", "may2020", "stars4all-may-2020.csv", 318),
("4265108", "jun2020", "stars4all-june-2020.csv", 323),
("4265155", "jul2020", "stars4all-july-2020.csv", 323),
("4265171", "aug2020", "stars4all-august-2020.csv", 324),
]
# Repository root: two levels up from this file, which is run as
# scripts/download_all_tess.py.
REPO_ROOT = Path(__file__).parent.parent
# Directory that holds the downloaded raw monthly CSVs.
RAW_TESS_DIR = REPO_ROOT / "data" / "raw" / "tess"
# Directory that receives the processed per-month twilight-event CSVs.
SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings"
# Interpreter used to run the processor subprocess (POSIX-style venv path).
VENV_PYTHON = REPO_ROOT / ".venv" / "bin" / "python"
def zenodo_download_url(record_id: str, filename: str) -> str:
    """Return the direct-download URL for one file of a Zenodo record.

    Args:
        record_id: Zenodo record ID (the number in ``zenodo.org/records/<id>``).
        filename: Name of the file inside that record.

    Returns:
        The ``https://zenodo.org/records/<id>/files/<filename>?download=1`` URL,
        with the filename percent-encoded.
    """
    from urllib.parse import quote

    # Percent-encode the filename so names with spaces, '#', '?' etc. still
    # form a valid URL; plain names such as "tess-june-2019.csv" are unchanged.
    return f"https://zenodo.org/records/{record_id}/files/{quote(filename)}?download=1"
def download_file(url: str, dest: Path, size_mb: int) -> bool:
    """Download ``url`` to ``dest`` using curl, staging through a temp file.

    The payload is first written to ``dest`` with its suffix replaced by
    ``.tmp`` and only moved onto ``dest`` after it passes a minimum-size
    check, so a failed or truncated transfer never leaves a bogus ``dest``.

    Args:
        url: Source URL.
        dest: Final destination path.
        size_mb: Approximate expected size in MB; used for the log message only.

    Returns:
        True if the file was downloaded and moved into place, False otherwise.
    """
    log.info("Downloading %s -> %s (~%d MB)", url, dest.name, size_mb)
    tmp = dest.with_suffix(".tmp")
    cmd = [
        # --fail makes curl exit non-zero on HTTP errors (404, 5xx) instead of
        # saving the server's error page as if it were the data file.
        "curl", "-L", "--fail", "--retry", "3", "--retry-delay", "5",
        "--connect-timeout", "30", "--max-time", "3600",
        "-o", str(tmp), url,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # curl is not installed or not executable: report it as a failed
        # download instead of aborting the whole batch.
        log.error("Download failed: could not run curl: %s", exc)
        return False
    if result.returncode != 0:
        log.error("Download failed: %s", result.stderr[:500])
        tmp.unlink(missing_ok=True)
        return False

    try:
        size = tmp.stat().st_size
    except FileNotFoundError:
        # curl reported success but did not create the output file.
        log.error("Download failed: curl produced no output file for %s", dest.name)
        return False

    # Sanity check: file must be at least 1 MB
    if size < 1_000_000:
        log.error("Downloaded file too small (%d bytes), likely an error page", size)
        tmp.unlink(missing_ok=True)
        return False

    # replace() overwrites an existing (e.g. previously truncated) dest on
    # every platform, whereas rename() raises on Windows if dest exists.
    tmp.replace(dest)
    log.info("Downloaded %s (%.1f MB)", dest.name, size / 1e6)
    return True
def process_month(csv_path: Path, output_path: Path) -> int:
    """Run the TESS processor on ``csv_path`` and write results to ``output_path``.

    The processor runs in a separate Python process with the repository root
    as working directory. When it yields at least one record, the records are
    written to ``output_path`` as CSV; when it yields none, no file is written.

    Args:
        csv_path: Raw monthly TESS CSV to process.
        output_path: Destination CSV for the extracted records.

    Returns:
        Number of twilight events extracted. 0 is also returned when the
        processor exits with an error or exceeds the two-hour timeout.
    """
    # The paths are passed as command-line arguments instead of being pasted
    # into the source text, so quotes or backslashes in a path cannot break
    # (or alter) the generated script.
    script = """
import csv
import sys
from pathlib import Path

repo_root, csv_arg, output_arg = sys.argv[1:4]
sys.path.insert(0, repo_root)
from src.collect.tess_processor import process_tess_csv

records = process_tess_csv(Path(csv_arg))
print(f'RECORDS_COUNT: {len(records)}', flush=True)
if records:
    output_path = Path(output_arg)
    fieldnames = ['prayer', 'date_local', 'time_local', 'utc_offset',
                  'lat', 'lng', 'elevation_m', 'source', 'notes']
    with open(output_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    print(f'Written to {output_path}', flush=True)
"""
    # Prefer the project's virtualenv interpreter; fall back to the interpreter
    # running this script when that path does not exist (e.g. other venv layout).
    if VENV_PYTHON.exists():
        interpreter = str(VENV_PYTHON)
    else:
        interpreter = sys.executable
        log.warning("%s not found, using %s instead", VENV_PYTHON, interpreter)

    try:
        result = subprocess.run(
            [interpreter, "-c", script, str(REPO_ROOT), str(csv_path), str(output_path)],
            capture_output=True, text=True, timeout=7200,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired:
        # One stuck month must not abort the whole batch.
        log.error("Processor timed out for %s", csv_path.name)
        return 0

    if result.returncode != 0:
        log.error("Processor error for %s: %s", csv_path.name, result.stderr[-1000:])
        return 0

    # Extract record count from output
    count = 0
    for line in result.stdout.splitlines():
        if line.startswith("RECORDS_COUNT:"):
            count = int(line.partition(":")[2].strip())

    log.info("Processed %s: %d twilight events", csv_path.name, count)
    return count
def main(argv: list[str] | None = None) -> None:
    """Download and/or process every TESS monthly archive in TESS_RECORDS.

    For each selected month the raw CSV is downloaded (unless already present
    or ``--process-only`` is given) and then processed (unless
    ``--download-only`` is given). A per-month summary table is printed at
    the end.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Download and process all TESS monthly archives")
    parser.add_argument("--skip-existing", action="store_true",
                        help="Skip download if raw CSV already exists")
    # The two "only" modes contradict each other; together they would skip
    # both steps, so argparse rejects the combination.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument("--process-only", action="store_true",
                      help="Skip downloads, only process existing CSVs")
    mode.add_argument("--download-only", action="store_true",
                      help="Only download, do not process")
    parser.add_argument("--months", nargs="+", default=None,
                        help="Only process these month labels (e.g. jan2019 feb2019)")
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        stream=sys.stdout,
    )

    RAW_TESS_DIR.mkdir(parents=True, exist_ok=True)
    SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)

    records_to_process = TESS_RECORDS
    if args.months:
        wanted = set(args.months)
        records_to_process = [r for r in TESS_RECORDS if r[1] in wanted]
        log.info("Filtered to %d months: %s", len(records_to_process), args.months)
        # A mistyped label would otherwise be dropped without any notice.
        unknown = sorted(wanted - {r[1] for r in TESS_RECORDS})
        if unknown:
            log.warning("Unknown month labels ignored: %s", ", ".join(unknown))

    total_downloaded = 0
    total_processed = 0
    total_events = 0
    results = []

    for zenodo_id, month_label, filename, size_mb in records_to_process:
        raw_path = RAW_TESS_DIR / f"tess_{month_label}.csv"
        output_path = SIGHTINGS_DIR / f"tess_{month_label}.csv"
        log.info("=== %s (Zenodo %s) ===", month_label, zenodo_id)

        # --- Download ---
        if not args.process_only:
            have_raw = raw_path.exists()
            if have_raw and args.skip_existing:
                log.info("Skipping download (exists): %s", raw_path.name)
            elif have_raw and raw_path.stat().st_size > 1_000_000:
                # Even without --skip-existing, a file over 1 MB is kept;
                # only smaller files are fetched again.
                log.info("File already present: %s (%.1f MB)", raw_path.name,
                         raw_path.stat().st_size / 1e6)
            else:
                url = zenodo_download_url(zenodo_id, filename)
                ok = download_file(url, raw_path, size_mb)
                if not ok:
                    log.error("FAILED to download %s, skipping processing", month_label)
                    results.append((month_label, "download_failed", 0))
                    continue
                total_downloaded += 1

        # --- Process ---
        if args.download_only:
            results.append((month_label, "download_only", 0))
            continue
        if not raw_path.exists():
            log.warning("Raw file missing, cannot process: %s", raw_path)
            results.append((month_label, "no_file", 0))
            continue
        # process_month returns 0 both for "no events" and for a processor
        # failure, so "ok" here only means the processing step was run.
        n_events = process_month(raw_path, output_path)
        total_processed += 1
        total_events += n_events
        results.append((month_label, "ok", n_events))

    # Summary
    print("\n" + "=" * 70)
    print(f"SUMMARY: {total_downloaded} downloaded, {total_processed} processed, "
          f"{total_events} total twilight events")
    print("=" * 70)
    print(f"{'Month':<12} {'Status':<18} {'Events':>8}")
    print("-" * 40)
    for month, status, count in results:
        print(f"{month:<12} {status:<18} {count:>8}")
    print("-" * 40)
    print(f"{'TOTAL':<12} {'':<18} {total_events:>8}")
if __name__ == "__main__":
    # Run the CLI only when this file is executed as a script, not on import.
    sys.exit(main())