mirror of
https://github.com/acamarata/pray-calc-ml.git
synced 2026-07-01 11:24:26 +00:00
Add 6 new data collection pipelines and their processed outputs: Sources added: - TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020), ~40k raw events from 100+ European stations via Zenodo archives - Globe at Night citizen science: 26k twilight observations (2006-2024), filtered from 308k total observations for solar depression 6-22 deg - GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025), ~12.5k twilight events from 88 stations across 20+ countries - Galicia SQM network: 14 stations, 1-min resolution, 7.5k events - Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events - washetdonker.nl Netherlands: 7 stations, 3.3k morning events - Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer Pipeline changes: - ingest.py: add all new files to APPROVED_RAW_CSVS allowlist, fix filter to use allowlist instead of hardcoded exclusions - .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN, washetdonker, Globe at Night downloads) Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records Previous: 5,871 Fajr + 46 Isha = 5,917 total records
245 lines
9.7 KiB
Python
245 lines
9.7 KiB
Python
"""
|
|
Download all available TESS/Stars4All monthly archives from Zenodo,
|
|
then process each with the existing tess_processor.py.
|
|
|
|
Records discovered via Zenodo API (communities/stars4all):
|
|
- Oct 2016 through Aug 2020 as CSV; Sep 2016 is JSON-only and skipped, and Aug 2017 and Aug 2018 are not available on Zenodo
|
|
- Total: 45 monthly CSV archives
|
|
|
|
Usage:
|
|
python scripts/download_all_tess.py [--skip-existing] [--process-only] [--download-only] [--months LABEL [LABEL ...]]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Module-level logger; main() configures the root handler, level and format
# via logging.basicConfig before any work is done.
log = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
# Manifest of the TESS/Stars4All monthly CSV archives published on Zenodo.
#
# Every entry is a 4-tuple:
#     (zenodo_record_id, month_label, remote_filename, approx_size_mb)
#
# Months deliberately absent from the manifest:
#   - Sep 2016: published as JSON rather than CSV (record 163341,
#     tess-september-2016.json, ~12 MB), so it is skipped.
#   - Aug 2017: not found on Zenodo.
#   - Aug 2018: not published on Zenodo.
# ---------------------------------------------------------------------------

# 2016 -- smaller network, fewer stations, lower data volume.
_TESS_2016 = [
    ("321291", "oct2016", "tess-october-2016.csv", 28),
    ("321292", "nov2016", "tess-november-2016.csv", 5),
    ("321305", "dec2016", "tess-december-2016.csv", 20),
]

# 2017 -- no August archive.
_TESS_2017 = [
    ("321318", "jan2017", "tess-january-2017.csv", 13),
    ("375804", "feb2017", "tess-february-2017.csv", 23),
    ("470930", "mar2017", "tess-march-2017.csv", 33),
    ("572613", "apr2017", "tess-april-2017.csv", 34),
    ("824124", "may2017", "tess-may-2017.csv", 30),
    ("824128", "jun2017", "tess-june-2017.csv", 46),
    ("996285", "jul2017", "tess-july-2017.csv", 46),
    ("1000545", "sep2017", "tess-sept-2017.csv", 64),
    ("1042903", "oct2017", "tess-oct-2017.csv", 88),
    ("1072485", "nov2017", "tess-nov-2017.csv", 96),
    ("1134692", "dec2017", "tess-dec-2017.csv", 97),
]

# 2018 -- no August archive.
_TESS_2018 = [
    ("1164632", "jan2018", "tess-january-2018.csv", 73),
    ("1215915", "feb2018", "tess-february-2018.csv", 89),
    ("1215546", "mar2018", "tess-march-2018.csv", 87),
    ("1240139", "apr2018", "tess-april-2018.csv", 88),
    ("1257445", "may2018", "tess-may-2018.csv", 94),
    ("1306765", "jun2018", "tess-june-2018.csv", 87),
    ("1332003", "jul2018", "tess-july-2018.csv", 92),
    ("1442525", "sep2018", "tess-september-2018.csv", 113),
    ("1479009", "oct2018", "tess-october-2018.csv", 133),
    ("1883080", "nov2018", "tess-november-2018.csv", 134),
    ("2536130", "dec2018", "tess-december-2018.csv", 151),
]

# 2019 -- Jun 2019 was already downloaded earlier; it stays listed so that it
# is still picked up for processing.
_TESS_2019 = [
    ("2561327", "jan2019", "tess-january-2019.csv", 168),
    ("2620256", "feb2019", "tess-february-2019.csv", 152),
    ("2620261", "mar2019", "tess-march-2019.csv", 162),
    ("3377710", "apr2019", "tess-april-2019.csv", 159),
    ("3378310", "may2019", "tess-may-2019.csv", 141),
    ("3378728", "jun2019", "tess-june-2019.csv", 147),
    ("3564263", "jul2019", "tess-july-2019.csv", 147),
    ("3564257", "aug2019", "tess-august-2019.csv", 155),
    ("3564221", "sep2019", "tess-september-2019.csv", 142),
    ("3564196", "oct2019", "tess-october-2019.csv", 144),
    ("3563539", "nov2019", "tess-november-2019.csv", 135),
    ("3758045", "dec2019", "tess-december-2019.csv", 144),
]

# 2020 -- archives are named "stars4all-*" from this year on.
_TESS_2020 = [
    ("4264883", "jan2020", "stars4all-january-2020.csv", 359),
    ("4264914", "feb2020", "stars4all-february-2020.csv", 358),
    ("4264946", "mar2020", "stars4all-march-2020.csv", 368),
    ("4264991", "apr2020", "stars4all-april-2020.csv", 338),
    ("4265020", "may2020", "stars4all-may-2020.csv", 318),
    ("4265108", "jun2020", "stars4all-june-2020.csv", 323),
    ("4265155", "jul2020", "stars4all-july-2020.csv", 323),
    ("4265171", "aug2020", "stars4all-august-2020.csv", 324),
]

# Chronological concatenation of the per-year tables.
TESS_RECORDS = _TESS_2016 + _TESS_2017 + _TESS_2018 + _TESS_2019 + _TESS_2020
|
|
|
|
# Filesystem layout. Everything is anchored at the repository root, taken to
# be the parent of the directory that holds this script.
_SCRIPT_DIR = Path(__file__).parent
REPO_ROOT = _SCRIPT_DIR.parent

# Raw monthly downloads and the processed twilight-event CSVs.
RAW_TESS_DIR = REPO_ROOT.joinpath("data", "raw", "tess")
SIGHTINGS_DIR = REPO_ROOT.joinpath("data", "raw", "raw_sightings")

# Interpreter used to run the TESS processor in a child process.
VENV_PYTHON = REPO_ROOT.joinpath(".venv", "bin", "python")
|
|
|
|
|
|
def zenodo_download_url(record_id: str, filename: str) -> str:
    """Return the direct-download URL of ``filename`` in Zenodo record ``record_id``."""
    record_files = f"https://zenodo.org/records/{record_id}/files"
    return f"{record_files}/{filename}?download=1"
|
|
|
|
|
|
def download_file(url: str, dest: Path, size_mb: int) -> bool:
    """
    Download ``url`` to ``dest`` using curl.

    The payload is written to a sibling ``.tmp`` file first and only moved
    onto ``dest`` once it passes a size sanity check, so a failed or
    truncated transfer never leaves a partial file at ``dest``.

    Args:
        url: Address to fetch.
        dest: Final location of the downloaded file (overwritten if present).
        size_mb: Approximate expected size in MB; used for logging only.

    Returns:
        True if the file was downloaded and moved into place, False otherwise.
    """
    log.info("Downloading %s -> %s (~%d MB)", url, dest.name, size_mb)
    tmp = dest.with_suffix(".tmp")
    cmd = [
        # --fail makes curl exit non-zero on HTTP 4xx/5xx instead of saving
        # the server's error page as if it were the requested file.
        "curl", "-L", "--fail", "--retry", "3", "--retry-delay", "5",
        "--connect-timeout", "30", "--max-time", "3600",
        "-o", str(tmp), url,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        log.error("Download failed: %s", result.stderr[:500])
        tmp.unlink(missing_ok=True)
        return False

    # Sanity check: file must be at least 1 MB. Treat a missing output file
    # (curl exited 0 without writing anything) as a zero-byte download.
    try:
        downloaded_bytes = tmp.stat().st_size
    except FileNotFoundError:
        downloaded_bytes = 0
    if downloaded_bytes < 1_000_000:
        log.error("Downloaded file too small (%d bytes), likely an error page", downloaded_bytes)
        tmp.unlink(missing_ok=True)
        return False

    # replace() overwrites an existing dest on every platform; rename() raises
    # on Windows when the target already exists (e.g. a truncated earlier file).
    tmp.replace(dest)
    log.info("Downloaded %s (%.1f MB)", dest.name, dest.stat().st_size / 1e6)
    return True
|
|
|
|
|
|
def process_month(csv_path: Path, output_path: Path) -> int:
    """
    Run the TESS processor on csv_path, write results to output_path.

    The processor (``src.collect.tess_processor.process_tess_csv``) is run in
    a child interpreter (``VENV_PYTHON``) with ``REPO_ROOT`` as working
    directory. The child prints a ``RECORDS_COUNT: <n>`` line, which is parsed
    here; it writes ``output_path`` only when at least one record was
    extracted.

    Args:
        csv_path: Raw monthly TESS CSV to process.
        output_path: Destination CSV for the extracted twilight events.

    Returns:
        Number of twilight events extracted, or 0 if the processor failed or
        timed out.
    """
    # The child script is a constant: the repo root and both paths travel as
    # command-line arguments instead of being pasted into the source text, so
    # paths containing quotes or backslashes cannot corrupt the script.
    script = """
import sys, csv
from pathlib import Path

repo_root, csv_arg, output_arg = sys.argv[1:4]
sys.path.insert(0, repo_root)
from src.collect.tess_processor import process_tess_csv

csv_path = Path(csv_arg)
records = process_tess_csv(csv_path)
print(f'RECORDS_COUNT: {len(records)}', flush=True)

if records:
    output_path = Path(output_arg)
    fieldnames = ['prayer', 'date_local', 'time_local', 'utc_offset',
                  'lat', 'lng', 'elevation_m', 'source', 'notes']
    with open(output_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    print(f'Written to {output_path}', flush=True)
"""
    timeout_s = 7200
    try:
        result = subprocess.run(
            [str(VENV_PYTHON), "-c", script,
             str(REPO_ROOT), str(csv_path), str(output_path)],
            capture_output=True, text=True, timeout=timeout_s,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired:
        # One slow month must not abort the whole batch; report it like any
        # other processor failure.
        log.error("Processor timed out for %s after %d s", csv_path.name, timeout_s)
        return 0

    if result.returncode != 0:
        log.error("Processor error for %s: %s", csv_path.name, result.stderr[-1000:])
        return 0

    # Extract record count from output
    count = 0
    for line in result.stdout.splitlines():
        if line.startswith("RECORDS_COUNT:"):
            count = int(line.split(":")[1].strip())

    log.info("Processed %s: %d twilight events", csv_path.name, count)
    return count
|
|
|
|
|
|
def main():
    """
    Command-line entry point: download and/or process the TESS monthly archives.

    For every selected manifest entry the raw CSV is fetched into
    ``RAW_TESS_DIR`` (unless ``--process-only``) and then run through
    ``process_month`` into ``SIGHTINGS_DIR`` (unless ``--download-only``).
    A per-month status table and totals are printed at the end.
    """
    parser = argparse.ArgumentParser(description="Download and process all TESS monthly archives")
    parser.add_argument("--skip-existing", action="store_true",
                        help="Skip download if raw CSV already exists")
    parser.add_argument("--process-only", action="store_true",
                        help="Skip downloads, only process existing CSVs")
    parser.add_argument("--download-only", action="store_true",
                        help="Only download, do not process")
    parser.add_argument("--months", nargs="+", default=None,
                        help="Only process these month labels (e.g. jan2019 feb2019)")
    args = parser.parse_args()

    # Together these flags would skip both phases and silently do nothing.
    if args.process_only and args.download_only:
        parser.error("--process-only and --download-only cannot be combined")

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        stream=sys.stdout,
    )

    RAW_TESS_DIR.mkdir(parents=True, exist_ok=True)
    SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)

    records_to_process = TESS_RECORDS
    if args.months:
        wanted = set(args.months)
        # Surface typos: a label that matches no manifest entry would
        # otherwise just shrink the run without any explanation.
        unknown = sorted(wanted - {r[1] for r in TESS_RECORDS})
        if unknown:
            log.warning("Unknown month labels ignored: %s", ", ".join(unknown))
        records_to_process = [r for r in TESS_RECORDS if r[1] in wanted]
        log.info("Filtered to %d months: %s", len(records_to_process), args.months)

    total_downloaded = 0
    total_processed = 0
    total_events = 0
    results = []  # (month_label, status, event_count) rows for the summary

    for zenodo_id, month_label, filename, size_mb in records_to_process:
        raw_path = RAW_TESS_DIR / f"tess_{month_label}.csv"
        output_path = SIGHTINGS_DIR / f"tess_{month_label}.csv"

        log.info("=== %s (Zenodo %s) ===", month_label, zenodo_id)

        # --- Download ---
        if not args.process_only:
            if raw_path.exists() and args.skip_existing:
                log.info("Skipping download (exists): %s", raw_path.name)
            elif raw_path.exists() and raw_path.stat().st_size > 1_000_000:
                # Even without --skip-existing, a file above 1 MB is reused;
                # only smaller (presumably truncated) files are re-fetched.
                log.info("File already present: %s (%.1f MB)", raw_path.name,
                         raw_path.stat().st_size / 1e6)
            else:
                url = zenodo_download_url(zenodo_id, filename)
                ok = download_file(url, raw_path, size_mb)
                if not ok:
                    log.error("FAILED to download %s, skipping processing", month_label)
                    results.append((month_label, "download_failed", 0))
                    continue
                total_downloaded += 1

        # --- Process ---
        if not args.download_only:
            if not raw_path.exists():
                log.warning("Raw file missing, cannot process: %s", raw_path)
                results.append((month_label, "no_file", 0))
                continue

            n_events = process_month(raw_path, output_path)
            total_processed += 1
            total_events += n_events
            results.append((month_label, "ok", n_events))
        else:
            results.append((month_label, "download_only", 0))

    # Summary
    print("\n" + "=" * 70)
    print(f"SUMMARY: {total_downloaded} downloaded, {total_processed} processed, "
          f"{total_events} total twilight events")
    print("=" * 70)
    print(f"{'Month':<12} {'Status':<18} {'Events':>8}")
    print("-" * 40)
    for month, status, count in results:
        print(f"{month:<12} {status:<18} {count:>8}")
    print("-" * 40)
    print(f"{'TOTAL':<12} {'':<18} {total_events:>8}")


if __name__ == "__main__":
    main()
|