mirror of
https://github.com/acamarata/pray-calc-ml.git
synced 2026-06-30 19:04:26 +00:00
Add 6 new data collection pipelines and their processed outputs: Sources added: - TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020), ~40k raw events from 100+ European stations via Zenodo archives - Globe at Night citizen science: 26k twilight observations (2006-2024), filtered from 308k total observations for solar depression 6-22 deg - GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025), ~12.5k twilight events from 88 stations across 20+ countries - Galicia SQM network: 14 stations, 1-min resolution, 7.5k events - Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events - washetdonker.nl Netherlands: 7 stations, 3.3k morning events - Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer Pipeline changes: - ingest.py: add all new files to APPROVED_RAW_CSVS allowlist, fix filter to use allowlist instead of hardcoded exclusions - .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN, washetdonker, Globe at Night downloads) Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records Previous: 5,871 Fajr + 46 Isha = 5,917 total records
117 lines
3.5 KiB
Python
117 lines
3.5 KiB
Python
"""
|
|
Process all downloaded TESS monthly CSV files using the existing tess_processor.
|
|
|
|
Run this after download_all_tess.py has fetched the raw files.
|
|
Writes one output CSV per month to data/raw/raw_sightings/tess_{month}.csv
|
|
|
|
Usage:
|
|
    python scripts/process_tess_batch.py [--month jan2019 feb2019 ...] [--skip-existing]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import logging
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Put the repository root (two levels above this file) at the front of
# sys.path so that the ``src`` package can be imported below.
REPO_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(REPO_ROOT))

from src.collect.tess_processor import process_tess_csv  # noqa: E402

log = logging.getLogger(__name__)

# Directory holding the downloaded monthly TESS CSVs (input) and the
# directory receiving the per-month processed CSVs (output).
RAW_TESS_DIR = REPO_ROOT.joinpath("data", "raw", "tess")
SIGHTINGS_DIR = REPO_ROOT.joinpath("data", "raw", "raw_sightings")

# Header / column order used for every CSV written by this script.
FIELDNAMES = [
    "prayer",
    "date_local",
    "time_local",
    "utc_offset",
    "lat",
    "lng",
    "elevation_m",
    "source",
    "notes",
]
|
|
|
|
|
|
def process_month(csv_path: Path, output_path: Path) -> int:
    """Process one TESS monthly CSV and write its events to ``output_path``.

    The raw file is passed to ``process_tess_csv``; the records it returns
    are written as a CSV whose header is ``FIELDNAMES``. When no records are
    returned nothing is written, so an already existing ``output_path`` is
    left untouched.

    Args:
        csv_path: Raw TESS monthly CSV to process.
        output_path: Destination CSV for the extracted events.

    Returns:
        The number of records extracted, or 0 if processing raised.
    """
    log.info("Processing: %s", csv_path.name)
    started = time.perf_counter()  # monotonic: immune to wall-clock jumps
    try:
        records = process_tess_csv(csv_path)
    except Exception as e:
        # Best-effort batch: a single bad month must not abort the others.
        # log.exception keeps the traceback so the failure can be diagnosed.
        log.exception("Error processing %s: %s", csv_path.name, e)
        return 0

    elapsed = time.perf_counter() - started
    log.info(" -> %d events in %.1fs", len(records), elapsed)

    if records:
        # Explicit UTF-8 so the output bytes do not depend on the platform's
        # locale encoding.
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()
            writer.writerows(records)
        log.info(" -> Written: %s", output_path.name)

    return len(records)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: turn raw TESS CSVs into sightings CSVs.

    With ``--month`` only the named months (``tess_<month>.csv`` inside
    ``RAW_TESS_DIR``) are processed; otherwise every ``tess_*.csv`` found
    there is. With ``--skip-existing`` a month whose output file already
    exists is not reprocessed and its existing row count is reported
    instead. A per-month summary table is printed at the end.

    Exits with status 1 when there are no input files to process.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--month",
        nargs="+",
        default=None,
        help="Process specific month(s) (e.g. jan2019 feb2019)",
    )
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip months already in raw_sightings/",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        stream=sys.stdout,
    )

    SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)

    if args.month:
        files = [RAW_TESS_DIR / f"tess_{m}.csv" for m in args.month]
    else:
        files = sorted(RAW_TESS_DIR.glob("tess_*.csv"))

    if not files:
        log.error("No TESS CSV files found in %s", RAW_TESS_DIR)
        sys.exit(1)

    log.info("Found %d TESS CSV files to process", len(files))

    total_events = 0
    results = []
    prefix = "tess_"

    for csv_path in files:
        if not csv_path.exists():
            log.warning("File not found: %s", csv_path)
            continue

        # Every candidate name starts with "tess_" (built or globbed above).
        # Strip only that leading prefix: str.replace would also remove later
        # occurrences and could map two inputs onto the same output file.
        month_label = csv_path.stem[len(prefix):]
        output_path = SIGHTINGS_DIR / f"tess_{month_label}.csv"

        if args.skip_existing and output_path.exists():
            existing = _count_data_rows(output_path)
            log.info("SKIP (exists): %s (%d rows)", output_path.name, existing)
            total_events += existing
            results.append((month_label, existing, "skipped"))
            continue

        n = process_month(csv_path, output_path)
        total_events += n
        results.append((month_label, n, "processed"))

    # Print summary table
    print("\n" + "=" * 55)
    print(f"{'Month':<12} {'Events':>8} {'Status'}")
    print("-" * 55)
    for month, n, status in results:
        print(f"{month:<12} {n:>8} {status}")
    print("-" * 55)
    print(f"{'TOTAL':<12} {total_events:>8}")
    print("=" * 55)


def _count_data_rows(csv_file: Path) -> int:
    """Return the number of data rows (header excluded) in ``csv_file``.

    The file is closed deterministically, rows are counted with the csv
    parser so quoted fields containing newlines count once, and an empty
    file yields 0 rather than -1. Undecodable bytes are replaced because
    only the row count matters here.
    """
    with open(csv_file, newline="", encoding="utf-8", errors="replace") as f:
        return max(0, sum(1 for _ in csv.reader(f)) - 1)
|
|
|
|
|
|
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|