pray-calc-ml/src/process_globe_at_night.py
Aric Camarata ada08e7ec4 data: expand dataset from 5.9k to 91k records via 6 new SQM sources
Add 6 new data collection pipelines and their processed outputs:

Sources added:
- TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020),
  ~40k raw events from 100+ European stations via Zenodo archives
- Globe at Night citizen science: 26k twilight observations (2006-2024),
  filtered from 308k total observations for solar depression 6-22 deg
- GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025),
  ~12.5k twilight events from 88 stations across 20+ countries
- Galicia SQM network: 14 stations, 1-min resolution, 7.5k events
- Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events
- washetdonker.nl Netherlands: 7 stations, 3.3k morning events
- Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer

Pipeline changes:
- ingest.py: add all new files to APPROVED_RAW_CSVS allowlist,
  fix filter to use allowlist instead of hardcoded exclusions
- .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN,
  washetdonker, Globe at Night downloads)

Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records
Previous: 5,871 Fajr + 46 Isha = 5,917 total records
2026-03-22 16:39:29 -04:00

259 lines
8.7 KiB
Python

"""
Process Globe at Night citizen science observation data.
Computes the solar depression angle of every observation in the available
yearly CSVs (2006-2024) using PyEphem, then keeps only the observations
made during twilight (solar depression 6°-22°).
Output: data/raw/raw_sightings/globe_at_night_twilight.csv
"""
import math
import os
import sys
from datetime import datetime, timezone
import ephem
import numpy as np
import pandas as pd
# Twilight window, expressed as solar depression in degrees below the horizon.
# Both bounds are used inclusively by the filter in process_file().
TWILIGHT_MIN, TWILIGHT_MAX = 6.0, 22.0

# Directory holding the downloaded yearly GaN<year>.csv files (input).
RAW_DIR = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/globe_at_night"

# Directory and file that receive the combined twilight observations (output).
OUT_DIR = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/raw_sightings"
OUT_FILE = os.path.join(OUT_DIR, "globe_at_night_twilight.csv")
def solar_depression_angle(utc_dt: datetime, lat: float, lon: float) -> float:
    """
    Return how far the sun is below the horizon, in degrees.

    The angle is the negated solar altitude as seen from (lat, lon) at
    sea level, so it is positive while the sun is below the horizon and
    negative while it is above.

    Args:
        utc_dt: Moment of the observation. Its calendar fields are handed
            to PyEphem as-is, so it is expected to already be in UTC.
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
    """
    observer = ephem.Observer()
    observer.date = utc_dt.strftime("%Y/%m/%d %H:%M:%S")
    # Coordinates are passed as strings, matching the original call style.
    observer.lat, observer.lon = str(lat), str(lon)
    observer.elevation = 0
    # Zero pressure switches off atmospheric refraction, which yields the
    # purely geometric altitude.
    observer.pressure = 0

    sun = ephem.Sun()
    sun.compute(observer)

    # sun.alt is in radians and positive above the horizon; flip the sign
    # to express it as a depression angle.
    altitude_deg = math.degrees(float(sun.alt))
    return -altitude_deg
def parse_ut_datetime(ut_date: str, ut_time: str):
    """
    Combine a UTDate and a UTTime field into a timezone-aware UTC datetime.

    Accepted shapes are 'YYYY-MM-DD' for the date and 'HH:MM' or
    'HH:MM:SS' for the time (surrounding whitespace is ignored).
    Anything else, including empty values and the text 'nan', yields None.
    """
    if not ut_date or not ut_time:
        return None

    date_text = str(ut_date).strip()
    time_text = str(ut_time).strip()
    # "nan" is what a missing cell looks like once it has been stringified.
    if date_text in ("", "nan") or time_text in ("", "nan"):
        return None

    try:
        clock = time_text.split(":")
        calendar = date_text.split("-")
        if len(clock) < 2 or len(calendar) != 3:
            return None

        hour, minute = int(clock[0]), int(clock[1])
        # Seconds are optional; default to the top of the minute.
        second = int(clock[2]) if len(clock) > 2 else 0
        year, month, day = (int(field) for field in calendar)
        return datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)
    except (ValueError, IndexError):
        # Non-numeric pieces or out-of-range calendar/clock values.
        return None
def load_year(csv_path: str) -> pd.DataFrame:
    """
    Read one yearly CSV with every column kept as text.

    Column headers are stripped of surrounding whitespace. An empty
    DataFrame is returned, and a message is written to stderr, when the
    file cannot be read or when any of the Latitude, Longitude, UTDate or
    UTTime columns is absent.
    """
    needed = {"Latitude", "Longitude", "UTDate", "UTTime"}

    try:
        frame = pd.read_csv(csv_path, dtype=str, low_memory=False)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f" ERROR reading {csv_path}: {e}", file=sys.stderr)
        return pd.DataFrame()

    # Headers may carry stray padding; normalise before the schema check.
    frame.columns = list(map(str.strip, frame.columns))

    absent = needed - set(frame.columns)
    if absent:
        print(
            f" SKIP {os.path.basename(csv_path)}: missing columns {absent}",
            file=sys.stderr,
        )
        return pd.DataFrame()

    return frame
def process_file(csv_path: str) -> pd.DataFrame:
    """
    Extract the twilight observations from one yearly Globe at Night CSV.

    A row is kept only when it has a numeric latitude/longitude inside the
    valid ranges, a parseable UTDate/UTTime, and a computed solar
    depression between TWILIGHT_MIN and TWILIGHT_MAX (both inclusive).
    Each kept row is labelled "fajr" or "isha" from its approximate local
    solar hour.

    Args:
        csv_path: Path to a yearly file named like ``GaN<year>.csv``. The
            year label in the output is whatever remains of the file name
            after removing "GaN" and ".csv".

    Returns:
        A DataFrame with the columns source, year, id, utc_datetime, lat,
        lng, elevation_m, solar_depression_deg, twilight_type,
        sqm_reading, limiting_mag, cloud_cover and country. An empty
        DataFrame is returned when the file cannot be loaded or when it
        contains no twilight rows.
    """
    year = os.path.basename(csv_path).replace("GaN", "").replace(".csv", "")
    print(f"Processing {year}...")

    df = load_year(csv_path)
    if df.empty:
        return pd.DataFrame()
    total_rows = len(df)

    # Drop rows missing lat/lng or UT date/time. The .copy() makes the
    # filtered frame independent so new columns can be assigned safely.
    required = ["Latitude", "Longitude", "UTDate", "UTTime"]
    df = df.dropna(subset=required)
    df = df[(df[required] != "").all(axis=1)].copy()

    # Convert coordinates; errors="coerce" turns unparseable text into NaN
    # instead of raising, so no exception handling is needed here.
    df["lat"] = pd.to_numeric(df["Latitude"], errors="coerce")
    df["lng"] = pd.to_numeric(df["Longitude"], errors="coerce")
    df = df.dropna(subset=["lat", "lng"])
    df = df[df["lat"].between(-90, 90) & df["lng"].between(-180, 180)].copy()

    # Parse the UTC timestamp row by row. A plain list is used instead of
    # DataFrame.apply(axis=1) because apply() returns a DataFrame (not a
    # Series) on an empty frame, which makes the column assignment fail.
    parsed = [
        parse_ut_datetime(ut_date, ut_time)
        for ut_date, ut_time in zip(df["UTDate"], df["UTTime"])
    ]
    # Force a real datetime64[UTC] column so the .dt accessor below never
    # depends on dtype inference. Unparseable or unrepresentable values
    # become NaT and are dropped.
    df["utc_dt"] = pd.to_datetime(
        pd.Series(parsed, index=df.index, dtype="object"),
        utc=True,
        errors="coerce",
    )
    df = df[df["utc_dt"].notna()].copy()
    after_parse = len(df)

    def report(n_twilight, n_fajr, n_isha):
        print(
            f" {year}: {total_rows} rows → {after_parse} with valid UT → "
            f"{n_twilight} twilight ({n_fajr} fajr, {n_isha} isha)"
        )

    # Compute the solar depression angle for each row. This is the slow
    # step: one ephemeris evaluation per observation.
    depressions = []
    for utc_dt, lat, lng in zip(df["utc_dt"], df["lat"], df["lng"]):
        try:
            depressions.append(solar_depression_angle(utc_dt, lat, lng))
        except Exception:
            # Deliberate best effort: a row the ephemeris cannot handle is
            # discarded rather than aborting the whole year.
            depressions.append(float("nan"))
    df["solar_depression_deg"] = pd.Series(
        depressions, index=df.index, dtype="float64"
    )

    # Keep the twilight range only; between() is inclusive on both ends
    # and is False for NaN, so failed computations drop out here as well.
    twilight = df[
        df["solar_depression_deg"].between(TWILIGHT_MIN, TWILIGHT_MAX)
    ].copy()

    if twilight.empty:
        # Nothing to classify or export for this year.
        report(0, 0, 0)
        return pd.DataFrame()

    # Classify as Fajr (morning) or Isha (evening) from the approximate
    # local solar time: UTC hour + longitude / 15, wrapped into 0-24.
    # NOTE(review): minutes are ignored and the morning cut-off is 14h
    # rather than solar noon; kept as-is to preserve the existing
    # classification - confirm whether 12h with minutes was intended.
    solar_hour = (twilight["utc_dt"].dt.hour + twilight["lng"] / 15.0) % 24
    twilight["twilight_type"] = np.where(solar_hour < 14, "fajr", "isha")

    def optional(column, missing, blank_to_nan=False):
        """Return an optional source column, or `missing` when absent."""
        if column not in twilight.columns:
            return missing
        values = twilight[column]
        return values.replace("", np.nan) if blank_to_nan else values

    # Build output columns (the order defines the CSV layout).
    out = pd.DataFrame({
        "source": "globe_at_night",
        "year": year,
        "id": twilight.get("ID", ""),
        "utc_datetime": twilight["utc_dt"].dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "lat": twilight["lat"],
        "lng": twilight["lng"],
        "elevation_m": optional("Elevation(m)", np.nan, blank_to_nan=True),
        "solar_depression_deg": twilight["solar_depression_deg"].round(4),
        "twilight_type": twilight["twilight_type"],
        "sqm_reading": optional("SQMReading", np.nan, blank_to_nan=True),
        "limiting_mag": optional("LimitingMag", np.nan, blank_to_nan=True),
        "cloud_cover": optional("CloudCover", ""),
        "country": optional("Country", ""),
    })

    report(
        len(out),
        (out["twilight_type"] == "fajr").sum(),
        (out["twilight_type"] == "isha").sum(),
    )
    return out
def main():
    """
    Process every yearly file from 2006 through 2024 and write the result.

    Yearly files missing from RAW_DIR are reported on stderr and skipped.
    The surviving twilight rows are concatenated, sorted by UTC timestamp
    and written to OUT_FILE, followed by a printed summary. The process
    exits with status 1 when no year produced any twilight observation.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    frames = []
    for year in range(2006, 2025):
        csv_path = os.path.join(RAW_DIR, f"GaN{year}.csv")
        if os.path.exists(csv_path):
            yearly = process_file(csv_path)
            if not yearly.empty:
                frames.append(yearly)
        else:
            print(f"Missing: {csv_path}", file=sys.stderr)

    if not frames:
        print("No twilight observations found.", file=sys.stderr)
        sys.exit(1)

    # Timestamps are ISO-8601 strings, so a plain sort is chronological.
    combined = pd.concat(frames, ignore_index=True).sort_values("utc_datetime")
    combined.to_csv(OUT_FILE, index=False)

    kinds = combined["twilight_type"]
    total = len(combined)
    fajr = (kinds == "fajr").sum()
    isha = (kinds == "isha").sum()
    with_sqm = combined["sqm_reading"].notna().sum()

    print()
    print("=" * 60)
    print(f"Output: {OUT_FILE}")
    print(f"Total twilight observations: {total:,}")
    print(f" Fajr (morning): {fajr:,}")
    print(f" Isha (evening): {isha:,}")
    print(f" With SQM: {with_sqm:,}")
    print()

    print("Depression angle distribution:")
    depression = combined["solar_depression_deg"]
    edges = [6, 8, 10, 12, 14, 16, 18, 20, 22]
    # Each bin is half-open: lower edge included, upper edge excluded.
    for low, high in zip(edges, edges[1:]):
        n = ((depression >= low) & (depression < high)).sum()
        print(f" {low:2d}°-{high:2d}°: {n:,}")
    print()

    print("Countries (top 10):")
    for country, count in combined["country"].value_counts().head(10).items():
        print(f" {country}: {count:,}")


if __name__ == "__main__":
    main()