# pray-calc-ml/src/process_globe_at_night.py

"""
Process Globe at Night citizen science observation data.

Filters for twilight observations (solar depression 6°-22°) across all
available yearly CSVs (2006-2024) and computes solar depression angles
using PyEphem for each observation.

Output: data/raw/raw_sightings/globe_at_night_twilight.csv
"""

import math
import os
import sys
from datetime import datetime, timezone

import ephem
import numpy as np
import pandas as pd

# Input CSVs and the generated output share one raw-data root directory.
_RAW_ROOT = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw"
RAW_DIR = _RAW_ROOT + "/globe_at_night"      # yearly GaN<year>.csv inputs
OUT_DIR = _RAW_ROOT + "/raw_sightings"       # destination directory
OUT_FILE = os.path.join(OUT_DIR, "globe_at_night_twilight.csv")

# Inclusive solar depression bounds (degrees below the horizon) that an
# observation must fall within to be kept as a twilight observation.
TWILIGHT_MIN, TWILIGHT_MAX = 6.0, 22.0


def solar_depression_angle(utc_dt: datetime, lat: float, lon: float) -> float:
    """
    Return how far the sun is below the horizon, in degrees.

    The observer is placed at (lat, lon) with elevation 0 and pressure 0;
    zero pressure turns off the refraction correction so the result is
    the geometric angle.

    Args:
        utc_dt: Moment of the observation. Its calendar fields are passed
            through unchanged, so it is expected to already be in UTC.
        lat: Latitude, passed to PyEphem as a decimal string.
        lon: Longitude, passed to PyEphem as a decimal string.

    Returns:
        The sun's altitude with the sign flipped: positive when the sun is
        below the horizon, negative when it is above.
    """
    observer = ephem.Observer()
    # Coordinates are handed over as strings, exactly as formatted by str().
    observer.lat, observer.lon = str(lat), str(lon)
    observer.elevation = 0
    observer.pressure = 0  # no refraction -> geometric altitude
    observer.date = utc_dt.strftime("%Y/%m/%d %H:%M:%S")

    sun = ephem.Sun()
    sun.compute(observer)

    # sun.alt is in radians and positive above the horizon; negating it
    # before the conversion yields the depression angle in degrees.
    return math.degrees(-float(sun.alt))


def parse_ut_datetime(ut_date: str, ut_time: str) -> "datetime | None":
    """
    Parse a UTDate + UTTime pair into a timezone-aware UTC datetime.

    Expected formats: '2024-07-27' + '20:33' or '20:33:00'.

    Args:
        ut_date: Calendar date as 'YYYY-MM-DD'. Non-string values are
            converted with str() before parsing.
        ut_time: Time of day as 'HH:MM' or 'HH:MM:SS'. Fields after the
            seconds are ignored.

    Returns:
        A datetime with tzinfo=timezone.utc, or None when either input is
        missing, blank, the literal 'nan', malformed, or out of range.
        This function never raises for bad field values.
    """
    if ut_date is None or ut_time is None:
        return None

    # str() first, truthiness never: this keeps NaN floats and other
    # non-string missing markers from raising before they can be rejected.
    date_text = str(ut_date).strip()
    time_text = str(ut_time).strip()
    if not date_text or not time_text or "nan" in (date_text, time_text):
        return None

    date_fields = date_text.split("-")
    time_fields = time_text.split(":")
    if len(date_fields) != 3 or len(time_fields) < 2:
        return None

    try:
        year, month, day = (int(field) for field in date_fields)
        hour, minute = int(time_fields[0]), int(time_fields[1])
        second = int(time_fields[2]) if len(time_fields) > 2 else 0
        return datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)
    except (ValueError, OverflowError):
        # ValueError: non-numeric or out-of-range field.
        # OverflowError: a field too large for datetime's C-int arguments
        # (e.g. a corrupted year with dozens of digits).
        return None


def load_year(csv_path: str) -> pd.DataFrame:
    """
    Read one yearly CSV with every column kept as text.

    Column names are stripped of surrounding whitespace. If the file
    cannot be read, or any of Latitude / Longitude / UTDate / UTTime is
    absent, a message is written to stderr and an empty DataFrame is
    returned instead.
    """
    needed = {"Latitude", "Longitude", "UTDate", "UTTime"}

    try:
        frame = pd.read_csv(csv_path, dtype=str, low_memory=False)
    except Exception as exc:
        print(f"  ERROR reading {csv_path}: {exc}", file=sys.stderr)
        return pd.DataFrame()

    # Headers sometimes carry stray padding; trim it before checking names.
    frame = frame.rename(columns=str.strip)

    absent = needed - set(frame.columns)
    if absent:
        print(
            f"  SKIP {os.path.basename(csv_path)}: missing columns {absent}",
            file=sys.stderr,
        )
        return pd.DataFrame()

    return frame


def process_file(csv_path: str) -> pd.DataFrame:
    """
    Load one yearly CSV, keep only twilight observations, and return them
    in the output schema.

    Steps: load the file, drop rows with missing or non-numeric
    coordinates or an unparseable UT date/time, compute the solar
    depression angle per row, keep rows whose angle lies in
    [TWILIGHT_MIN, TWILIGHT_MAX], and label each as 'fajr' or 'isha'.

    Every step tolerates a frame with zero rows, so a year in which no
    row survives a filter produces an empty result rather than an error.

    Args:
        csv_path: Path to a yearly file; the year label is the base name
            with 'GaN' and '.csv' removed.

    Returns:
        DataFrame with columns source, year, id, utc_datetime, lat, lng,
        elevation_m, solar_depression_deg, twilight_type, sqm_reading,
        limiting_mag, cloud_cover, country. Empty if the file could not be
        loaded or contains no twilight rows.
    """
    year = os.path.basename(csv_path).replace("GaN", "").replace(".csv", "")
    print(f"Processing {year}...")

    df = load_year(csv_path)
    if df.empty:
        return pd.DataFrame()

    total_rows = len(df)

    # Drop rows missing lat/lng or UT date/time (NaN or empty string).
    # The explicit copy keeps later column assignments off a filtered view.
    df = df.dropna(subset=["Latitude", "Longitude", "UTDate", "UTTime"])
    df = df[
        (df["Latitude"] != "") & (df["Longitude"] != "") &
        (df["UTDate"] != "") & (df["UTTime"] != "")
    ].copy()

    # Convert lat/lng; unparseable text becomes NaN and is dropped below.
    try:
        df["lat"] = pd.to_numeric(df["Latitude"], errors="coerce")
        df["lng"] = pd.to_numeric(df["Longitude"], errors="coerce")
    except (TypeError, ValueError) as exc:
        print(f"  SKIP {year}: cannot convert coordinates: {exc}", file=sys.stderr)
        return pd.DataFrame()

    df = df.dropna(subset=["lat", "lng"])
    df = df[df["lat"].between(-90, 90) & df["lng"].between(-180, 180)].copy()

    # Parse UTC datetimes. Built with zip instead of DataFrame.apply because
    # apply(axis=1) on a zero-row frame can return a DataFrame, not a Series.
    # dtype=object keeps plain datetime objects regardless of their year.
    parsed = [
        parse_ut_datetime(ut_date, ut_time)
        for ut_date, ut_time in zip(df["UTDate"], df["UTTime"])
    ]
    df["utc_dt"] = pd.Series(parsed, index=df.index, dtype=object)
    df = df[df["utc_dt"].notna()].copy()

    after_parse = len(df)

    def compute_depression(utc_dt, lat, lng):
        # Best effort: a row the ephemeris cannot handle is dropped, not fatal.
        try:
            return solar_depression_angle(utc_dt, lat, lng)
        except Exception:
            return float("nan")

    # Compute the solar depression angle for each row (the slow step).
    depressions = [
        compute_depression(utc_dt, lat, lng)
        for utc_dt, lat, lng in zip(df["utc_dt"], df["lat"], df["lng"])
    ]
    df["solar_depression_deg"] = pd.Series(depressions, index=df.index, dtype=float)
    df = df[df["solar_depression_deg"].notna()]

    # Filter: twilight range, inclusive at both ends.
    twilight = df[
        df["solar_depression_deg"].between(TWILIGHT_MIN, TWILIGHT_MAX)
    ].copy()

    # Classify as Fajr (morning) or Isha (evening) with a simple heuristic:
    # approximate solar time = whole UTC hour + lng / 15, wrapped to [0, 24).
    # Anything before 14 counts as morning twilight, the rest as evening.
    utc_hours = pd.Series(
        [utc_dt.hour for utc_dt in twilight["utc_dt"]],
        index=twilight.index,
        dtype=float,
    )
    solar_hour = (utc_hours + twilight["lng"] / 15.0) % 24
    twilight["twilight_type"] = np.where(solar_hour < 14, "fajr", "isha")

    # Optional source columns: used when present, otherwise a placeholder.
    sqm_col = "SQMReading" if "SQMReading" in twilight.columns else None
    limiting_mag_col = "LimitingMag" if "LimitingMag" in twilight.columns else None
    cloud_col = "CloudCover" if "CloudCover" in twilight.columns else None
    country_col = "Country" if "Country" in twilight.columns else None
    elevation_col = "Elevation(m)" if "Elevation(m)" in twilight.columns else None

    # Format timestamps straight from the datetime objects so the result
    # does not depend on the column having a pandas datetime dtype.
    utc_text = pd.Series(
        [utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ") for utc_dt in twilight["utc_dt"]],
        index=twilight.index,
        dtype=object,
    )

    out = pd.DataFrame({
        "source": "globe_at_night",
        "year": year,
        "id": twilight.get("ID", ""),
        "utc_datetime": utc_text,
        "lat": twilight["lat"],
        "lng": twilight["lng"],
        "elevation_m": twilight[elevation_col].replace("", np.nan) if elevation_col else np.nan,
        "solar_depression_deg": twilight["solar_depression_deg"].round(4),
        "twilight_type": twilight["twilight_type"],
        "sqm_reading": twilight[sqm_col].replace("", np.nan) if sqm_col else np.nan,
        "limiting_mag": twilight[limiting_mag_col].replace("", np.nan) if limiting_mag_col else np.nan,
        "cloud_cover": twilight[cloud_col] if cloud_col else "",
        "country": twilight[country_col] if country_col else "",
    })

    n_twilight = len(out)
    n_fajr = (out["twilight_type"] == "fajr").sum()
    n_isha = (out["twilight_type"] == "isha").sum()

    print(
        f"  {year}: {total_rows} rows → {after_parse} with valid UT → "
        f"{n_twilight} twilight ({n_fajr} fajr, {n_isha} isha)"
    )

    return out


def main():
    """
    Process every yearly file from 2006 through 2024 and write the
    combined twilight observations to OUT_FILE.

    Missing yearly files are reported on stderr and skipped. If no file
    yields any twilight observation the process exits with status 1.
    After writing the CSV (sorted by UTC timestamp), a summary is printed:
    totals per twilight type, rows with an SQM reading, a histogram of
    depression angles in 2-degree bins, and the ten most frequent
    countries.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    all_results = []
    for year in range(2006, 2025):
        csv_path = os.path.join(RAW_DIR, f"GaN{year}.csv")
        if not os.path.exists(csv_path):
            print(f"Missing: {csv_path}", file=sys.stderr)
            continue
        result = process_file(csv_path)
        if not result.empty:
            all_results.append(result)

    if not all_results:
        print("No twilight observations found.", file=sys.stderr)
        sys.exit(1)

    combined = pd.concat(all_results, ignore_index=True)

    # Sort by UTC datetime (ISO-formatted text sorts chronologically).
    combined = combined.sort_values("utc_datetime")

    combined.to_csv(OUT_FILE, index=False)

    total = len(combined)
    fajr = (combined["twilight_type"] == "fajr").sum()
    isha = (combined["twilight_type"] == "isha").sum()
    with_sqm = combined["sqm_reading"].notna().sum()

    print()
    print("=" * 60)
    print(f"Output: {OUT_FILE}")
    print(f"Total twilight observations: {total:,}")
    print(f"  Fajr (morning):  {fajr:,}")
    print(f"  Isha (evening):  {isha:,}")
    print(f"  With SQM:        {with_sqm:,}")
    print()
    print("Depression angle distribution:")
    bins = [6, 8, 10, 12, 14, 16, 18, 20, 22]
    depression = combined["solar_depression_deg"]
    for lo, hi in zip(bins, bins[1:]):
        # Bins are half-open [lo, hi) except the last, which also includes
        # its upper edge: the twilight filter keeps values equal to the
        # maximum, and they must land in a bin for the counts to add up.
        below_top = depression <= hi if hi == bins[-1] else depression < hi
        n = ((depression >= lo) & below_top).sum()
        print(f"  {lo:2d}°-{hi:2d}°: {n:,}")
    print()
    print("Countries (top 10):")
    top_countries = combined["country"].value_counts().head(10)
    for country, count in top_countries.items():
        print(f"  {country}: {count:,}")


# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()