mirror of
https://github.com/acamarata/pray-calc-ml.git
synced 2026-07-01 03:14:27 +00:00
Add 6 new data collection pipelines and their processed outputs: Sources added: - TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020), ~40k raw events from 100+ European stations via Zenodo archives - Globe at Night citizen science: 26k twilight observations (2006-2024), filtered from 308k total observations for solar depression 6-22 deg - GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025), ~12.5k twilight events from 88 stations across 20+ countries - Galicia SQM network: 14 stations, 1-min resolution, 7.5k events - Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events - washetdonker.nl Netherlands: 7 stations, 3.3k morning events - Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer Pipeline changes: - ingest.py: add all new files to APPROVED_RAW_CSVS allowlist, fix filter to use allowlist instead of hardcoded exclusions - .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN, washetdonker, Globe at Night downloads) Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records Previous: 5,871 Fajr + 46 Isha = 5,917 total records
259 lines
8.7 KiB
Python
259 lines
8.7 KiB
Python
"""
Process Globe at Night citizen science observation data.

Reads every available yearly CSV (2006-2024), computes the solar depression
angle of each observation with PyEphem, and keeps the observations made
during twilight (solar depression 6°-22°).

Output: data/raw/raw_sightings/globe_at_night_twilight.csv
"""
|
|
|
|
import math
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
|
|
import ephem
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
# Input and output locations. The defaults are the absolute paths this script
# was written with; set GAN_RAW_DIR / GAN_OUT_DIR in the environment to run
# the pipeline from a different checkout without editing the file.
RAW_DIR = os.environ.get(
    "GAN_RAW_DIR",
    "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/globe_at_night",
)
OUT_DIR = os.environ.get(
    "GAN_OUT_DIR",
    "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/raw_sightings",
)
OUT_FILE = os.path.join(OUT_DIR, "globe_at_night_twilight.csv")

# Solar depression angle range for twilight (degrees below horizon).
# Both bounds are inclusive where the filter is applied.
TWILIGHT_MIN = 6.0
TWILIGHT_MAX = 22.0
|
|
|
|
|
|
def solar_depression_angle(utc_dt: datetime, lat: float, lon: float) -> float:
    """
    Compute the solar depression angle at a given instant and position.

    Args:
        utc_dt: Observation time. A naive datetime is taken to be UTC; a
            timezone-aware datetime is converted to UTC before use.
        lat: Latitude in decimal degrees (north positive).
        lon: Longitude in decimal degrees (east positive).

    Returns:
        Degrees of the sun's centre below the horizon: positive when the
        sun is below the horizon, negative when it is above.
    """
    # strftime() emits the datetime's own wall-clock fields, so an aware value
    # in another zone has to be shifted to UTC before it is handed to ephem.
    if utc_dt.tzinfo is not None and utc_dt.utcoffset() is not None:
        utc_dt = utc_dt.astimezone(timezone.utc)

    obs = ephem.Observer()
    # Passed as strings so that ephem parses them as degrees.
    obs.lat = str(lat)
    obs.lon = str(lon)
    obs.elevation = 0
    obs.pressure = 0  # disable refraction for geometric angle
    obs.date = utc_dt.strftime("%Y/%m/%d %H:%M:%S")

    sun = ephem.Sun()
    sun.compute(obs)

    # sun.alt is in radians, positive = above horizon; flip the sign so the
    # result is a depression angle (positive = below horizon).
    alt_deg = math.degrees(float(sun.alt))
    return -alt_deg
|
|
|
|
|
|
def parse_ut_datetime(ut_date: str, ut_time: str):
    """
    Combine a UTDate and a UTTime field into a timezone-aware UTC datetime.

    Accepted shapes: date 'YYYY-MM-DD', time 'HH:MM' or 'HH:MM:SS'
    (e.g. '2024-07-27' + '20:33' or '20:33:00'). Surrounding whitespace is
    ignored. Returns None when either field is missing, is the text 'nan',
    has the wrong number of components, or does not form a valid date/time.
    """
    if not ut_date or not ut_time:
        return None

    date_text = str(ut_date).strip()
    time_text = str(ut_time).strip()
    if date_text in ("", "nan") or time_text in ("", "nan"):
        return None

    time_fields = time_text.split(":")
    date_fields = date_text.split("-")
    # The time needs at least hours and minutes; the date exactly Y-M-D.
    if len(time_fields) < 2 or len(date_fields) != 3:
        return None

    try:
        hour = int(time_fields[0])
        minute = int(time_fields[1])
        # Seconds are optional and default to zero.
        second = int(time_fields[2]) if len(time_fields) > 2 else 0
        year, month, day = (int(field) for field in date_fields)
        return datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)
    except (ValueError, IndexError):
        # Non-numeric components or out-of-range values (e.g. Feb 30, 25:00).
        return None
|
|
|
|
|
|
def load_year(csv_path: str) -> pd.DataFrame:
    """
    Read one yearly Globe at Night CSV with every column kept as text.

    Column names are stripped of surrounding whitespace. An empty DataFrame
    is returned (and a message written to stderr) when the file cannot be
    read or lacks any of Latitude, Longitude, UTDate, UTTime.
    """
    try:
        frame = pd.read_csv(csv_path, dtype=str, low_memory=False)
    except Exception as e:
        print(f" ERROR reading {csv_path}: {e}", file=sys.stderr)
        return pd.DataFrame()

    # Headers in the raw files may carry stray padding.
    frame.columns = [name.strip() for name in frame.columns]

    missing = {"Latitude", "Longitude", "UTDate", "UTTime"} - set(frame.columns)
    if missing:
        print(
            f" SKIP {os.path.basename(csv_path)}: missing columns {missing}",
            file=sys.stderr,
        )
        return pd.DataFrame()

    return frame
|
|
|
|
|
|
def _safe_depression(utc_dt, lat, lng) -> float:
    """Return the solar depression for one observation, or NaN if it cannot be computed."""
    try:
        return solar_depression_angle(utc_dt, lat, lng)
    except Exception:
        # Best effort: one bad row must not abort the whole year.
        return float("nan")


def process_file(csv_path: str) -> pd.DataFrame:
    """
    Extract the twilight observations from one yearly CSV.

    Steps: load the file, drop rows with missing or out-of-range coordinates
    or an unparseable UT date/time, compute the solar depression angle of each
    remaining row, keep rows with TWILIGHT_MIN <= depression <= TWILIGHT_MAX,
    and label each kept row as 'fajr' or 'isha'.

    Args:
        csv_path: Path to a file named like 'GaN<year>.csv'; the year label
            in the output is taken from the file name.

    Returns:
        A DataFrame with columns source, year, id, utc_datetime, lat, lng,
        elevation_m, solar_depression_deg, twilight_type, sqm_reading,
        limiting_mag, cloud_cover, country. It has zero rows when nothing
        qualifies, and no columns at all when the file could not be loaded.
    """
    year = os.path.basename(csv_path).replace("GaN", "").replace(".csv", "")
    print(f"Processing {year}...")

    df = load_year(csv_path)
    if df.empty:
        return pd.DataFrame()

    total_rows = len(df)

    # Drop rows missing lat/lng or UT date/time (NaN or empty text).
    key_cols = ["Latitude", "Longitude", "UTDate", "UTTime"]
    df = df.dropna(subset=key_cols)
    df = df[(df[key_cols] != "").all(axis=1)].copy()

    # Convert coordinates; unparseable text becomes NaN and is dropped.
    df["lat"] = pd.to_numeric(df["Latitude"], errors="coerce")
    df["lng"] = pd.to_numeric(df["Longitude"], errors="coerce")
    df = df.dropna(subset=["lat", "lng"])
    df = df[df["lat"].between(-90, 90) & df["lng"].between(-180, 180)].copy()

    # Parse UTC datetimes. A list comprehension is used instead of
    # DataFrame.apply(axis=1) because apply() on an empty frame returns a
    # DataFrame rather than a Series, which breaks the column assignment.
    parsed = [
        parse_ut_datetime(ut_date, ut_time)
        for ut_date, ut_time in zip(df["UTDate"], df["UTTime"])
    ]
    # Force a datetime dtype so the .dt accessor below always works;
    # values pandas cannot represent become NaT and are dropped.
    df["utc_dt"] = pd.to_datetime(
        pd.Series(parsed, index=df.index, dtype=object), utc=True, errors="coerce"
    )
    df = df[df["utc_dt"].notna()].copy()

    after_parse = len(df)

    # Compute the solar depression angle for each row (the slow step: one
    # ephemeris evaluation per observation).
    depression = [
        _safe_depression(utc_dt, lat, lng)
        for utc_dt, lat, lng in zip(df["utc_dt"], df["lat"], df["lng"])
    ]
    df["solar_depression_deg"] = pd.Series(depression, index=df.index, dtype=float)
    df = df[df["solar_depression_deg"].notna()]

    # Filter: twilight range, inclusive at both ends.
    in_twilight = df["solar_depression_deg"].between(TWILIGHT_MIN, TWILIGHT_MAX)
    twilight = df[in_twilight].copy()

    # Classify as Fajr (morning) or Isha (evening) from approximate local
    # solar time: UTC hour + longitude / 15, wrapped into [0, 24).
    # NOTE(review): this is a heuristic. It uses the whole UTC hour only and a
    # 14h cut-off, so evening twilight that runs past local solar midnight
    # would be labelled 'fajr' -- confirm this is acceptable downstream.
    solar_hour = (twilight["utc_dt"].dt.hour + twilight["lng"] / 15.0) % 24
    twilight["twilight_type"] = np.where(solar_hour < 14, "fajr", "isha")

    def column_or(name, default, blank_as_nan=False):
        """Return an optional input column, or `default` when the file lacks it."""
        if name not in twilight.columns:
            return default
        values = twilight[name]
        return values.replace("", np.nan) if blank_as_nan else values

    out = pd.DataFrame(
        {
            "source": "globe_at_night",
            "year": year,
            "id": twilight.get("ID", ""),
            "utc_datetime": twilight["utc_dt"].dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "lat": twilight["lat"],
            "lng": twilight["lng"],
            "elevation_m": column_or("Elevation(m)", np.nan, blank_as_nan=True),
            "solar_depression_deg": twilight["solar_depression_deg"].round(4),
            "twilight_type": twilight["twilight_type"],
            "sqm_reading": column_or("SQMReading", np.nan, blank_as_nan=True),
            "limiting_mag": column_or("LimitingMag", np.nan, blank_as_nan=True),
            "cloud_cover": column_or("CloudCover", ""),
            "country": column_or("Country", ""),
        },
        index=twilight.index,
    )

    n_twilight = len(out)
    n_fajr = (out["twilight_type"] == "fajr").sum()
    n_isha = (out["twilight_type"] == "isha").sum()

    print(
        f" {year}: {total_rows} rows → {after_parse} with valid UT → "
        f"{n_twilight} twilight ({n_fajr} fajr, {n_isha} isha)"
    )

    return out
|
|
|
|
|
|
def main():
    """
    Run the pipeline over GaN2006.csv .. GaN2024.csv in RAW_DIR.

    Years whose file is missing are reported on stderr and skipped. The
    twilight rows of all years are concatenated, sorted by UTC timestamp and
    written to OUT_FILE, then a summary is printed. Exits with status 1 when
    no year produced any twilight row.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    all_results = []
    for year in range(2006, 2025):
        csv_path = os.path.join(RAW_DIR, f"GaN{year}.csv")
        if not os.path.exists(csv_path):
            print(f"Missing: {csv_path}", file=sys.stderr)
            continue
        result = process_file(csv_path)
        if not result.empty:
            all_results.append(result)

    if not all_results:
        print("No twilight observations found.", file=sys.stderr)
        sys.exit(1)

    combined = pd.concat(all_results, ignore_index=True)

    # utc_datetime is fixed-width ISO-8601 text, so a string sort is
    # chronological.
    combined = combined.sort_values("utc_datetime")

    combined.to_csv(OUT_FILE, index=False)

    total = len(combined)
    fajr = (combined["twilight_type"] == "fajr").sum()
    isha = (combined["twilight_type"] == "isha").sum()
    with_sqm = combined["sqm_reading"].notna().sum()

    print()
    print("=" * 60)
    print(f"Output: {OUT_FILE}")
    print(f"Total twilight observations: {total:,}")
    print(f" Fajr (morning): {fajr:,}")
    print(f" Isha (evening): {isha:,}")
    print(f" With SQM: {with_sqm:,}")
    print()
    print("Depression angle distribution:")
    bins = [6, 8, 10, 12, 14, 16, 18, 20, 22]
    depression = combined["solar_depression_deg"]
    last_bin = len(bins) - 2
    for i, (lower, upper) in enumerate(zip(bins, bins[1:])):
        # Bins are half-open [lower, upper) except the last, which is closed:
        # the twilight filter keeps values equal to the upper limit, so they
        # must be counted here too.
        below_upper = depression <= upper if i == last_bin else depression < upper
        n = ((depression >= lower) & below_upper).sum()
        print(f" {lower:2d}°-{upper:2d}°: {n:,}")
    print()
    print("Countries (top 10):")
    top_countries = combined["country"].value_counts().head(10)
    for country, count in top_countries.items():
        print(f" {country}: {count:,}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|