pray-calc-ml/src/process_majadahonda_sqm.py
Aric Camarata ada08e7ec4 data: expand dataset from 5.9k to 91k records via 6 new SQM sources
Add 6 new data collection pipelines and their processed outputs:

Sources added:
- TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020),
  ~40k raw events from 100+ European stations via Zenodo archives
- Globe at Night citizen science: 26k twilight observations (2006-2024),
  filtered from 308k total observations for solar depression 6-22 deg
- GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025),
  ~12.5k twilight events from 88 stations across 20+ countries
- Galicia SQM network: 14 stations, 1-min resolution, 7.5k events
- Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events
- washetdonker.nl Netherlands: 7 stations, 3.3k morning events
- Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer

Pipeline changes:
- ingest.py: add all new files to APPROVED_RAW_CSVS allowlist,
  fix filter to use allowlist instead of hardcoded exclusions
- .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN,
  washetdonker, Globe at Night downloads)

Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records
Previous: 5,871 Fajr + 46 Isha = 5,917 total records
2026-03-22 16:39:29 -04:00

202 lines
6.1 KiB
Python

"""
Process Majadahonda SQM 2019 tesstractor data.
Station: Majadahonda, Madrid, Spain
Coordinates: 40.469°N, 3.863°W
Elevation: ~700 m above sea level (approximate)
Timezone: Europe/Madrid (UTC+1/+2)
Format: ;-separated
col0: UTC timestamp (ISO 8601)
col1: local timestamp
col2: integration time (99.0 = 5 min)
col3: temperature °C
col4: frequency Hz
col5: MSAS magnitude (mag/arcsec²) — 0.00 means daylight/overexposed
col6: calibration MSAS (this is the field the parser reads as the MSAS value)
Source: https://zenodo.org/records/5709962
DOI: 10.5281/zenodo.5709962
Processing:
- Parse all .dat files (5-minute cadence, tesstractor format)
- For each night, compute solar depression angle per reading using PyEphem
- Find morning twilight window: depression 10°-20° and decreasing between consecutive readings (sun rising, sky brightening)
- Find evening twilight window: depression 10°-20° and increasing between consecutive readings (sun setting, sky darkening)
- Extract inflection point (maximum |d(MSAS)/dt|) as the "twilight event"
- Output one row per event
"""
import os
import re
import datetime
import ephem
import pandas as pd
import numpy as np
# Observing site (Majadahonda, Madrid, Spain — see module docstring).
LAT = 40.469  # latitude, degrees (north positive)
LON = -3.863  # longitude, degrees (negative = west)
ELEV = 700.0  # approximate elevation, metres above sea level
# Fixed UTC offsets, in hours, used when converting UTC to local civil time.
UTC_OFFSET_WINTER = 1 # CET
UTC_OFFSET_SUMMER = 2 # CEST
# Directory scanned for input .dat files and path of the CSV that is written.
# NOTE(review): both are absolute paths on one specific machine — confirm
# before running anywhere else.
DAT_DIR = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/majadahonda_sqm_2019/tesstractor"
OUT_CSV = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/raw_sightings/majadahonda_2019_sqm.csv"
# Value written to the "source" column of every output row.
SOURCE = "Majadahonda_SQM_2019_Zenodo5709962"
# Solar-depression window, in degrees below the horizon, treated as twilight.
DEP_MIN = 10.0
DEP_MAX = 20.0
def compute_solar_depression(utc_dt: datetime.datetime) -> float:
    """Return the Sun's depression angle at the station, in degrees.

    The value is the negated solar altitude computed by PyEphem for the
    module-level site (LAT, LON, ELEV), so it is positive while the Sun is
    below the horizon.

    Args:
        utc_dt: Instant to evaluate. Its wall-clock fields are formatted with
            ``strftime`` and handed to PyEphem unchanged.
            NOTE(review): presumably a naive UTC datetime — confirm at callers.

    Returns:
        Depression angle in degrees (negative when the Sun is above the horizon).
    """
    site = ephem.Observer()
    # PyEphem interprets string coordinates as degrees.
    site.lat, site.lon = str(LAT), str(LON)
    site.elev = ELEV
    # Zero pressure disables PyEphem's atmospheric-refraction correction.
    site.pressure = 0
    site.epoch = ephem.J2000
    site.date = utc_dt.strftime("%Y/%m/%d %H:%M:%S")

    solar = ephem.Sun()
    solar.compute(site)

    # sun.alt is in radians; convert to degrees and flip the sign.
    altitude_deg = float(solar.alt) * 180.0 / ephem.pi
    return -altitude_deg
def parse_dat_file(fpath: str) -> pd.DataFrame:
    """Read one ';'-separated .dat file into a time-sorted DataFrame.

    Blank lines, lines starting with ``#``, lines with fewer than seven
    fields, and lines whose timestamp or value cannot be parsed are skipped
    silently.

    Args:
        fpath: Path of the file to read (decoded as UTF-8, undecodable bytes
            replaced).

    Returns:
        A DataFrame with columns ``utc_dt`` (parsed from field 0) and ``msas``
        (float parsed from field 6), sorted ascending by ``utc_dt`` with a
        fresh 0..n-1 index. If nothing could be parsed, an empty DataFrame
        with no columns.
    """
    fractional_seconds = re.compile(r"\.\d+$")
    records = []

    with open(fpath, "r", encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            text = raw.strip()
            if text == "" or text.startswith("#"):
                continue

            fields = text.split(";")
            if len(fields) < 7:
                continue

            try:
                # NOTE(review): the module docstring lists field 5 as MSAS and
                # field 6 as "calibration MSAS"; field 6 is what is read here —
                # confirm this is the intended column.
                value = float(fields[6])
                # strptime's format has no fractional part, so strip a trailing
                # ".123"-style sub-second suffix first.
                stamp = datetime.datetime.strptime(
                    fractional_seconds.sub("", fields[0]), "%Y-%m-%dT%H:%M:%S"
                )
            except (ValueError, IndexError):
                continue

            records.append({"utc_dt": stamp, "msas": value})

    if not records:
        return pd.DataFrame()

    return pd.DataFrame(records).sort_values("utc_dt").reset_index(drop=True)
def _madrid_utc_offset(utc_dt: datetime.datetime) -> int:
    """Return the civil UTC offset, in hours, in force at *utc_dt*.

    Applies the EU summer-time rule: summer time runs from 01:00 UTC on the
    last Sunday of March until 01:00 UTC on the last Sunday of October. A
    plain month test mislabels the days between those Sundays and the end of
    their month.
    """
    # Drop any tzinfo so the comparisons below are between naive datetimes.
    moment = utc_dt.replace(tzinfo=None)

    def last_sunday_0100(month: int) -> datetime.datetime:
        # March and October both have 31 days; step back from the 31st to the
        # nearest Sunday (weekday() == 6).
        month_end = datetime.datetime(moment.year, month, 31, 1, 0, 0)
        return month_end - datetime.timedelta(days=(month_end.weekday() + 1) % 7)

    if last_sunday_0100(3) <= moment < last_sunday_0100(10):
        return UTC_OFFSET_SUMMER
    return UTC_OFFSET_WINTER


def _event_from_segment(segment: pd.DataFrame, prayer: str) -> "dict | None":
    """Build one output row from the steepest MSAS change in *segment*.

    Returns None when the segment has fewer than three readings.
    """
    if len(segment) < 3:
        return None

    # The "inflection point" is the reading with the largest absolute rate.
    peak_row = segment.loc[segment["dmsas"].abs().idxmax()]
    utc_dt = peak_row["utc_dt"]
    depression = peak_row["depression"]

    utc_off = _madrid_utc_offset(utc_dt)
    local_dt = utc_dt + datetime.timedelta(hours=utc_off)
    return {
        "prayer": prayer,
        "date_local": local_dt.strftime("%Y-%m-%d"),
        "time_local": local_dt.strftime("%H:%M:%S"),
        "utc_offset": utc_off,
        "lat": LAT,
        "lng": LON,
        "elevation_m": ELEV,
        "source": SOURCE,
        "notes": f"sqm_msas={peak_row['msas']:.2f},solar_dep={depression:.2f}deg,inflection_method",
    }


def find_twilight_events(df: pd.DataFrame, max_gap_minutes: float = 30.0) -> list:
    """Find the morning and evening twilight inflection points in *df*.

    Readings are kept when the solar depression lies in [DEP_MIN, DEP_MAX]
    and ``msas`` exceeds 5.0. Among those, the rate of change of MSAS
    (mag/arcsec² per minute) and the change in depression are computed
    between consecutive readings. Readings where depression is decreasing
    form the morning set ("fajr"); readings where it is increasing form the
    evening set ("isha"). For each set the reading with the largest absolute
    MSAS rate becomes the event.

    Differences are only taken between readings that are at most
    *max_gap_minutes* apart. Without that guard the first reading of a
    twilight pass would be differenced against the last reading of the
    previous pass, hours earlier, and that jump could be mistaken for the
    inflection.

    Args:
        df: Frame with columns ``utc_dt`` and ``msas``.
        max_gap_minutes: Largest spacing, in minutes, between two consecutive
            twilight readings for their difference to be used. The default of
            30 corresponds to six samples at the 5-minute cadence stated in
            the module docstring.

    Returns:
        A list of zero, one or two dicts with keys ``prayer``, ``date_local``,
        ``time_local``, ``utc_offset``, ``lat``, ``lng``, ``elevation_m``,
        ``source`` and ``notes``. At most one "fajr" and one "isha" event are
        produced per call.
        NOTE(review): if a single file spans several nights, only the steepest
        morning and evening reading across all of them is reported — confirm
        that input files hold one night each.
    """
    if df.empty:
        return []

    df = df.copy()
    df["depression"] = [compute_solar_depression(stamp) for stamp in df["utc_dt"]]

    # Keep only valid readings inside the twilight depression window.
    in_window = (
        (df["depression"] >= DEP_MIN)
        & (df["depression"] <= DEP_MAX)
        & (df["msas"] > 5.0)
    )
    twi = df[in_window].copy()
    if len(twi) < 4:
        return []
    twi = twi.sort_values("utc_dt").reset_index(drop=True)

    # Minutes elapsed since the previous retained reading (NaN for the first).
    step_min = pd.to_datetime(twi["utc_dt"]).diff().dt.total_seconds() / 60.0
    # Comparisons with NaN are False, so the first row is excluded, as are
    # duplicate timestamps and any pair separated by a gap.
    usable = (step_min > 0) & (step_min <= max_gap_minutes)

    # Normalise by elapsed time so an occasional missing sample does not
    # inflate the apparent rate of change.
    twi["dmsas"] = (twi["msas"].diff() / step_min).where(usable)
    twi["ddep"] = twi["depression"].diff().where(usable)

    # Depression falling => Sun rising (morning); rising => Sun setting (evening).
    # Rows with a masked (NaN) ddep fall into neither set.
    morning = twi[twi["ddep"] < 0]
    evening = twi[twi["ddep"] > 0]

    events = []
    for segment, prayer in ((morning, "fajr"), (evening, "isha")):
        event = _event_from_segment(segment, prayer)
        if event:
            events.append(event)
    return events
def main():
    """Process every .dat file in DAT_DIR and write the events to OUT_CSV.

    Files are handled in sorted name order. Files that yield no parsed rows
    are skipped. If no events are found at all, a message is printed and no
    CSV is written. Otherwise the rows are sorted by ``date_local`` then
    ``prayer`` and written without an index column, followed by a per-prayer
    count on stdout.
    """
    dat_files = sorted(
        name for name in os.listdir(DAT_DIR) if name.endswith(".dat")
    )
    print(f"Processing {len(dat_files)} .dat files...")

    all_events = []
    for fname in dat_files:
        frame = parse_dat_file(os.path.join(DAT_DIR, fname))
        if frame.empty:
            continue
        found = find_twilight_events(frame)
        all_events += found
        if found:
            print(f" {fname}: {len(found)} events")

    if not all_events:
        print("No events found.")
        return

    # Fixed column order for the output CSV.
    columns = [
        "prayer", "date_local", "time_local", "utc_offset",
        "lat", "lng", "elevation_m", "source", "notes",
    ]
    out_df = pd.DataFrame(all_events, columns=columns)
    out_df = out_df.sort_values(["date_local", "prayer"]).reset_index(drop=True)
    out_df.to_csv(OUT_CSV, index=False)

    print(f"\nWrote {len(out_df)} rows to {OUT_CSV}")
    print(out_df["prayer"].value_counts().to_string())


if __name__ == "__main__":
    main()